diff options
Diffstat (limited to 'storage/innobase/log/log0recv.cc')
-rw-r--r-- | storage/innobase/log/log0recv.cc | 3783 |
1 file changed, 3783 insertions, 0 deletions
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc new file mode 100644 index 00000000..eb34fd8e --- /dev/null +++ b/storage/innobase/log/log0recv.cc @@ -0,0 +1,3783 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file log/log0recv.cc +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#include "univ.i" + +#include <map> +#include <string> +#include <my_service_manager.h> + +#include "log0recv.h" + +#ifdef HAVE_MY_AES_H +#include <my_aes.h> +#endif + +#include "log0crypt.h" +#include "mem0mem.h" +#include "buf0buf.h" +#include "buf0dblwr.h" +#include "buf0flu.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "page0page.h" +#include "page0cur.h" +#include "trx0undo.h" +#include "ibuf0ibuf.h" +#include "trx0undo.h" +#include "trx0rec.h" +#include "fil0fil.h" +#include "buf0rea.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "fil0pagecompress.h" + +/** Read-ahead area in applying log records to file 
pages */ +#define RECV_READ_AHEAD_AREA 32U + +/** The recovery system */ +recv_sys_t recv_sys; +/** TRUE when recv_init_crash_recovery() has been called. */ +bool recv_needed_recovery; +#ifdef UNIV_DEBUG +/** TRUE if writing to the redo log (mtr_commit) is forbidden. +Protected by log_sys.mutex. */ +bool recv_no_log_write = false; +#endif /* UNIV_DEBUG */ + +/** TRUE if buf_page_is_corrupted() should check if the log sequence +number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by +recv_recovery_from_checkpoint_start(). */ +bool recv_lsn_checks_on; + +/** If the following is TRUE, the buffer pool file pages must be invalidated +after recovery and no ibuf operations are allowed; this becomes TRUE if +the log record hash table becomes too full, and log records must be merged +to file pages already before the recovery is finished: in this case no +ibuf operations are allowed, as they could modify the pages read in the +buffer pool before the pages have been recovered to the up-to-date state. + +true means that recovery is running and no operations on the log file +are allowed yet: the variable name is misleading. */ +bool recv_no_ibuf_operations; + +/** The maximum lsn we see for a page during the recovery process. If this +is bigger than the lsn we are able to scan up to, that is an indication that +the recovery failed and the database may be corrupt. 
*/ +static lsn_t recv_max_page_lsn; + +/** Stored physical log record with logical LSN (@see log_t::FORMAT_10_5) */ +struct log_phys_t : public log_rec_t +{ + /** start LSN of the mini-transaction (not necessarily of this record) */ + const lsn_t start_lsn; +private: + /** @return the start of length and data */ + const byte *start() const + { + return my_assume_aligned<sizeof(size_t)> + (reinterpret_cast<const byte*>(&start_lsn + 1)); + } + /** @return the start of length and data */ + byte *start() + { return const_cast<byte*>(const_cast<const log_phys_t*>(this)->start()); } + /** @return the length of the following record */ + uint16_t len() const { uint16_t i; memcpy(&i, start(), 2); return i; } + + /** @return start of the log records */ + byte *begin() { return start() + 2; } + /** @return end of the log records */ + byte *end() { byte *e= begin() + len(); ut_ad(!*e); return e; } +public: + /** @return start of the log records */ + const byte *begin() const { return const_cast<log_phys_t*>(this)->begin(); } + /** @return end of the log records */ + const byte *end() const { return const_cast<log_phys_t*>(this)->end(); } + + /** Determine the allocated size of the object. + @param len length of recs, excluding terminating NUL byte + @return the total allocation size */ + static inline size_t alloc_size(size_t len); + + /** Constructor. + @param start_lsn start LSN of the mini-transaction + @param lsn mtr_t::commit_lsn() of the mini-transaction + @param recs the first log record for the page in the mini-transaction + @param size length of recs, in bytes, excluding terminating NUL byte */ + log_phys_t(lsn_t start_lsn, lsn_t lsn, const byte *recs, size_t size) : + log_rec_t(lsn), start_lsn(start_lsn) + { + ut_ad(start_lsn); + ut_ad(start_lsn < lsn); + const uint16_t len= static_cast<uint16_t>(size); + ut_ad(len == size); + memcpy(start(), &len, 2); + reinterpret_cast<byte*>(memcpy(begin(), recs, size))[size]= 0; + } + + /** Append a record to the log. 
+ @param recs log to append + @param size size of the log, in bytes */ + void append(const byte *recs, size_t size) + { + ut_ad(start_lsn < lsn); + uint16_t l= len(); + reinterpret_cast<byte*>(memcpy(end(), recs, size))[size]= 0; + l= static_cast<uint16_t>(l + size); + memcpy(start(), &l, 2); + } + + /** Apply an UNDO_APPEND record. + @see mtr_t::undo_append() + @param block undo log page + @param data undo log record + @param len length of the undo log record + @return whether the operation failed (inconsistency was noticed) */ + static bool undo_append(const buf_block_t &block, const byte *data, + size_t len) + { + ut_ad(len > 2); + byte *free_p= my_assume_aligned<2> + (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.frame); + const uint16_t free= mach_read_from_2(free_p); + if (UNIV_UNLIKELY(free < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE || + free + len + 6 >= srv_page_size - FIL_PAGE_DATA_END)) + { + ib::error() << "Not applying UNDO_APPEND due to corruption on " + << block.page.id(); + return true; + } + + byte *p= block.frame + free; + mach_write_to_2(free_p, free + 4 + len); + memcpy(p, free_p, 2); + p+= 2; + memcpy(p, data, len); + p+= len; + mach_write_to_2(p, free); + return false; + } + + /** The status of apply() */ + enum apply_status { + /** The page was not affected */ + APPLIED_NO= 0, + /** The page was modified */ + APPLIED_YES, + /** The page was modified, affecting the encryption parameters */ + APPLIED_TO_ENCRYPTION, + /** The page was modified, affecting the tablespace header */ + APPLIED_TO_FSP_HEADER + }; + + /** Apply log to a page frame. + @param[in,out] block buffer block + @param[in,out] last_offset last byte offset, for same_page records + @return whether any log was applied to the page */ + apply_status apply(const buf_block_t &block, uint16_t &last_offset) const + { + const byte * const recs= begin(); + byte *const frame= block.page.zip.ssize + ? 
block.page.zip.data : block.frame; + const size_t size= block.physical_size(); + apply_status applied= APPLIED_NO; + + for (const byte *l= recs;;) + { + const byte b= *l++; + if (!b) + return applied; + ut_ad((b & 0x70) != RESERVED); + size_t rlen= b & 0xf; + if (!rlen) + { + const size_t lenlen= mlog_decode_varint_length(*l); + const uint32_t addlen= mlog_decode_varint(l); + ut_ad(addlen != MLOG_DECODE_ERROR); + rlen= addlen + 15 - lenlen; + l+= lenlen; + } + if (!(b & 0x80)) + { + /* Skip the page identifier. It has already been validated. */ + size_t idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen < rlen); + ut_ad(mlog_decode_varint(l) == block.page.id().space()); + l+= idlen; + rlen-= idlen; + idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen <= rlen); + ut_ad(mlog_decode_varint(l) == block.page.id().page_no()); + l+= idlen; + rlen-= idlen; + last_offset= 0; + } + + switch (b & 0x70) { + case FREE_PAGE: + ut_ad(last_offset == 0); + goto next_not_same_page; + case INIT_PAGE: + if (UNIV_LIKELY(rlen == 0)) + { + memset_aligned<UNIV_ZIP_SIZE_MIN>(frame, 0, size); + mach_write_to_4(frame + FIL_PAGE_OFFSET, block.page.id().page_no()); + memset_aligned<8>(FIL_PAGE_PREV + frame, 0xff, 8); + mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id().space()); + last_offset= FIL_PAGE_TYPE; + next_after_applying: + if (applied == APPLIED_NO) + applied= APPLIED_YES; + } + else + { + record_corrupted: + if (!srv_force_recovery) + { + recv_sys.found_corrupt_log= true; + return applied; + } + next_not_same_page: + last_offset= 1; /* the next record must not be same_page */ + } + next: + l+= rlen; + continue; + } + + ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) == + block.page.id().page_no()); + ut_ad(mach_read_from_4(frame + FIL_PAGE_SPACE_ID) == + block.page.id().space()); + ut_ad(last_offset <= 1 || last_offset > 8); + ut_ad(last_offset <= size); + + switch (b & 0x70) { + case OPTION: + goto next; + case EXTENDED: + if 
(UNIV_UNLIKELY(block.page.id().page_no() < 3 || + block.page.zip.ssize)) + goto record_corrupted; + static_assert(INIT_ROW_FORMAT_REDUNDANT == 0, "compatiblity"); + static_assert(INIT_ROW_FORMAT_DYNAMIC == 1, "compatibility"); + if (UNIV_UNLIKELY(!rlen)) + goto record_corrupted; + switch (const byte subtype= *l) { + uint8_t ll; + size_t prev_rec, hdr_size; + default: + goto record_corrupted; + case INIT_ROW_FORMAT_REDUNDANT: + case INIT_ROW_FORMAT_DYNAMIC: + if (UNIV_UNLIKELY(rlen != 1)) + goto record_corrupted; + page_create_low(&block, *l != INIT_ROW_FORMAT_REDUNDANT); + break; + case UNDO_INIT: + if (UNIV_UNLIKELY(rlen != 1)) + goto record_corrupted; + trx_undo_page_init(block); + break; + case UNDO_APPEND: + if (UNIV_UNLIKELY(rlen <= 3)) + goto record_corrupted; + if (undo_append(block, ++l, --rlen) && !srv_force_recovery) + { +page_corrupted: + ib::error() << "Set innodb_force_recovery=1 to ignore corruption."; + recv_sys.found_corrupt_log= true; + return applied; + } + break; + case INSERT_HEAP_REDUNDANT: + case INSERT_REUSE_REDUNDANT: + case INSERT_HEAP_DYNAMIC: + case INSERT_REUSE_DYNAMIC: + if (UNIV_UNLIKELY(rlen < 2)) + goto record_corrupted; + rlen--; + ll= mlog_decode_varint_length(*++l); + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + prev_rec= mlog_decode_varint(l); + ut_ad(prev_rec != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + static_assert(INSERT_HEAP_REDUNDANT == 4, "compatibility"); + static_assert(INSERT_REUSE_REDUNDANT == 5, "compatibility"); + static_assert(INSERT_HEAP_DYNAMIC == 6, "compatibility"); + static_assert(INSERT_REUSE_DYNAMIC == 7, "compatibility"); + if (subtype & 2) + { + size_t shift= 0; + if (subtype & 1) + { + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + shift= mlog_decode_varint(l); + ut_ad(shift != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + } + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto 
record_corrupted; + size_t enc_hdr_l= mlog_decode_varint(l); + ut_ad(enc_hdr_l != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t hdr_c= mlog_decode_varint(l); + ut_ad(hdr_c != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + size_t data_c= mlog_decode_varint(l); + ut_ad(data_c != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + if (page_apply_insert_dynamic(block, subtype & 1, prev_rec, + shift, enc_hdr_l, hdr_c, data_c, + l, rlen) && !srv_force_recovery) + goto page_corrupted; + } + else + { + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t header= mlog_decode_varint(l); + ut_ad(header != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t hdr_c= mlog_decode_varint(l); + ut_ad(hdr_c != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t data_c= mlog_decode_varint(l); + rlen-= ll; + l+= ll; + if (page_apply_insert_redundant(block, subtype & 1, prev_rec, + header, hdr_c, data_c, + l, rlen) && !srv_force_recovery) + goto page_corrupted; + } + break; + case DELETE_ROW_FORMAT_REDUNDANT: + if (UNIV_UNLIKELY(rlen < 2 || rlen > 4)) + goto record_corrupted; + rlen--; + ll= mlog_decode_varint_length(*++l); + if (UNIV_UNLIKELY(ll != rlen)) + goto record_corrupted; + if (page_apply_delete_redundant(block, mlog_decode_varint(l)) && + !srv_force_recovery) + goto page_corrupted; + break; + case DELETE_ROW_FORMAT_DYNAMIC: + if (UNIV_UNLIKELY(rlen < 2)) + goto record_corrupted; + rlen--; + ll= mlog_decode_varint_length(*++l); + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + prev_rec= mlog_decode_varint(l); + ut_ad(prev_rec 
!= MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + hdr_size= mlog_decode_varint(l); + ut_ad(hdr_size != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 3 || ll != rlen)) + goto record_corrupted; + if (page_apply_delete_dynamic(block, prev_rec, hdr_size, + mlog_decode_varint(l)) && + !srv_force_recovery) + goto page_corrupted; + break; + } + last_offset= FIL_PAGE_TYPE; + goto next_after_applying; + case WRITE: + case MEMSET: + case MEMMOVE: + if (UNIV_UNLIKELY(last_offset == 1)) + goto record_corrupted; + const size_t olen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) + goto record_corrupted; + const uint32_t offset= mlog_decode_varint(l); + ut_ad(offset != MLOG_DECODE_ERROR); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + if (UNIV_UNLIKELY(offset >= size)) + goto record_corrupted; + if (UNIV_UNLIKELY(offset + last_offset < 8 || + offset + last_offset >= size)) + goto record_corrupted; + last_offset= static_cast<uint16_t>(last_offset + offset); + l+= olen; + rlen-= olen; + size_t llen= rlen; + if ((b & 0x70) == WRITE) + { + if (UNIV_UNLIKELY(rlen + last_offset > size)) + goto record_corrupted; + memcpy(frame + last_offset, l, llen); + if (UNIV_LIKELY(block.page.id().page_no())); + else if (llen == 11 + MY_AES_BLOCK_SIZE && + last_offset == FSP_HEADER_OFFSET + MAGIC_SZ + + fsp_header_get_encryption_offset(block.zip_size())) + applied= APPLIED_TO_ENCRYPTION; + else if (last_offset < FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN + 4 && + last_offset + llen >= FSP_HEADER_OFFSET + FSP_SIZE) + applied= APPLIED_TO_FSP_HEADER; + next_after_applying_write: + ut_ad(llen + last_offset <= size); + last_offset= static_cast<uint16_t>(last_offset + llen); + goto next_after_applying; + } + llen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(llen > rlen || llen > 3)) + 
goto record_corrupted; + const uint32_t len= mlog_decode_varint(l); + ut_ad(len != MLOG_DECODE_ERROR); + if (UNIV_UNLIKELY(len + last_offset > size)) + goto record_corrupted; + l+= llen; + rlen-= llen; + llen= len; + if ((b & 0x70) == MEMSET) + { + ut_ad(rlen <= llen); + if (UNIV_UNLIKELY(rlen != 1)) + { + size_t s; + for (s= 0; s < llen; s+= rlen) + memcpy(frame + last_offset + s, l, rlen); + memcpy(frame + last_offset + s, l, llen - s); + } + else + memset(frame + last_offset, *l, llen); + goto next_after_applying_write; + } + const size_t slen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(slen != rlen || slen > 3)) + goto record_corrupted; + uint32_t s= mlog_decode_varint(l); + ut_ad(slen != MLOG_DECODE_ERROR); + if (s & 1) + s= last_offset - (s >> 1) - 1; + else + s= last_offset + (s >> 1) + 1; + if (UNIV_LIKELY(s >= 8 && s + llen <= size)) + { + memmove(frame + last_offset, frame + s, llen); + goto next_after_applying_write; + } + } + goto record_corrupted; + } + } +}; + + +inline size_t log_phys_t::alloc_size(size_t len) +{ + return len + (1 + 2 + sizeof(log_phys_t)); +} + + +/** Tablespace item during recovery */ +struct file_name_t { + /** Tablespace file name (FILE_MODIFY) */ + std::string name; + /** Tablespace object (NULL if not valid or not found) */ + fil_space_t* space = nullptr; + + /** Tablespace status. */ + enum fil_status { + /** Normal tablespace */ + NORMAL, + /** Deleted tablespace */ + DELETED, + /** Missing tablespace */ + MISSING + }; + + /** Status of the tablespace */ + fil_status status; + + /** FSP_SIZE of tablespace */ + uint32_t size = 0; + + /** Freed pages of tablespace */ + range_set freed_ranges; + + /** Dummy flags before they have been read from the .ibd file */ + static constexpr uint32_t initial_flags = FSP_FLAGS_FCRC32_MASK_MARKER; + /** FSP_SPACE_FLAGS of tablespace */ + uint32_t flags = initial_flags; + + /** Constructor */ + file_name_t(std::string name_, bool deleted) + : name(std::move(name_)), status(deleted ? 
DELETED: NORMAL) {} + + /** Add the freed pages */ + void add_freed_page(uint32_t page_no) { freed_ranges.add_value(page_no); } + + /** Remove the freed pages */ + void remove_freed_page(uint32_t page_no) + { + if (freed_ranges.empty()) return; + freed_ranges.remove_value(page_no); + } +}; + +/** Map of dirty tablespaces during recovery */ +typedef std::map< + ulint, + file_name_t, + std::less<ulint>, + ut_allocator<std::pair<const ulint, file_name_t> > > recv_spaces_t; + +static recv_spaces_t recv_spaces; + +/** The last parsed FILE_RENAME records */ +static std::map<uint32_t,std::string> renamed_spaces; + +/** Report an operation to create, delete, or rename a file during backup. +@param[in] space_id tablespace identifier +@param[in] create whether the file is being created +@param[in] name file name (not NUL-terminated) +@param[in] len length of name, in bytes +@param[in] new_name new file name (NULL if not rename) +@param[in] new_len length of new_name, in bytes (0 if NULL) */ +void (*log_file_op)(ulint space_id, bool create, + const byte* name, ulint len, + const byte* new_name, ulint new_len); + +/** Information about initializing page contents during redo log processing. +FIXME: Rely on recv_sys.pages! */ +class mlog_init_t +{ +public: + /** A page initialization operation that was parsed from + the redo log */ + struct init { + /** log sequence number of the page initialization */ + lsn_t lsn; + /** Whether btr_page_create() avoided a read of the page. + + At the end of the last recovery batch, mark_ibuf_exist() + will mark pages for which this flag is set. */ + bool created; + }; + +private: + typedef std::map<const page_id_t, init, + std::less<const page_id_t>, + ut_allocator<std::pair<const page_id_t, init> > > + map; + /** Map of page initialization operations. + FIXME: Merge this to recv_sys.pages! */ + map inits; +public: + /** Record that a page will be initialized by the redo log. 
+ @param[in] page_id page identifier + @param[in] lsn log sequence number + @return whether the state was changed */ + bool add(const page_id_t page_id, lsn_t lsn) + { + ut_ad(mutex_own(&recv_sys.mutex)); + const init init = { lsn, false }; + std::pair<map::iterator, bool> p = inits.insert( + map::value_type(page_id, init)); + ut_ad(!p.first->second.created); + if (p.second) return true; + if (p.first->second.lsn >= init.lsn) return false; + p.first->second = init; + return true; + } + + /** Get the last stored lsn of the page id and its respective + init/load operation. + @param[in] page_id page id + @param[in,out] init initialize log or load log + @return the latest page initialization; + not valid after releasing recv_sys.mutex. */ + init& last(page_id_t page_id) + { + ut_ad(mutex_own(&recv_sys.mutex)); + return inits.find(page_id)->second; + } + + /** Determine if a page will be initialized or freed after a time. + @param page_id page identifier + @param lsn log sequence number + @return whether page_id will be freed or initialized after lsn */ + bool will_avoid_read(page_id_t page_id, lsn_t lsn) const + { + ut_ad(mutex_own(&recv_sys.mutex)); + auto i= inits.find(page_id); + return i != inits.end() && i->second.lsn > lsn; + } + + /** At the end of each recovery batch, reset the 'created' flags. */ + void reset() + { + ut_ad(mutex_own(&recv_sys.mutex)); + ut_ad(recv_no_ibuf_operations); + for (map::value_type& i : inits) { + i.second.created = false; + } + } + + /** On the last recovery batch, mark whether there exist + buffered changes for the pages that were initialized + by buf_page_create() and still reside in the buffer pool. 
+ @param[in,out] mtr dummy mini-transaction */ + void mark_ibuf_exist(mtr_t& mtr) + { + ut_ad(mutex_own(&recv_sys.mutex)); + mtr.start(); + + for (const map::value_type& i : inits) { + if (!i.second.created) { + continue; + } + if (buf_block_t* block = buf_page_get_low( + i.first, 0, RW_X_LATCH, nullptr, + BUF_GET_IF_IN_POOL, __FILE__, __LINE__, + &mtr, nullptr, false)) { + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + switch (fil_page_get_type( + block->page.zip.data)) { + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + if (page_zip_decompress( + &block->page.zip, + block->frame, + true)) { + break; + } + ib::error() << "corrupted " + << block->page.id(); + } + } + if (recv_no_ibuf_operations) { + mtr.commit(); + mtr.start(); + continue; + } + mutex_exit(&recv_sys.mutex); + block->page.ibuf_exist = ibuf_page_exists( + block->page.id(), block->zip_size()); + mtr.commit(); + mtr.start(); + mutex_enter(&recv_sys.mutex); + } + } + + mtr.commit(); + } + + /** Clear the data structure */ + void clear() { inits.clear(); } +}; + +static mlog_init_t mlog_init; + +/** Process a record that indicates that a tablespace is +being shrunk in size. 
+@param page_id first page identifier that is not in the file +@param lsn log sequence number of the shrink operation */ +inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn) +{ + DBUG_ENTER("recv_sys_t::trim"); + DBUG_LOG("ib_log", + "discarding log beyond end of tablespace " + << page_id << " before LSN " << lsn); + ut_ad(mutex_own(&mutex)); + for (recv_sys_t::map::iterator p = pages.lower_bound(page_id); + p != pages.end() && p->first.space() == page_id.space();) { + recv_sys_t::map::iterator r = p++; + if (r->second.trim(lsn)) { + pages.erase(r); + } + } + if (fil_space_t* space = fil_space_get(page_id.space())) { + ut_ad(UT_LIST_GET_LEN(space->chain) == 1); + fil_node_t* file = UT_LIST_GET_FIRST(space->chain); + ut_ad(file->is_open()); + os_file_truncate(file->name, file->handle, + os_offset_t{page_id.page_no()} + << srv_page_size_shift, true); + } + DBUG_VOID_RETURN; +} + +void recv_sys_t::open_log_files_if_needed() +{ + if (!recv_sys.files.empty()) + return; + + for (auto &&path : get_existing_log_files_paths()) + { + recv_sys.files.emplace_back(std::move(path)); + ut_a(recv_sys.files.back().open(true) == DB_SUCCESS); + } +} + +void recv_sys_t::read(os_offset_t total_offset, span<byte> buf) +{ + open_log_files_if_needed(); + + size_t file_idx= static_cast<size_t>(total_offset / log_sys.log.file_size); + os_offset_t offset= total_offset % log_sys.log.file_size; + dberr_t err= recv_sys.files[file_idx].read(offset, buf); + ut_a(err == DB_SUCCESS); +} + +inline size_t recv_sys_t::files_size() +{ + open_log_files_if_needed(); + return files.size(); +} + +/** Process a file name from a FILE_* record. 
+@param[in,out] name file name +@param[in] len length of the file name +@param[in] space_id the tablespace ID +@param[in] deleted whether this is a FILE_DELETE record */ +static +void +fil_name_process(char* name, ulint len, ulint space_id, bool deleted) +{ + if (srv_operation == SRV_OPERATION_BACKUP) { + return; + } + + ut_ad(srv_operation == SRV_OPERATION_NORMAL + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + + /* We will also insert space=NULL into the map, so that + further checks can ensure that a FILE_MODIFY record was + scanned before applying any page records for the space_id. */ + + os_normalize_path(name); + const file_name_t fname(std::string(name, len), deleted); + std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.emplace( + space_id, fname); + ut_ad(p.first->first == space_id); + + file_name_t& f = p.first->second; + + if (deleted) { + /* Got FILE_DELETE */ + + if (!p.second && f.status != file_name_t::DELETED) { + f.status = file_name_t::DELETED; + if (f.space != NULL) { + fil_space_free(space_id, false); + f.space = NULL; + } + } + + ut_ad(f.space == NULL); + } else if (p.second // the first FILE_MODIFY or FILE_RENAME + || f.name != fname.name) { + fil_space_t* space; + + /* Check if the tablespace file exists and contains + the space_id. If not, ignore the file after displaying + a note. Abort if there are multiple files with the + same space_id. */ + switch (fil_ibd_load(space_id, name, space)) { + case FIL_LOAD_OK: + ut_ad(space != NULL); + + if (!f.space) { + if (f.size + || f.flags != f.initial_flags) { + fil_space_set_recv_size_and_flags( + space->id, f.size, f.flags); + } + + f.space = space; + goto same_space; + } else if (f.space == space) { +same_space: + f.name = fname.name; + f.status = file_name_t::NORMAL; + } else { + ib::error() << "Tablespace " << space_id + << " has been found in two places: '" + << f.name << "' and '" << name << "'." 
+ " You must delete one of them."; + recv_sys.found_corrupt_fs = true; + } + break; + + case FIL_LOAD_ID_CHANGED: + ut_ad(space == NULL); + break; + + case FIL_LOAD_NOT_FOUND: + /* No matching tablespace was found; maybe it + was renamed, and we will find a subsequent + FILE_* record. */ + ut_ad(space == NULL); + + if (srv_force_recovery) { + /* Without innodb_force_recovery, + missing tablespaces will only be + reported in + recv_init_crash_recovery_spaces(). + Enable some more diagnostics when + forcing recovery. */ + + ib::info() + << "At LSN: " << recv_sys.recovered_lsn + << ": unable to open file " << name + << " for tablespace " << space_id; + } + break; + + case FIL_LOAD_INVALID: + ut_ad(space == NULL); + if (srv_force_recovery == 0) { + ib::warn() << "We do not continue the crash" + " recovery, because the table may" + " become corrupt if we cannot apply" + " the log records in the InnoDB log to" + " it. To fix the problem and start" + " mysqld:"; + ib::info() << "1) If there is a permission" + " problem in the file and mysqld" + " cannot open the file, you should" + " modify the permissions."; + ib::info() << "2) If the tablespace is not" + " needed, or you can restore an older" + " version from a backup, then you can" + " remove the .ibd file, and use" + " --innodb_force_recovery=1 to force" + " startup without this file."; + ib::info() << "3) If the file system or the" + " disk is broken, and you cannot" + " remove the .ibd file, you can set" + " --innodb_force_recovery."; + recv_sys.found_corrupt_fs = true; + break; + } + + ib::info() << "innodb_force_recovery was set to " + << srv_force_recovery << ". 
Continuing crash" + " recovery even though we cannot access the" + " files for tablespace " << space_id << "."; + break; + } + } +} + +/** Clean up after recv_sys_t::create() */ +void recv_sys_t::close() +{ + ut_ad(this == &recv_sys); + + if (is_initialised()) + { + dblwr.pages.clear(); + ut_d(mutex_enter(&mutex)); + clear(); + ut_d(mutex_exit(&mutex)); + + if (buf) + { + ut_free_dodump(buf, RECV_PARSING_BUF_SIZE); + buf= nullptr; + } + + last_stored_lsn= 0; + mutex_free(&mutex); + } + + recv_spaces.clear(); + renamed_spaces.clear(); + mlog_init.clear(); + + close_files(); +} + +/** Initialize the redo log recovery subsystem. */ +void recv_sys_t::create() +{ + ut_ad(this == &recv_sys); + ut_ad(!is_initialised()); + mutex_create(LATCH_ID_RECV_SYS, &mutex); + + apply_log_recs = false; + apply_batch_on = false; + + buf = static_cast<byte*>(ut_malloc_dontdump(RECV_PARSING_BUF_SIZE, + PSI_INSTRUMENT_ME)); + len = 0; + parse_start_lsn = 0; + scanned_lsn = 0; + scanned_checkpoint_no = 0; + recovered_offset = 0; + recovered_lsn = 0; + found_corrupt_log = false; + found_corrupt_fs = false; + mlog_checkpoint_lsn = 0; + + progress_time = time(NULL); + recv_max_page_lsn = 0; + + memset(truncated_undo_spaces, 0, sizeof truncated_undo_spaces); + last_stored_lsn = 1; + UT_LIST_INIT(blocks, &buf_block_t::unzip_LRU); +} + +/** Clear a fully processed set of stored redo log records. */ +inline void recv_sys_t::clear() +{ + ut_ad(mutex_own(&mutex)); + apply_log_recs= false; + apply_batch_on= false; + ut_ad(!after_apply || !UT_LIST_GET_LAST(blocks)); + pages.clear(); + + for (buf_block_t *block= UT_LIST_GET_LAST(blocks); block; ) + { + buf_block_t *prev_block= UT_LIST_GET_PREV(unzip_LRU, block); + ut_ad(block->page.state() == BUF_BLOCK_MEMORY); + UT_LIST_REMOVE(blocks, block); + MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size); + buf_block_free(block); + block= prev_block; + } +} + +/** Free most recovery data structures. 
*/
void recv_sys_t::debug_free()
{
  ut_ad(this == &recv_sys);
  ut_ad(is_initialised());
  mutex_enter(&mutex);

  recovery_on= false;
  pages.clear();
  ut_free_dodump(buf, RECV_PARSING_BUF_SIZE);

  buf= nullptr;

  mutex_exit(&mutex);
}

/** Allocate a block of memory for a redo log snippet.
Memory is carved out of buf_pool blocks kept in the "blocks" list.
The buf_page_t::access_time field is reused as bookkeeping:
the most significant 16 bits hold the number of allocations in the
block, and the least significant 16 bits hold the ALIGNMENT-aligned
end offset of the used area (0 can mean "page full" when
srv_page_size == 65536, because the offset is truncated to 16 bits).
@param len   allocation size, at most srv_page_size
@return pointer to len bytes, aligned to ALIGNMENT */
inline void *recv_sys_t::alloc(size_t len)
{
  ut_ad(mutex_own(&mutex));
  ut_ad(len);
  ut_ad(len <= srv_page_size);

  buf_block_t *block= UT_LIST_GET_FIRST(blocks);
  if (UNIV_UNLIKELY(!block))
  {
create_block:
    block= buf_block_alloc();
    /* One allocation so far; used area ends at the aligned len. */
    block->page.access_time= 1U << 16 |
      ut_calc_align<uint16_t>(static_cast<uint16_t>(len), ALIGNMENT);
    static_assert(ut_is_2pow(ALIGNMENT), "ALIGNMENT must be a power of 2");
    UT_LIST_ADD_FIRST(blocks, block);
    MEM_MAKE_ADDRESSABLE(block->frame, len);
    MEM_NOACCESS(block->frame + len, srv_page_size - len);
    return my_assume_aligned<ALIGNMENT>(block->frame);
  }

  size_t free_offset= static_cast<uint16_t>(block->page.access_time);
  ut_ad(!ut_2pow_remainder(free_offset, ALIGNMENT));
  if (UNIV_UNLIKELY(!free_offset))
  {
    /* A 16-bit offset of 0 means that a 64KiB page is full. */
    ut_ad(srv_page_size == 65536);
    goto create_block;
  }
  ut_ad(free_offset <= srv_page_size);
  free_offset+= len;

  if (free_offset > srv_page_size)
    goto create_block;

  /* Bump the allocation count (high 16 bits) and advance the
  aligned end-of-used-area offset (low 16 bits). */
  block->page.access_time= ((block->page.access_time >> 16) + 1) << 16 |
    ut_calc_align<uint16_t>(static_cast<uint16_t>(free_offset), ALIGNMENT);
  MEM_MAKE_ADDRESSABLE(block->frame + free_offset - len, len);
  return my_assume_aligned<ALIGNMENT>(block->frame + free_offset - len);
}


/** Free a redo log snippet.
@param data buffer returned by alloc() */
inline void recv_sys_t::free(const void *data)
{
  ut_ad(!ut_align_offset(data, ALIGNMENT));
  /* Map the snippet pointer back to the start of its buf_pool frame. */
  data= page_align(data);
  ut_ad(mutex_own(&mutex));

  /* MDEV-14481 FIXME: To prevent race condition with buf_pool.resize(),
  we must acquire and hold the buffer pool mutex here. */
  ut_ad(!buf_pool.resize_in_progress());

  /* Locate the buf_block_t that owns this frame by scanning the
  buf_pool chunks. */
  auto *chunk= buf_pool.chunks;
  for (auto i= buf_pool.n_chunks; i--; chunk++)
  {
    if (data < chunk->blocks->frame)
      continue;
    const size_t offs= (reinterpret_cast<const byte*>(data) -
                        chunk->blocks->frame) >> srv_page_size_shift;
    if (offs >= chunk->size)
      continue;
    buf_block_t *block= &chunk->blocks[offs];
    ut_ad(block->frame == data);
    ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
    ut_ad(static_cast<uint16_t>(block->page.access_time - 1) <
          srv_page_size);
    ut_ad(block->page.access_time >= 1U << 16);
    /* Decrement the allocation count (high 16 bits of access_time);
    release the block once the last snippet in it is freed. */
    if (!((block->page.access_time -= 1U << 16) >> 16))
    {
      UT_LIST_REMOVE(blocks, block);
      MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
      buf_block_free(block);
    }
    return;
  }
  ut_ad(0);
}


/** Read a log segment to log_sys.buf.
@param[in,out]	start_lsn	in: read area start,
out: the last read valid lsn
@param[in]	end_lsn		read area end
@return	whether no invalid blocks (e.g checksum mismatch) were found */
bool log_t::file::read_log_seg(lsn_t* start_lsn, lsn_t end_lsn)
{
	ulint	len;
	bool success = true;
	mysql_mutex_assert_owner(&log_sys.mutex);
	ut_ad(!(*start_lsn % OS_FILE_LOG_BLOCK_SIZE));
	ut_ad(!(end_lsn % OS_FILE_LOG_BLOCK_SIZE));
	byte* buf = log_sys.buf;
loop:
	lsn_t source_offset = calc_lsn_offset_old(*start_lsn);

	ut_a(end_lsn - *start_lsn <= ULINT_MAX);
	len = (ulint) (end_lsn - *start_lsn);

	ut_ad(len != 0);

	/* The read must not wrap past the end of the circular file;
	if it would, truncate it here and let the "goto loop" below
	continue from the start. */
	const bool at_eof = (source_offset % file_size) + len > file_size;
	if (at_eof) {
		/* If the above condition is true then len (which is ulint)
		is > the expression below, so the typecast is ok */
		len = ulint(file_size - (source_offset % file_size));
	}

	log_sys.n_log_ios++;

	ut_a((source_offset >> srv_page_size_shift) <= ULINT_MAX);

	recv_sys.read(source_offset, {buf, len});

	/* Validate each 512-byte log block: header number, checksum,
	optional decryption, and data length sanity. */
	for (ulint l = 0; l < len; l += OS_FILE_LOG_BLOCK_SIZE,
		     buf += OS_FILE_LOG_BLOCK_SIZE,
		     (*start_lsn) += OS_FILE_LOG_BLOCK_SIZE) {
		const ulint block_number = log_block_get_hdr_no(buf);

		if (block_number != log_block_convert_lsn_to_no(*start_lsn)) {
			/* Garbage or an incompletely written log block.
			We will not report any error, because this can
			happen when InnoDB was killed while it was
			writing redo log. We simply treat this as an
			abrupt end of the redo log. */
fail:
			end_lsn = *start_lsn;
			success = false;
			break;
		}

		ulint crc = log_block_calc_checksum_crc32(buf);
		ulint cksum = log_block_get_checksum(buf);

		DBUG_EXECUTE_IF("log_intermittent_checksum_mismatch", {
				static int block_counter;
				if (block_counter++ == 0) {
					cksum = crc + 1;
				}
			});

		DBUG_EXECUTE_IF("log_checksum_mismatch", { cksum = crc + 1; });

		if (UNIV_UNLIKELY(crc != cksum)) {
			/* A checksum mismatch is only a warning during
			backup, but an error otherwise. */
			ib::error_or_warn(srv_operation!=SRV_OPERATION_BACKUP)
				<< "Invalid log block checksum. block: "
				<< block_number
				<< " checkpoint no: "
				<< log_block_get_checkpoint_no(buf)
				<< " expected: " << crc
				<< " found: " << cksum;
			goto fail;
		}

		if (is_encrypted()
		    && !log_crypt(buf, *start_lsn,
				  OS_FILE_LOG_BLOCK_SIZE,
				  LOG_DECRYPT)) {
			goto fail;
		}

		ulint dl = log_block_get_data_len(buf);
		if (dl < LOG_BLOCK_HDR_SIZE
		    || (dl != OS_FILE_LOG_BLOCK_SIZE
			&& dl > log_sys.trailer_offset())) {
			recv_sys.found_corrupt_log = true;
			goto fail;
		}
	}

	if (recv_sys.report(time(NULL))) {
		ib::info() << "Read redo log up to LSN=" << *start_lsn;
		service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
					       "Read redo log up to LSN=" LSN_PF,
					       *start_lsn);
	}

	if (*start_lsn != end_lsn) {
		goto loop;
	}

	return(success);
}



/********************************************************//**
Copies a log segment from the most up-to-date log group to the other log
groups, so that they all contain the latest log data. Also writes the info
about the latest checkpoint to the groups, and inits the fields in the group
memory structs to up-to-date values.
*/
static
void
recv_synchronize_groups()
{
	const lsn_t recovered_lsn = recv_sys.recovered_lsn;

	/* Read the last recovered log block to the recovery system buffer:
	the block is always incomplete */

	lsn_t start_lsn = ut_uint64_align_down(recovered_lsn,
					       OS_FILE_LOG_BLOCK_SIZE);
	log_sys.log.read_log_seg(&start_lsn,
				 start_lsn + OS_FILE_LOG_BLOCK_SIZE);
	log_sys.log.set_fields(recovered_lsn);

	/* Copy the checkpoint info to the log; remember that we have
	incremented checkpoint_no by one, and the info will not be written
	over the max checkpoint info, thus making the preservation of max
	checkpoint info on disk certain */

	if (!srv_read_only_mode) {
		/* NOTE(review): log_write_checkpoint_info() appears to
		release log_sys.mutex, which is why it is re-acquired
		here — confirm against its definition. */
		log_write_checkpoint_info(0);
		mysql_mutex_lock(&log_sys.mutex);
	}
}

/** Check the consistency of a log header block.
@param[in]	buf	log header block
@return true if ok */
static
bool
recv_check_log_header_checksum(
	const byte*	buf)
{
	return(log_block_get_checksum(buf)
	       == log_block_calc_checksum_crc32(buf));
}

/** Check that every existing redo log file has the same size.
Assumes at least one log file path exists (paths[0] is accessed
unconditionally — presumably guaranteed by the caller; verify).
@return whether all log files have the same size */
static bool redo_file_sizes_are_correct()
{
  auto paths= get_existing_log_files_paths();
  auto get_size= [](const std::string &path) {
    return os_file_get_size(path.c_str()).m_total_size;
  };
  os_offset_t size= get_size(paths[0]);

  auto it=
    std::find_if(paths.begin(), paths.end(), [&](const std::string &path) {
      return get_size(path) != size;
    });

  if (it == paths.end())
    return true;

  ib::error() << "Log file " << *it << " is of different size "
              << get_size(*it) << " bytes than other log files " << size
              << " bytes!";
  return false;
}

/** Calculate the checksum for a log block using the pre-10.2.2 algorithm.
@param b  512-byte log block
@return computed checksum over the first 508 bytes */
inline uint32_t log_block_calc_checksum_format_0(const byte *b)
{
  uint32_t sum= 1;
  const byte *const end= &b[512 - 4];

  for (uint32_t sh= 0; b < end; )
  {
    sum&= 0x7FFFFFFFUL;
    sum+= uint32_t{*b} << sh++;
    sum+= *b++;
    if (sh > 24)
      sh= 0;
  }

  return sum;
}

/** Determine if a redo log from before MariaDB 10.2.2 is clean.
@return error code
@retval DB_SUCCESS if the redo log is clean
@retval DB_CORRUPTION if the redo log is corrupted
@retval DB_ERROR if the redo log is not empty */
ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2()
{
  uint64_t max_no= 0;
  byte *buf= log_sys.buf;

  ut_ad(log_sys.log.format == 0);

  if (!redo_file_sizes_are_correct())
    return DB_CORRUPTION;

  /** Offset of the first checkpoint checksum */
  constexpr uint CHECKSUM_1= 288;
  /** Offset of the second checkpoint checksum */
  constexpr uint CHECKSUM_2= CHECKSUM_1 + 4;
  /** the checkpoint LSN field */
  constexpr uint CHECKPOINT_LSN= 8;
  /** Most significant bits of the checkpoint offset */
  constexpr uint OFFS_HI= CHECKSUM_2 + 12;
  /** Least significant bits of the checkpoint offset */
  constexpr uint OFFS_LO= 16;

  lsn_t lsn= 0;

  /* Probe both checkpoint header slots and keep the one with the
  largest checkpoint number. */
  for (ulint field= LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
       field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1)
  {
    log_sys.log.read(field, {buf, OS_FILE_LOG_BLOCK_SIZE});

    /* The pre-10.2.2 format protected the checkpoint header with
    two ut_fold_binary() checksums, not CRC-32C. */
    if (static_cast<uint32_t>(ut_fold_binary(buf, CHECKSUM_1)) !=
        mach_read_from_4(buf + CHECKSUM_1) ||
        static_cast<uint32_t>(ut_fold_binary(buf + CHECKPOINT_LSN,
                                             CHECKSUM_2 - CHECKPOINT_LSN)) !=
        mach_read_from_4(buf + CHECKSUM_2))
    {
      DBUG_LOG("ib_log", "invalid pre-10.2.2 checkpoint " << field);
      continue;
    }

    if (!log_crypt_101_read_checkpoint(buf))
    {
      ib::error() << "Decrypting checkpoint failed";
      continue;
    }

    const uint64_t checkpoint_no= mach_read_from_8(buf);

    DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF " found",
                          checkpoint_no,
                          mach_read_from_8(buf + CHECKPOINT_LSN)));

    if (checkpoint_no >= max_no)
    {
      max_no= checkpoint_no;
      lsn= mach_read_from_8(buf + CHECKPOINT_LSN);
      log_sys.log.set_lsn(lsn);
      log_sys.log.set_lsn_offset(lsn_t{mach_read_from_4(buf + OFFS_HI)} << 32 |
                                 mach_read_from_4(buf + OFFS_LO));
    }
  }

  if (!lsn)
  {
    ib::error() << "Upgrade after a crash is not supported."
      " This redo log was created before MariaDB 10.2.2,"
      " and we did not find a valid checkpoint."
      " Please follow the instructions at"
      " https://mariadb.com/kb/en/library/upgrading/";
    return DB_ERROR;
  }

  log_sys.set_lsn(lsn);
  log_sys.set_flushed_lsn(lsn);
  const lsn_t source_offset= log_sys.log.calc_lsn_offset_old(lsn);

  static constexpr char NO_UPGRADE_RECOVERY_MSG[]=
    "Upgrade after a crash is not supported."
    " This redo log was created before MariaDB 10.2.2";

  /* Read the 512-byte log block that contains the checkpoint LSN. */
  recv_sys.read(source_offset & ~511, {buf, 512});

  if (log_block_calc_checksum_format_0(buf) != log_block_get_checksum(buf) &&
      !log_crypt_101_read_block(buf, lsn))
  {
    ib::error() << NO_UPGRADE_RECOVERY_MSG << ", and it appears corrupted.";
    return DB_CORRUPTION;
  }

  /* The log is "clean" only if the block's data length ends exactly
  at the checkpoint LSN, i.e. nothing was written after it. */
  if (mach_read_from_2(buf + 4) == (source_offset & 511))
  {
    /* Mark the redo log for upgrading. */
    srv_log_file_size= 0;
    recv_sys.parse_start_lsn= recv_sys.recovered_lsn= recv_sys.scanned_lsn=
      recv_sys.mlog_checkpoint_lsn = lsn;
    log_sys.last_checkpoint_lsn= log_sys.next_checkpoint_lsn=
      log_sys.write_lsn= log_sys.current_flush_lsn= lsn;
    log_sys.next_checkpoint_no= 0;
    return DB_SUCCESS;
  }

  if (buf[20 + 32 * 9] == 2)
    ib::error() << "Cannot decrypt log for upgrading."
      " The encrypted log was created before MariaDB 10.2.2.";
  else
    ib::error() << NO_UPGRADE_RECOVERY_MSG << ".";

  return DB_ERROR;
}

/** Calculate the offset of a log sequence number
in an old redo log file (during upgrade check).
@param[in]	lsn	log sequence number
@return byte offset within the log */
inline lsn_t log_t::file::calc_lsn_offset_old(lsn_t lsn) const
{
  /* Total usable capacity of the circular log, across all files. */
  const lsn_t size= capacity() * recv_sys.files_size();
  lsn_t l= lsn - this->lsn;
  if (longlong(l) < 0)
  {
    /* The target LSN precedes the reference LSN: wrap backwards. */
    l= lsn_t(-longlong(l)) % size;
    l= size - l;
  }

  /* Convert to a payload offset (excluding per-file headers),
  then add back one header per file crossed. */
  l+= lsn_offset - LOG_FILE_HDR_SIZE * (1 + lsn_offset / file_size);
  l%= size;
  return l + LOG_FILE_HDR_SIZE * (1 + l / (file_size - LOG_FILE_HDR_SIZE));
}

/** Determine if a redo log from MariaDB 10.2.2+, 10.3, or 10.4 is clean.
@return error code
@retval DB_SUCCESS	if the redo log is clean
@retval DB_CORRUPTION	if the redo log is corrupted
@retval DB_ERROR	if the redo log is not empty */
static dberr_t recv_log_recover_10_4()
{
	const lsn_t	lsn = log_sys.log.get_lsn();
	const lsn_t	source_offset = log_sys.log.calc_lsn_offset_old(lsn);
	byte*		buf = log_sys.buf;

	if (!redo_file_sizes_are_correct()) {
		return DB_CORRUPTION;
	}

	/* Read the log block containing the checkpoint LSN. */
	recv_sys.read(source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1),
		      {buf, OS_FILE_LOG_BLOCK_SIZE});

	ulint crc = log_block_calc_checksum_crc32(buf);
	ulint cksum = log_block_get_checksum(buf);

	if (UNIV_UNLIKELY(crc != cksum)) {
		ib::error() << "Invalid log block checksum."
			    << " block: "
			    << log_block_get_hdr_no(buf)
			    << " checkpoint no: "
			    << log_block_get_checkpoint_no(buf)
			    << " expected: " << crc
			    << " found: " << cksum;
		return DB_CORRUPTION;
	}

	if (log_sys.log.is_encrypted()
	    && !log_crypt(buf, lsn & ~511, 512, LOG_DECRYPT)) {
		return DB_ERROR;
	}

	/* On a clean shutdown, the redo log will be logically empty
	after the checkpoint lsn. */

	if (log_block_get_data_len(buf)
	    != (source_offset & (OS_FILE_LOG_BLOCK_SIZE - 1))) {
		return DB_ERROR;
	}

	/* Mark the redo log for upgrading. */
	srv_log_file_size = 0;
	recv_sys.parse_start_lsn = recv_sys.recovered_lsn
		= recv_sys.scanned_lsn
		= recv_sys.mlog_checkpoint_lsn = lsn;
	log_sys.set_lsn(lsn);
	log_sys.set_flushed_lsn(lsn);
	log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn
		= log_sys.write_lsn = log_sys.current_flush_lsn = lsn;
	log_sys.next_checkpoint_no = 0;
	return DB_SUCCESS;
}

/** Find the latest checkpoint in the log header.
@param[out]	max_field	LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2
@return error code or DB_SUCCESS */
dberr_t
recv_find_max_checkpoint(ulint* max_field)
{
	ib_uint64_t	max_no;
	ib_uint64_t	checkpoint_no;
	ulint		field;
	byte*		buf;

	max_no = 0;
	*max_field = 0;

	buf = log_sys.checkpoint_buf;

	log_sys.log.read(0, {buf, OS_FILE_LOG_BLOCK_SIZE});
	/* Check the header page checksum. There was no
	checksum in the first redo log format (version 0). */
	log_sys.log.format = mach_read_from_4(buf + LOG_HEADER_FORMAT);
	log_sys.log.subformat = log_sys.log.format != log_t::FORMAT_3_23
		? mach_read_from_4(buf + LOG_HEADER_SUBFORMAT)
		: 0;
	if (log_sys.log.format != log_t::FORMAT_3_23
	    && !recv_check_log_header_checksum(buf)) {
		ib::error() << "Invalid redo log header checksum.";
		return(DB_CORRUPTION);
	}

	char creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR + 1];

	memcpy(creator, buf + LOG_HEADER_CREATOR, sizeof creator);
	/* Ensure that the string is NUL-terminated. */
	creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR] = 0;

	switch (log_sys.log.format) {
	case log_t::FORMAT_3_23:
		/* Pre-10.2.2 log: only a clean (upgrade) log is usable. */
		return recv_log_recover_pre_10_2();
	case log_t::FORMAT_10_2:
	case log_t::FORMAT_10_2 | log_t::FORMAT_ENCRYPTED:
	case log_t::FORMAT_10_3:
	case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED:
	case log_t::FORMAT_10_4:
	case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED:
	case log_t::FORMAT_10_5:
	case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED:
		break;
	default:
		ib::error() << "Unsupported redo log format."
			" The redo log was created with " << creator << ".";
		return(DB_ERROR);
	}

	/* Probe both checkpoint slots; pick the one with the largest
	checkpoint number that passes CRC (and decryption). */
	for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
	     field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) {
		log_sys.log.read(field, {buf, OS_FILE_LOG_BLOCK_SIZE});

		const ulint crc32 = log_block_calc_checksum_crc32(buf);
		const ulint cksum = log_block_get_checksum(buf);

		if (crc32 != cksum) {
			DBUG_PRINT("ib_log",
				   ("invalid checkpoint,"
				    " at " ULINTPF
				    ", checksum " ULINTPFx
				    " expected " ULINTPFx,
				    field, cksum, crc32));
			continue;
		}

		if (log_sys.is_encrypted()
		    && !log_crypt_read_checkpoint_buf(buf)) {
			ib::error() << "Reading checkpoint"
				" encryption info failed.";
			continue;
		}

		checkpoint_no = mach_read_from_8(
			buf + LOG_CHECKPOINT_NO);

		DBUG_PRINT("ib_log",
			   ("checkpoint " UINT64PF " at " LSN_PF " found",
			    checkpoint_no, mach_read_from_8(
				    buf + LOG_CHECKPOINT_LSN)));

		if (checkpoint_no >= max_no) {
			*max_field = field;
			max_no = checkpoint_no;
			log_sys.log.set_lsn(mach_read_from_8(
				buf + LOG_CHECKPOINT_LSN));
			log_sys.log.set_lsn_offset(mach_read_from_8(
				buf + LOG_CHECKPOINT_OFFSET));
			log_sys.next_checkpoint_no = checkpoint_no;
		}
	}

	if (*max_field == 0) {
		/* Before 10.2.2, we could get here during database
		initialization if we created an LOG_FILE_NAME file that
		was filled with zeroes, and were killed. After
		10.2.2, we would reject such a file already earlier,
		when checking the file header. */
		ib::error() << "No valid checkpoint found"
			" (corrupted redo log)."
			" You can try --innodb-force-recovery=6"
			" as a last resort.";
		return(DB_ERROR);
	}

	switch (log_sys.log.format) {
	case log_t::FORMAT_10_5:
	case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED:
		break;
	default:
		/* Older 10.2+ formats can only be upgraded from a
		logically empty (cleanly shut down) log. */
		if (dberr_t err = recv_log_recover_10_4()) {
			ib::error()
				<< "Upgrade after a crash is not supported."
				" The redo log was created with " << creator
				<< (err == DB_ERROR
				    ? "." : ", and it appears corrupted.");
			return err;
		}
	}

	return(DB_SUCCESS);
}

/*******************************************************//**
Calculates the new value for lsn when more data is added to the log. */
static
lsn_t
recv_calc_lsn_on_data_add(
/*======================*/
	lsn_t		lsn,	/*!< in: old lsn */
	ib_uint64_t	len)	/*!< in: this many bytes of data is
				added, log block headers not included */
{
	unsigned frag_len = static_cast<unsigned>(lsn % OS_FILE_LOG_BLOCK_SIZE)
		- LOG_BLOCK_HDR_SIZE;
	unsigned payload_size = log_sys.payload_size();
	ut_ad(frag_len < payload_size);
	lsn_t lsn_len = len;
	/* Account for one block header+trailer per payload-size
	boundary that the added data crosses. */
	lsn_len += (lsn_len + frag_len) / payload_size
		* (OS_FILE_LOG_BLOCK_SIZE - payload_size);

	return(lsn + lsn_len);
}

/** Trim old log records for a page.
@param start_lsn oldest log sequence number to preserve
@return whether all the log for the page was trimmed */
inline bool page_recv_t::trim(lsn_t start_lsn)
{
  while (log.head)
  {
    if (log.head->lsn >= start_lsn) return false;
    last_offset= 1; /* the next record must not be same_page */
    log_rec_t *next= log.head->next;
    recv_sys.free(log.head);
    log.head= next;
  }
  log.tail= nullptr;
  return true;
}


/** Free all log records registered for the page. */
inline void page_recv_t::recs_t::clear()
{
  ut_ad(mutex_own(&recv_sys.mutex));
  for (const log_rec_t *l= head; l; )
  {
    const log_rec_t *next= l->next;
    recv_sys.free(l);
    l= next;
  }
  head= tail= nullptr;
}


/** Ignore any earlier redo log records for this page. */
inline void page_recv_t::will_not_read()
{
  ut_ad(state == RECV_NOT_PROCESSED || state == RECV_WILL_NOT_READ);
  state= RECV_WILL_NOT_READ;
  log.clear();
}


/** Register a redo log snippet for a page.
@param page_id page identifier
@param start_lsn start LSN of the mini-transaction
@param lsn @see mtr_t::commit_lsn()
@param l redo log snippet @see log_t::FORMAT_10_5
@param len length of l, in bytes */
inline void recv_sys_t::add(const page_id_t page_id,
                            lsn_t start_lsn, lsn_t lsn, const byte *l,
                            size_t len)
{
  ut_ad(mutex_own(&mutex));
  std::pair<map::iterator, bool> p= pages.emplace(map::value_type
                                                  (page_id, page_recv_t()));
  page_recv_t& recs= p.first->second;
  ut_ad(p.second == recs.log.empty());

  switch (*l & 0x70) {
  case FREE_PAGE: case INIT_PAGE:
    /* The page will be rebuilt from scratch; any earlier records
    for it can be discarded. */
    recs.will_not_read();
    mlog_init.add(page_id, start_lsn); /* FIXME: remove this! */
    /* fall through */
  default:
    /* Try to append this snippet to the last snippet recorded for
    the same page in the same mini-transaction, to avoid a separate
    allocation. */
    log_phys_t *tail= static_cast<log_phys_t*>(recs.log.last());
    if (!tail)
      break;
    if (tail->start_lsn != start_lsn)
      break;
    ut_ad(tail->lsn == lsn);
    buf_block_t *block= UT_LIST_GET_LAST(blocks);
    ut_ad(block);
    const size_t used= static_cast<uint16_t>(block->page.access_time - 1) + 1;
    ut_ad(used >= ALIGNMENT);
    const byte *end= const_cast<const log_phys_t*>(tail)->end();
    if (!((reinterpret_cast<size_t>(end + len) ^
           reinterpret_cast<size_t>(end)) & ~(ALIGNMENT - 1)))
    {
      /* Use already allocated 'padding' bytes:
      the appended data does not cross an ALIGNMENT boundary. */
append:
      MEM_MAKE_ADDRESSABLE(end + 1, len);
      /* Append to the preceding record for the page */
      tail->append(l, len);
      return;
    }
    if (end <= &block->frame[used - ALIGNMENT] || &block->frame[used] >= end)
      break; /* Not the last allocated record in the page */
    /* Grow the used area of the block in place (+1 for the
    terminating NUL of the snippet). */
    const size_t new_used= static_cast<size_t>(end - block->frame + len + 1);
    ut_ad(new_used > used);
    if (new_used > srv_page_size)
      break;
    block->page.access_time= (block->page.access_time & ~0U << 16) |
      ut_calc_align<uint16_t>(static_cast<uint16_t>(new_used), ALIGNMENT);
    goto append;
  }
  recs.log.append(new (alloc(log_phys_t::alloc_size(len)))
                  log_phys_t(start_lsn, lsn, l, len));
}

/** Store/remove the freed pages in fil_name_t of recv_spaces.
@param[in]	page_id		freed or init page_id
@param[in]	freed		TRUE if page is freed */
static void store_freed_or_init_rec(page_id_t page_id, bool freed)
{
  uint32_t space_id= page_id.space();
  uint32_t page_no= page_id.page_no();
  if (is_predefined_tablespace(space_id))
  {
    if (!srv_immediate_scrub_data_uncompressed)
      return;
    fil_space_t *space;
    if (space_id == TRX_SYS_SPACE)
      space= fil_system.sys_space;
    else
      space= fil_space_get(space_id);

    space->free_page(page_no, freed);
    return;
  }

  recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id);
  if (i != recv_spaces.end() && i->first == space_id)
  {
    if (freed)
      i->second.add_freed_page(page_no);
    else
      i->second.remove_freed_page(page_no);
  }
}

/** Parse and register one mini-transaction in log_t::FORMAT_10_5.
@param checkpoint_lsn the log sequence number of the latest checkpoint
@param store whether to store the records
@param apply whether to apply file-level log records
@return whether FILE_CHECKPOINT record was seen the first time,
or corruption was noticed */
bool recv_sys_t::parse(lsn_t checkpoint_lsn, store_t *store, bool apply)
{
  mysql_mutex_assert_owner(&log_sys.mutex);
  ut_ad(mutex_own(&mutex));
  ut_ad(parse_start_lsn);
  ut_ad(log_sys.is_physical());

  bool last_phase= (*store == STORE_IF_EXISTS);
  const byte *const end= buf + len;
loop:
  const byte *const log= buf + recovered_offset;
  const lsn_t start_lsn= recovered_lsn;

  /* Check that the entire mini-transaction is included within the buffer */
  const byte *l;
  uint32_t rlen;
  for (l= log; l < end; l+= rlen)
  {
    /* A NUL byte terminates the mini-transaction. */
    if (!*l)
      goto eom_found;
    if (UNIV_LIKELY((*l & 0x70) != RESERVED));
    else if (srv_force_recovery)
      ib::warn() << "Ignoring unknown log record at LSN " << recovered_lsn;
    else
    {
malformed:
      ib::error() << "Malformed log record;"
                     " set innodb_force_recovery=1 to ignore.";
corrupted:
      const size_t trailing_bytes= std::min<size_t>(100, size_t(end - l));
      ib::info() << "Dump from the start of the mini-transaction (LSN="
                 << start_lsn << ") to "
                 << trailing_bytes << " bytes after the record:";
      ut_print_buf(stderr, log, l - log + trailing_bytes);
      putc('\n', stderr);
      found_corrupt_log= true;
      return true;
    }
    /* Low 4 bits hold the payload length; 0 means a variable-length
    (variant-encoded) length of 16 bytes or more follows. */
    rlen= *l++ & 0xf;
    if (l + (rlen ? rlen : 16) >= end)
      break;
    if (!rlen)
    {
      rlen= mlog_decode_varint_length(*l);
      if (l + rlen >= end)
        break;
      const uint32_t addlen= mlog_decode_varint(l);
      if (UNIV_UNLIKELY(addlen == MLOG_DECODE_ERROR))
      {
        ib::error() << "Corrupted record length";
        goto corrupted;
      }
      rlen= addlen + 15;
    }
  }

  /* Not the entire mini-transaction was present. */
  return false;

eom_found:
  ut_ad(!*l);
  ut_d(const byte *const el= l + 1);

  const lsn_t end_lsn= recv_calc_lsn_on_data_add(start_lsn, l + 1 - log);
  if (UNIV_UNLIKELY(end_lsn > scanned_lsn))
    /* The log record filled a log block, and we require that also the
    next log block should have been scanned in */
    return false;

  ut_d(std::set<page_id_t> freed);
#if 0 && defined UNIV_DEBUG /* MDEV-21727 FIXME: enable this */
  /* Pages that have been modified in this mini-transaction.
  If a mini-transaction writes INIT_PAGE for a page, it should not have
  written any log records for the page. Unfortunately, this does not
  hold for ROW_FORMAT=COMPRESSED pages, because page_zip_compress()
  can be invoked in a pessimistic operation, even after log has
  been written for other pages. */
  ut_d(std::set<page_id_t> modified);
#endif

  /* Second pass over the same bytes: decode and register each record.
  last_offset carries the running byte offset within the page between
  same_page records; 1 is a sentinel meaning "same_page forbidden". */
  uint32_t space_id= 0, page_no= 0, last_offset= 0;
#if 1 /* MDEV-14425 FIXME: remove this */
  bool got_page_op= false;
#endif
  for (l= log; l < end; l+= rlen)
  {
    const byte *const recs= l;
    const byte b= *l++;

    if (!b)
      break;
    ut_ad(UNIV_LIKELY(b & 0x70) != RESERVED || srv_force_recovery);
    rlen= b & 0xf;
    ut_ad(l + rlen < end);
    ut_ad(rlen || l + 16 < end);
    if (!rlen)
    {
      const uint32_t lenlen= mlog_decode_varint_length(*l);
      ut_ad(l + lenlen < end);
      const uint32_t addlen= mlog_decode_varint(l);
      ut_ad(addlen != MLOG_DECODE_ERROR);
      rlen= addlen + 15 - lenlen;
      l+= lenlen;
    }
    ut_ad(l + rlen < end);
    uint32_t idlen;
    if ((b & 0x80) && got_page_op)
    {
      /* This record is for the same page as the previous one. */
      if (UNIV_UNLIKELY((b & 0x70) <= INIT_PAGE))
      {
record_corrupted:
        /* FREE_PAGE,INIT_PAGE cannot be with same_page flag */
        if (!srv_force_recovery)
          goto malformed;
        ib::warn() << "Ignoring malformed log record at LSN "
                   << recovered_lsn;
        last_offset= 1; /* the next record must not be same_page */
        continue;
      }
      goto same_page;
    }
    last_offset= 0;
    /* Decode the tablespace identifier and page number. */
    idlen= mlog_decode_varint_length(*l);
    if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen))
    {
page_id_corrupted:
      if (!srv_force_recovery)
      {
        ib::error() << "Corrupted page identifier at " << recovered_lsn
                    << "; set innodb_force_recovery=1 to ignore the record.";
        goto corrupted;
      }
      ib::warn() << "Ignoring corrupted page identifier at LSN "
                 << recovered_lsn;
      continue;
    }
    space_id= mlog_decode_varint(l);
    if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR))
      goto page_id_corrupted;
    l+= idlen;
    rlen-= idlen;
    idlen= mlog_decode_varint_length(*l);
    if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen))
      goto page_id_corrupted;
    page_no= mlog_decode_varint(l);
    if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR))
      goto page_id_corrupted;
    l+= idlen;
    rlen-= idlen;
    got_page_op = !(b & 0x80);
    if (got_page_op && apply && !is_predefined_tablespace(space_id))
    {
      recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id);
      if (i != recv_spaces.end() && i->first == space_id);
      else if (recovered_lsn < mlog_checkpoint_lsn)
        /* We have not seen all records between the checkpoint and
        FILE_CHECKPOINT. There should be a FILE_DELETE for this
        tablespace later. */
        recv_spaces.emplace_hint(i, space_id, file_name_t("", false));
      else
      {
        const page_id_t id(space_id, page_no);
        if (!srv_force_recovery)
        {
          ib::error() << "Missing FILE_DELETE or FILE_MODIFY for " << id
                      << " at " << recovered_lsn
                      << "; set innodb_force_recovery=1 to ignore the record.";
          goto corrupted;
        }
        ib::warn() << "Ignoring record for " << id << " at " << recovered_lsn;
        continue;
      }
    }
same_page:
    DBUG_PRINT("ib_log",
               ("scan " LSN_PF ": rec %x len %zu page %u:%u",
                recovered_lsn, b, static_cast<size_t>(l + rlen - recs),
                space_id, page_no));

    if (got_page_op)
    {
      const page_id_t id(space_id, page_no);
      ut_d(if ((b & 0x70) == INIT_PAGE) freed.erase(id));
      ut_ad(freed.find(id) == freed.end());
      switch (b & 0x70) {
      case FREE_PAGE:
        ut_ad(freed.emplace(id).second);
        last_offset= 1; /* the next record must not be same_page */
        goto free_or_init_page;
      case INIT_PAGE:
        last_offset= FIL_PAGE_TYPE;
      free_or_init_page:
        store_freed_or_init_rec(id, (b & 0x70) == FREE_PAGE);
        if (UNIV_UNLIKELY(rlen != 0))
          goto record_corrupted;
        break;
      case EXTENDED:
        if (UNIV_UNLIKELY(!rlen))
          goto record_corrupted;
        if (rlen == 1 && *l == TRIM_PAGES)
        {
#if 0 /* For now, we can only truncate an undo log tablespace */
          if (UNIV_UNLIKELY(!space_id || !page_no))
            goto record_corrupted;
#else
          if (!srv_is_undo_tablespace(space_id) ||
              page_no != SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
            goto record_corrupted;
          static_assert(UT_ARR_SIZE(truncated_undo_spaces) ==
                        TRX_SYS_MAX_UNDO_SPACES, "compatibility");
          truncated_undo_spaces[space_id - srv_undo_space_id_start]=
            { recovered_lsn, page_no };
#endif
          last_offset= 1; /* the next record must not be same_page */
          continue;
        }
        last_offset= FIL_PAGE_TYPE;
        break;
      case RESERVED:
      case OPTION:
        continue;
      case WRITE:
      case MEMMOVE:
      case MEMSET:
        if (UNIV_UNLIKELY(rlen == 0 || last_offset == 1))
          goto record_corrupted;
        /* Decode the relative page offset of the write. */
        const uint32_t olen= mlog_decode_varint_length(*l);
        if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3))
          goto record_corrupted;
        const uint32_t offset= mlog_decode_varint(l);
        ut_ad(offset != MLOG_DECODE_ERROR);
        static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
        if (UNIV_UNLIKELY(offset >= srv_page_size))
          goto record_corrupted;
        last_offset+= offset;
        if (UNIV_UNLIKELY(last_offset < 8 || last_offset >= srv_page_size))
          goto record_corrupted;
        l+= olen;
        rlen-= olen;
        if ((b & 0x70) == WRITE)
        {
          if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size))
            goto record_corrupted;
          if (UNIV_UNLIKELY(!page_no) && apply)
          {
            /* A write to page 0 may update the tablespace size or
            flags in the FSP header; track them eagerly. */
            const bool has_size= last_offset <= FSP_HEADER_OFFSET + FSP_SIZE &&
              last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4;
            const bool has_flags= last_offset <=
              FSP_HEADER_OFFSET + FSP_SPACE_FLAGS &&
              last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + 4;
            if (has_size || has_flags)
            {
              recv_spaces_t::iterator it= recv_spaces.find(space_id);
              const uint32_t size= has_size
                ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + l -
                                   last_offset)
                : 0;
              const uint32_t flags= has_flags
                ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + l -
                                   last_offset)
                : file_name_t::initial_flags;
              if (it == recv_spaces.end())
                ut_ad(!mlog_checkpoint_lsn || space_id == TRX_SYS_SPACE ||
                      srv_is_undo_tablespace(space_id));
              else if (!it->second.space)
              {
                if (has_size)
                  it->second.size= size;
                if (has_flags)
                  it->second.flags= flags;
              }
              fil_space_set_recv_size_and_flags(space_id, size, flags);
            }
          }
          last_offset+= rlen;
          break;
        }
        /* MEMMOVE/MEMSET: decode the length of the operation. */
        uint32_t llen= mlog_decode_varint_length(*l);
        if (UNIV_UNLIKELY(llen > rlen || llen > 3))
          goto record_corrupted;
        const uint32_t len= mlog_decode_varint(l);
        ut_ad(len != MLOG_DECODE_ERROR);
        if (UNIV_UNLIKELY(last_offset + len > srv_page_size))
          goto record_corrupted;
        l+= llen;
        rlen-= llen;
        llen= len;
        if ((b & 0x70) == MEMSET)
        {
          if (UNIV_UNLIKELY(rlen > llen))
            goto record_corrupted;
          last_offset+= llen;
          break;
        }
        /* MEMMOVE: decode the signed source offset (zig-zag encoded,
        relative to last_offset). */
        const uint32_t slen= mlog_decode_varint_length(*l);
        if (UNIV_UNLIKELY(slen != rlen || slen > 3))
          goto record_corrupted;
        uint32_t s= mlog_decode_varint(l);
        ut_ad(slen != MLOG_DECODE_ERROR);
        if (s & 1)
          s= last_offset - (s >> 1) - 1;
        else
          s= last_offset + (s >> 1) + 1;
        if (UNIV_UNLIKELY(s < 8 || s + llen > srv_page_size))
          goto record_corrupted;
        last_offset+= llen;
        break;
      }
#if 0 && defined UNIV_DEBUG
      switch (b & 0x70) {
      case RESERVED:
      case OPTION:
        ut_ad(0); /* we did "continue" earlier */
        break;
      case FREE_PAGE:
        break;
      default:
        ut_ad(modified.emplace(id).second || (b & 0x70) != INIT_PAGE);
      }
#endif
      const bool is_init= (b & 0x70) <= INIT_PAGE;
      switch (*store) {
      case STORE_IF_EXISTS:
        if (fil_space_t *space= fil_space_t::get(space_id))
        {
          const auto size= space->get_size();
          space->release();
          if (!size)
            continue;
        }
        else
          continue;
        /* fall through */
      case STORE_YES:
        if (!mlog_init.will_avoid_read(id, start_lsn))
          add(id, start_lsn, end_lsn, recs,
              static_cast<size_t>(l + rlen - recs));
        continue;
      case STORE_NO:
        if (!is_init)
          continue;
        /* The page will be initialized from the log; discard any
        buffered records, which would only be re-applied later. */
        mlog_init.add(id, start_lsn);
        map::iterator i= pages.find(id);
        if (i == pages.end())
          continue;
        i->second.log.clear();
        pages.erase(i);
      }
    }
#if 1 /* MDEV-14425 FIXME: this must be in the checkpoint file only! */
    else if (rlen)
    {
      switch (b & 0xf0) {
# if 1 /* MDEV-14425 FIXME: Remove this! */
      case FILE_CHECKPOINT:
        if (space_id == 0 && page_no == 0 && rlen == 8)
        {
          const lsn_t lsn= mach_read_from_8(l);

          if (UNIV_UNLIKELY(srv_print_verbose_log == 2))
            fprintf(stderr, "FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF "\n",
                    lsn, lsn != checkpoint_lsn
                    ? "ignored"
                    : mlog_checkpoint_lsn ? "reread" : "read",
                    recovered_lsn);

          DBUG_PRINT("ib_log", ("FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF,
                                lsn, lsn != checkpoint_lsn
                                ? "ignored"
                                : mlog_checkpoint_lsn ? "reread" : "read",
                                recovered_lsn));

          if (lsn == checkpoint_lsn)
          {
            /* There can be multiple FILE_CHECKPOINT for the same LSN. */
            if (mlog_checkpoint_lsn)
              continue;
            mlog_checkpoint_lsn= recovered_lsn;
            l+= 8;
            recovered_offset= l - buf;
            return true;
          }
          continue;
        }
# endif
        /* fall through */
      default:
        if (!srv_force_recovery)
          goto malformed;
        ib::warn() << "Ignoring malformed log record at LSN "
                   << recovered_lsn;
        continue;
      case FILE_DELETE:
      case FILE_MODIFY:
      case FILE_RENAME:
        if (UNIV_UNLIKELY(page_no != 0))
        {
        file_rec_error:
          if (!srv_force_recovery)
          {
            ib::error() << "Corrupted file-level record;"
                           " set innodb_force_recovery=1 to ignore.";
            goto corrupted;
          }

          ib::warn() << "Ignoring corrupted file-level record at LSN "
                     << recovered_lsn;
          continue;
        }
        /* fall through */
      case FILE_CREATE:
        if (UNIV_UNLIKELY(!space_id || page_no))
          goto file_rec_error;
        /* There is no terminating NUL character. Names must end in .ibd.
        For FILE_RENAME, there is a NUL between the two file names. */
        const char * const fn= reinterpret_cast<const char*>(l);
        const char *fn2= static_cast<const char*>(memchr(fn, 0, rlen));

        if (UNIV_UNLIKELY((fn2 == nullptr) == ((b & 0xf0) == FILE_RENAME)))
          goto file_rec_error;

        const char * const fnend= fn2 ? fn2 : fn + rlen;
        const char * const fn2end= fn2 ? fn + rlen : nullptr;

        if (fn2)
        {
          fn2++;
          if (memchr(fn2, 0, fn2end - fn2))
            goto file_rec_error;
          if (fn2end - fn2 < 4 || memcmp(fn2end - 4, DOT_IBD, 4))
            goto file_rec_error;
        }

        if (is_predefined_tablespace(space_id))
          goto file_rec_error;
        if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4))
          goto file_rec_error;

        /* Temporarily NUL-terminate the first file name in place;
        the byte is restored below. */
        const char saved_end= fn[rlen];
        const_cast<char&>(fn[rlen])= '\0';
        fil_name_process(const_cast<char*>(fn), fnend - fn, space_id,
                         (b & 0xf0) == FILE_DELETE);
        if (fn2)
          fil_name_process(const_cast<char*>(fn2), fn2end - fn2, space_id,
                           false);
        if ((b & 0xf0) < FILE_MODIFY && log_file_op)
          log_file_op(space_id, (b & 0xf0) == FILE_CREATE,
                      l, static_cast<ulint>(fnend - fn),
                      reinterpret_cast<const byte*>(fn2),
                      fn2 ? static_cast<ulint>(fn2end - fn2) : 0);
        const_cast<char&>(fn[rlen])= saved_end;

        if (fn2 && apply)
        {
          const size_t len= fn2end - fn2;
          auto r= renamed_spaces.emplace(space_id, std::string{fn2, len});
          if (!r.second)
            r.first->second= std::string{fn2, len};
        }
        if (UNIV_UNLIKELY(found_corrupt_fs))
          return true;
      }
    }
#endif
    else
      goto malformed;
  }

  ut_ad(l == el);
  recovered_offset= l - buf;
  recovered_lsn= end_lsn;
  if (is_memory_exhausted(store) && last_phase)
    return false;
  goto loop;
}

/** Apply the hashed log records to the page, if the page lsn is less than the
@param[in,out]	block	buffer pool page
@param[in,out]	mtr	mini-transaction
@param[in,out]	p	recovery address
@param[in,out]	space	tablespace, or NULL if not looked up yet
@param[in,out]	init	page initialization operation, or NULL */
static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
			      const recv_sys_t::map::iterator& p,
			      fil_space_t* space = NULL,
			      mlog_init_t::init* init = NULL)
{
	/* Caller holds recv_sys.mutex; it is released while log records
	are applied to the page and re-acquired before returning. */
	ut_ad(mutex_own(&recv_sys.mutex));
	ut_ad(recv_sys.apply_log_recs);
	ut_ad(recv_needed_recovery);
	ut_ad(!init || init->created);
	ut_ad(!init || init->lsn);
	ut_ad(block->page.id() == p->first);
	ut_ad(!p->second.is_being_processed());
	ut_ad(!space || space->id == block->page.id().space());
	ut_ad(log_sys.is_physical());

	if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
		ib::info() << "Applying log to page " << block->page.id();
	}

	DBUG_PRINT("ib_log", ("Applying log to page %u:%u",
			      block->page.id().space(),
			      block->page.id().page_no()));

	/* Mark the page busy so that concurrent recovery threads skip it. */
	p->second.state = page_recv_t::RECV_BEING_PROCESSED;

	mutex_exit(&recv_sys.mutex);

	/* For ROW_FORMAT=COMPRESSED pages, apply to the compressed frame. */
	byte *frame = UNIV_LIKELY_NULL(block->page.zip.data)
		? block->page.zip.data
		: block->frame;
	/* For a freshly initialized page the on-page LSN is meaningless;
	treat it as 0 so that no record is skipped on its account. */
	const lsn_t page_lsn = init
		? 0
		: mach_read_from_8(frame + FIL_PAGE_LSN);
	bool free_page = false;
	lsn_t start_lsn = 0, end_lsn = 0;
	ut_d(lsn_t recv_start_lsn = 0);
	const lsn_t init_lsn = init ? init->lsn : 0;

	bool skipped_after_init = false;

	/* Replay all buffered records for this page, in LSN order. */
	for (const log_rec_t* recv : p->second.log) {
		const log_phys_t* l = static_cast<const log_phys_t*>(recv);
		ut_ad(l->lsn);
		ut_ad(end_lsn <= l->lsn);
		ut_ad(l->lsn <= log_sys.log.scanned_lsn);

		ut_ad(l->start_lsn);
		ut_ad(recv_start_lsn <= l->start_lsn);
		ut_d(recv_start_lsn = l->start_lsn);

		if (l->start_lsn < page_lsn) {
			/* This record has already been applied. */
			DBUG_PRINT("ib_log", ("apply skip %u:%u LSN " LSN_PF
					      " < " LSN_PF,
					      block->page.id().space(),
					      block->page.id().page_no(),
					      l->start_lsn, page_lsn));
			skipped_after_init = true;
			end_lsn = l->lsn;
			continue;
		}

		if (l->start_lsn < init_lsn) {
			/* The record predates the page (re)initialization
			and would be overwritten by it; skip. */
			DBUG_PRINT("ib_log", ("init skip %u:%u LSN " LSN_PF
					      " < " LSN_PF,
					      block->page.id().space(),
					      block->page.id().page_no(),
					      l->start_lsn, init_lsn));
			skipped_after_init = false;
			end_lsn = l->lsn;
			continue;
		}

		/* There is no need to check LSN for just initialized pages. */
		if (skipped_after_init) {
			skipped_after_init = false;
			/* Sanity check: the first record we apply must
			start exactly at the page LSN. */
			ut_ad(end_lsn == page_lsn);
			if (end_lsn != page_lsn)
				ib::warn()
					<< "The last skipped log record LSN "
					<< end_lsn
					<< " is not equal to page LSN "
					<< page_lsn;
		}

		end_lsn = l->lsn;

		if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
			ib::info() << "apply " << l->start_lsn
				   << ": " << block->page.id();
		}

		DBUG_PRINT("ib_log", ("apply " LSN_PF ": %u:%u",
				      l->start_lsn,
				      block->page.id().space(),
				      block->page.id().page_no()));

		log_phys_t::apply_status a= l->apply(*block,
						     p->second.last_offset);

		switch (a) {
		case log_phys_t::APPLIED_NO:
			/* The record did not modify the page (e.g. the
			page was freed); remember to possibly mark the
			page FREED below. */
			ut_ad(!mtr.has_modifications());
			free_page = true;
			start_lsn = 0;
			continue;
		case log_phys_t::APPLIED_YES:
			goto set_start_lsn;
		case log_phys_t::APPLIED_TO_FSP_HEADER:
		case log_phys_t::APPLIED_TO_ENCRYPTION:
			/* Propagate the change to the in-memory
			tablespace metadata below. */
			break;
		}

		if (fil_space_t* s = space
		    ? space
		    : fil_space_t::get(block->page.id().space())) {
			switch (a) {
			case log_phys_t::APPLIED_TO_FSP_HEADER:
				/* Refresh the cached FSP header fields
				from the page just modified. */
				s->flags = mach_read_from_4(
					FSP_HEADER_OFFSET
					+ FSP_SPACE_FLAGS + frame);
				s->size_in_header = mach_read_from_4(
					FSP_HEADER_OFFSET + FSP_SIZE
					+ frame);
				s->free_limit = mach_read_from_4(
					FSP_HEADER_OFFSET
					+ FSP_FREE_LIMIT + frame);
				s->free_len = mach_read_from_4(
					FSP_HEADER_OFFSET + FSP_FREE
					+ FLST_LEN + frame);
				break;
			default:
				/* APPLIED_TO_ENCRYPTION: validate the
				on-page crypt header before parsing it. */
				byte* b= frame
					+ fsp_header_get_encryption_offset(
						block->zip_size())
					+ FSP_HEADER_OFFSET;
				if (memcmp(b, CRYPT_MAGIC, MAGIC_SZ)) {
					break;
				}
				b += MAGIC_SZ;
				if (*b != CRYPT_SCHEME_UNENCRYPTED
				    && *b != CRYPT_SCHEME_1) {
					break;
				}
				if (b[1] != MY_AES_BLOCK_SIZE) {
					break;
				}
				if (b[2 + MY_AES_BLOCK_SIZE + 4 + 4]
				    > FIL_ENCRYPTION_OFF) {
					break;
				}
				fil_crypt_parse(s, b);
			}

			if (!space) {
				/* Release the reference acquired by
				fil_space_t::get() above. */
				s->release();
			}
		}

set_start_lsn:
		if (recv_sys.found_corrupt_log && !srv_force_recovery) {
			break;
		}

		if (!start_lsn) {
			start_lsn = l->start_lsn;
		}
	}

	if (start_lsn) {
		/* At least one record was applied: stamp the new page LSN
		and register the page as dirty with the flush list. */
		ut_ad(end_lsn >= start_lsn);
		mach_write_to_8(FIL_PAGE_LSN + frame, end_lsn);
		if (UNIV_LIKELY(frame == block->frame)) {
			mach_write_to_8(srv_page_size
					- FIL_PAGE_END_LSN_OLD_CHKSUM
					+ frame, end_lsn);
		} else {
			/* Changes were applied to the compressed frame;
			regenerate the uncompressed copy. */
			buf_zip_decompress(block, false);
		}

		buf_block_modify_clock_inc(block);
		mysql_mutex_lock(&log_sys.flush_order_mutex);
		buf_flush_note_modification(block, start_lsn, end_lsn);
		mysql_mutex_unlock(&log_sys.flush_order_mutex);
	} else if (free_page && init) {
		/* There have been no operations that modify the page.
		Any buffered changes must not be merged. A subsequent
		buf_page_create() from a user thread should discard
		any buffered changes. */
		init->created = false;
		ut_ad(!mtr.has_modifications());
		block->page.status = buf_page_t::FREED;
	}

	/* Make sure that committing mtr does not change the modification
	lsn values of page */

	mtr.discard_modifications();
	mtr.commit();

	time_t now = time(NULL);

	mutex_enter(&recv_sys.mutex);

	if (recv_max_page_lsn < page_lsn) {
		recv_max_page_lsn = page_lsn;
	}

	ut_ad(p->second.is_being_processed());
	ut_ad(!recv_sys.pages.empty());

	if (recv_sys.report(now)) {
		/* Periodic progress report, also extending the systemd
		startup timeout during long recoveries. */
		const ulint n = recv_sys.pages.size();
		ib::info() << "To recover: " << n << " pages from log";
		service_manager_extend_timeout(
			INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: " ULINTPF " pages from log", n);
	}
}

/** Remove records for a corrupted page.
This function should only be called when innodb_force_recovery is set.
@param page_id	corrupted page identifier */
ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id)
{
  /* Discard all buffered redo records for the page so that recovery
  can proceed without it (data on the page is lost). */
  mutex_enter(&mutex);
  map::iterator p= pages.find(page_id);
  if (p != pages.end())
  {
    p->second.log.clear();
    pages.erase(p);
  }
  mutex_exit(&mutex);
}

/** Apply any buffered redo log to a page that was just read from a data file.
@param[in,out]	space	tablespace
@param[in,out]	bpage	buffer pool page */
void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
{
	mtr_t mtr;
	mtr.start();
	/* Recovery must not generate new redo log. */
	mtr.set_log_mode(MTR_LOG_NO_REDO);

	ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
	buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);

	/* Move the ownership of the x-latch on the page to
	this OS thread, so that we can acquire a second
	x-latch on it. This is needed for the operations to
	the page to pass the debug checks. */
	rw_lock_x_lock_move_ownership(&block->lock);
	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
	rw_lock_x_lock(&block->lock);
	mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);

	mutex_enter(&recv_sys.mutex);
	if (recv_sys.apply_log_recs) {
		recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id());
		if (p != recv_sys.pages.end()
		    && !p->second.is_being_processed()) {
			/* Apply and then drop the buffered records;
			recv_recover_page() commits the mtr. */
			recv_recover_page(block, mtr, p, space);
			p->second.log.clear();
			recv_sys.pages.erase(p);
			goto func_exit;
		}
	}

	/* Nothing buffered for this page: just release the latches. */
	mtr.commit();
func_exit:
	mutex_exit(&recv_sys.mutex);
	ut_ad(mtr.has_committed());
}

/** Reads in pages which have hashed log records, from an area around a given
page number.
@param[in]	page_id	page id */
static void recv_read_in_area(page_id_t page_id)
{
	uint32_t page_nos[RECV_READ_AHEAD_AREA];
	compile_time_assert(ut_is_2pow(RECV_READ_AHEAD_AREA));
	/* Round down to the start of the read-ahead window. */
	page_id.set_page_no(ut_2pow_round(page_id.page_no(),
					  RECV_READ_AHEAD_AREA));
	const ulint up_limit = page_id.page_no() + RECV_READ_AHEAD_AREA;
	uint32_t* p = page_nos;

	/* Collect, from the sorted recovery map, all unprocessed pages in
	the window that are not already in the buffer pool. */
	for (recv_sys_t::map::iterator i= recv_sys.pages.lower_bound(page_id);
	     i != recv_sys.pages.end()
	     && i->first.space() == page_id.space()
	     && i->first.page_no() < up_limit; i++) {
		if (i->second.state == page_recv_t::RECV_NOT_PROCESSED
		    && !buf_pool.page_hash_contains(i->first)) {
			i->second.state = page_recv_t::RECV_BEING_READ;
			*p++ = i->first.page_no();
		}
	}

	if (p != page_nos) {
		/* Release the mutex for the duration of the reads; the
		marked RECV_BEING_READ state keeps others away. */
		mutex_exit(&recv_sys.mutex);
		buf_read_recv_pages(page_id.space(), page_nos,
				    ulint(p - page_nos));
		mutex_enter(&recv_sys.mutex);
	}
}

/** Attempt to initialize a page based on redo log records.
@param page_id	page identifier
@param p	iterator pointing to page_id
@param mtr	mini-transaction
@param b	pre-allocated buffer pool block
@return the recovered block, or nullptr if the page was not initialized */
inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id,
                                            map::iterator &p, mtr_t &mtr,
                                            buf_block_t *b)
{
  /* Caller holds recv_sys.mutex and guarantees that the page is in
  RECV_WILL_NOT_READ state, i.e. it will be rebuilt purely from the log. */
  ut_ad(mutex_own(&mutex));
  ut_ad(p->first == page_id);
  page_recv_t &recs= p->second;
  ut_ad(recs.state == page_recv_t::RECV_WILL_NOT_READ);
  buf_block_t* block= nullptr;
  mlog_init_t::init &i= mlog_init.last(page_id);
  const lsn_t end_lsn = recs.log.last()->lsn;
  if (end_lsn < i.lsn)
    /* All buffered records predate the latest page initialization;
    nothing to apply. */
    DBUG_LOG("ib_log", "skip log for page " << page_id
             << " LSN " << end_lsn << " < " << i.lsn);
  else if (fil_space_t *space= fil_space_t::get(page_id.space()))
  {
    mtr.start();
    mtr.set_log_mode(MTR_LOG_NO_REDO);
    block= buf_page_create(space, page_id.page_no(), space->zip_size(), &mtr,
                           b);
    if (UNIV_UNLIKELY(block != b))
    {
      /* The page happened to exist in the buffer pool, or it was just
      being read in. Before buf_page_get_with_no_latch() returned to
      buf_page_create(), all changes must have been applied to the
      page already. */
      ut_ad(recv_sys.pages.find(page_id) == recv_sys.pages.end());
      mtr.commit();
      block= nullptr;
    }
    else
    {
      ut_ad(&recs == &recv_sys.pages.find(page_id)->second);
      i.created= true;
      buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
      /* recv_recover_page() commits the mtr. */
      recv_recover_page(block, mtr, p, space, &i);
      ut_ad(mtr.has_committed());
      recs.log.clear();
      /* Advance p before erasing, so that the caller's iterator
      remains valid. */
      map::iterator r= p++;
      recv_sys.pages.erase(r);
    }
    space->release();
  }

  return block;
}

/** Attempt to initialize a page based on redo log records.
@param page_id	page identifier
@return the recovered block, or nullptr if the page was not initialized */
buf_block_t *recv_sys_t::recover_low(const page_id_t page_id)
{
  /* Pre-allocate a free block outside the mutex, because
  buf_LRU_get_free_block() may have to wait. */
  buf_block_t *free_block= buf_LRU_get_free_block(false);
  buf_block_t *block= nullptr;

  mutex_enter(&mutex);
  map::iterator p= pages.find(page_id);

  if (p != pages.end() && p->second.state == page_recv_t::RECV_WILL_NOT_READ)
  {
    mtr_t mtr;
    block= recover_low(page_id, p, mtr, free_block);
    ut_ad(!block || block == free_block);
  }

  mutex_exit(&mutex);
  if (UNIV_UNLIKELY(!block))
    /* The pre-allocated block was not consumed; return it. */
    buf_pool.free_block(free_block);
  return block;
}

/** Apply buffered log to persistent data pages.
@param last_batch	whether it is possible to write more redo log */
void recv_sys_t::apply(bool last_batch)
{
  ut_ad(srv_operation == SRV_OPERATION_NORMAL ||
        srv_operation == SRV_OPERATION_RESTORE ||
        srv_operation == SRV_OPERATION_RESTORE_EXPORT);

  mutex_enter(&mutex);

  /* Wait for any concurrent batch to finish (or abort on corruption). */
  while (apply_batch_on)
  {
    bool abort= found_corrupt_log;
    mutex_exit(&mutex);

    if (abort)
      return;

    os_thread_sleep(500000);
    mutex_enter(&mutex);
  }

#ifdef SAFE_MUTEX
  /* In non-final batches the caller must hold log_sys.mutex. */
  DBUG_ASSERT(!last_batch == mysql_mutex_is_owner(&log_sys.mutex));
#endif /* SAFE_MUTEX */

  recv_no_ibuf_operations = !last_batch ||
    srv_operation == SRV_OPERATION_RESTORE ||
    srv_operation == SRV_OPERATION_RESTORE_EXPORT;

  mtr_t mtr;

  if (!pages.empty())
  {
    const char *msg= last_batch
      ?
      "Starting final batch to recover "
      : "Starting a batch to recover ";
    const ulint n= pages.size();
    ib::info() << msg << n << " pages from redo log.";
    sd_notifyf(0, "STATUS=%s" ULINTPF " pages from redo log", msg, n);

    apply_log_recs= true;
    apply_batch_on= true;

    /* Replay any pending undo tablespace truncation before applying
    page records. */
    for (auto id= srv_undo_tablespaces_open; id--;)
    {
      const trunc& t= truncated_undo_spaces[id];
      if (t.lsn)
        trim(page_id_t(id + srv_undo_space_id_start, t.pages), t.lsn);
    }

    fil_system.extend_to_recv_size();

    /* Pre-allocated block for recover_low(); replenished after each use. */
    buf_block_t *free_block= buf_LRU_get_free_block(false);

    for (map::iterator p= pages.begin(); p != pages.end(); )
    {
      const page_id_t page_id= p->first;
      page_recv_t &recs= p->second;
      ut_ad(!recs.log.empty());

      switch (recs.state) {
      case page_recv_t::RECV_BEING_READ:
      case page_recv_t::RECV_BEING_PROCESSED:
        /* Another thread is already handling this page. */
        p++;
        continue;
      case page_recv_t::RECV_WILL_NOT_READ:
        if (UNIV_LIKELY(!!recover_low(page_id, p, mtr, free_block)))
        {
          /* The block was consumed; get a new one outside the mutex.
          The map may have changed meanwhile; re-seek below. */
          mutex_exit(&mutex);
          free_block= buf_LRU_get_free_block(false);
          mutex_enter(&mutex);
next_page:
          p= pages.lower_bound(page_id);
        }
        continue;
      case page_recv_t::RECV_NOT_PROCESSED:
        mtr.start();
        mtr.set_log_mode(MTR_LOG_NO_REDO);
        if (buf_block_t *block= buf_page_get_low(page_id, 0, RW_X_LATCH,
                                                 nullptr, BUF_GET_IF_IN_POOL,
                                                 __FILE__, __LINE__,
                                                 &mtr, nullptr, false))
        {
          buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
          /* recv_recover_page() commits the mtr. */
          recv_recover_page(block, mtr, p);
          ut_ad(mtr.has_committed());
        }
        else
        {
          /* Not in the buffer pool: schedule a read-ahead and
          revisit the page later. */
          mtr.commit();
          recv_read_in_area(page_id);
          break;
        }
        map::iterator r= p++;
        r->second.log.clear();
        pages.erase(r);
        continue;
      }

      goto next_page;
    }

    buf_pool.free_block(free_block);

    /* Wait until all the pages have been processed */
    while (!pages.empty() || buf_pool.n_pend_reads)
    {
      const bool abort= found_corrupt_log || found_corrupt_fs;

      if (found_corrupt_fs && !srv_force_recovery)
        ib::info() << "Set innodb_force_recovery=1 to ignore corrupted pages.";

      mutex_exit(&mutex);

      if (abort)
        return;
      os_thread_sleep(500000);
      mutex_enter(&mutex);
    }
  }

  if (last_batch)
    /* We skipped this in buf_page_create(). */
    mlog_init.mark_ibuf_exist(mtr);
  else
  {
    mlog_init.reset();
    mysql_mutex_unlock(&log_sys.mutex);
  }

  mysql_mutex_assert_not_owner(&log_sys.mutex);
  mutex_exit(&mutex);

  /* Instead of flushing, last_batch could sort the buf_pool.flush_list
  in ascending order of buf_page_t::oldest_modification. */
  buf_flush_sync();

  if (!last_batch)
  {
    buf_pool_invalidate();
    mysql_mutex_lock(&log_sys.mutex);
  }
#if 1 /* Mariabackup FIXME: Remove or adjust rename_table_in_prepare() */
  else if (srv_operation != SRV_OPERATION_NORMAL);
#endif
  else
  {
    /* In the last batch, we will apply any rename operations. */
    for (auto r : renamed_spaces)
    {
      const uint32_t id= r.first;
      fil_space_t *space= fil_space_t::get(id);
      if (!space)
        continue;
      ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
      const char *old= space->chain.start->name;
      if (r.second != old)
      {
        bool exists;
        os_file_type_t ftype;
        const char *new_name= r.second.c_str();
        if (!os_file_status(new_name, &exists, &ftype) || exists)
        {
          /* Refuse to clobber an existing file at the rename target. */
          ib::error() << "Cannot replay rename of tablespace " << id
                      << " from '" << old << "' to '" << r.second <<
            (exists ? "' because the target file exists" : "'");
          found_corrupt_fs= true;
        }
        else
        {
          size_t base= r.second.rfind(OS_PATH_SEPARATOR);
          ut_ad(base != std::string::npos);
          size_t start= r.second.rfind(OS_PATH_SEPARATOR, base - 1);
          if (start == std::string::npos)
            start= 0;
          else
            ++start;
          /* Keep only databasename/tablename without .ibd suffix */
          std::string space_name(r.second, start, r.second.size() - start - 4);
          ut_ad(space_name[base - start] == OS_PATH_SEPARATOR);
#if OS_PATH_SEPARATOR != '/'
          space_name[base - start]= '/';
#endif
          mysql_mutex_lock(&log_sys.mutex);
          if (dberr_t err= space->rename(space_name.c_str(), r.second.c_str(),
                                         false))
          {
            ib::error() << "Cannot replay rename of tablespace " << id
                        << " to '" << r.second << "': " << err;
            found_corrupt_fs= true;
          }
          mysql_mutex_unlock(&log_sys.mutex);
        }
      }
      space->release();
    }
    renamed_spaces.clear();
  }

  mutex_enter(&mutex);

  ut_d(after_apply= true);
  clear();
  mutex_exit(&mutex);
}

/** Check whether the number of read redo log blocks exceeds the maximum.
Store last_stored_lsn if the recovery is not in the last phase.
@param[in,out]	store	whether to store page operations
@return whether the memory is exhausted */
inline bool recv_sys_t::is_memory_exhausted(store_t *store)
{
  /* Consider memory exhausted once the buffered log blocks occupy
  more than a third of the buffer pool. */
  if (*store == STORE_NO ||
      UT_LIST_GET_LEN(blocks) * 3 < buf_pool.get_n_pages())
    return false;
  if (*store == STORE_YES)
    last_stored_lsn= recovered_lsn;
  *store= STORE_NO;
  DBUG_PRINT("ib_log",("Ran out of memory and last stored lsn " LSN_PF
                       " last stored offset " ULINTPF "\n",
                       recovered_lsn, recovered_offset));
  return true;
}

/** Adds data from a new log block to the parsing buffer of recv_sys if
recv_sys.parse_start_lsn is non-zero.
@param[in]	log_block	log block to add
@param[in]	scanned_lsn	lsn of how far we were able to find
				data in this log block
@return true if more data added */
bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn)
{
	ulint	more_len;
	ulint	data_len;
	ulint	start_offset;
	ulint	end_offset;

	ut_ad(scanned_lsn >= recv_sys.scanned_lsn);

	if (!recv_sys.parse_start_lsn) {
		/* Cannot start parsing yet because no start point for
		it found */
		return(false);
	}

	data_len = log_block_get_data_len(log_block);

	if (recv_sys.parse_start_lsn >= scanned_lsn) {
		/* The block ends before the parse start point. */
		return(false);

	} else if (recv_sys.scanned_lsn >= scanned_lsn) {
		/* The block contains nothing beyond what was
		already scanned. */
		return(false);

	} else if (recv_sys.parse_start_lsn > recv_sys.scanned_lsn) {
		/* Only the part after parse_start_lsn is new. */
		more_len = (ulint) (scanned_lsn - recv_sys.parse_start_lsn);
	} else {
		more_len = (ulint) (scanned_lsn - recv_sys.scanned_lsn);
	}

	if (more_len == 0) {
		return(false);
	}

	ut_ad(data_len >= more_len);

	start_offset = data_len - more_len;

	if (start_offset < LOG_BLOCK_HDR_SIZE) {
		start_offset = LOG_BLOCK_HDR_SIZE;
	}

	/* Copy only the payload: skip the block header and trailer. */
	end_offset = std::min<ulint>(data_len, log_sys.trailer_offset());

	ut_ad(start_offset <= end_offset);

	if (start_offset < end_offset) {
		memcpy(recv_sys.buf + recv_sys.len,
		       log_block + start_offset, end_offset - start_offset);

		recv_sys.len += end_offset - start_offset;

		ut_a(recv_sys.len <= RECV_PARSING_BUF_SIZE);
	}

	return(true);
}

/** Moves the parsing buffer data left to the buffer start. */
void recv_sys_justify_left_parsing_buf()
{
	/* Discard the already-parsed prefix to make room for more data. */
	memmove(recv_sys.buf, recv_sys.buf + recv_sys.recovered_offset,
		recv_sys.len - recv_sys.recovered_offset);

	recv_sys.len -= recv_sys.recovered_offset;

	recv_sys.recovered_offset = 0;
}

/** Scan redo log from a buffer and stores new log data to the parsing buffer.
Parse and hash the log records if new data found.
Apply log records automatically when the hash table becomes full.
@param[in,out]	store			whether the records should be
					stored into recv_sys.pages; this is
					reset if just debug checking is
					needed, or when the num_max_blocks in
					recv_sys runs out
@param[in]	log_block		log segment
@param[in]	checkpoint_lsn		latest checkpoint LSN
@param[in]	start_lsn		buffer start LSN
@param[in]	end_lsn			buffer end LSN
@param[in,out]	contiguous_lsn		it is known that all groups contain
					contiguous log data upto this lsn
@param[out]	group_scanned_lsn	scanning succeeded upto this lsn
@return true if not able to scan any more in this log group */
static bool recv_scan_log_recs(
	store_t*	store,
	const byte*	log_block,
	lsn_t		checkpoint_lsn,
	lsn_t		start_lsn,
	lsn_t		end_lsn,
	lsn_t*		contiguous_lsn,
	lsn_t*		group_scanned_lsn)
{
	lsn_t		scanned_lsn	= start_lsn;
	bool		finished	= false;
	ulint		data_len;
	bool		more_data	= false;
	/* Records may be applied only after FILE_CHECKPOINT was seen. */
	bool		apply		= recv_sys.mlog_checkpoint_lsn != 0;
	ulint		recv_parsing_buf_size = RECV_PARSING_BUF_SIZE;
	const bool	last_phase = (*store == STORE_IF_EXISTS);
	ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
	ut_ad(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
	ut_ad(end_lsn >= start_lsn + OS_FILE_LOG_BLOCK_SIZE);
	ut_ad(log_sys.is_physical());

	const byte* const	log_end = log_block
		+ ulint(end_lsn - start_lsn);
	constexpr ulint sizeof_checkpoint= SIZE_OF_FILE_CHECKPOINT;

	/* Process the buffer one OS_FILE_LOG_BLOCK_SIZE block at a time. */
	do {
		ut_ad(!finished);

		if (log_block_get_flush_bit(log_block)) {
			/* This block was a start of a log flush operation:
			we know that the previous flush operation must have
			been completed for all log groups before this block
			can have been flushed to any of the groups. Therefore,
			we know that log data is contiguous up to scanned_lsn
			in all non-corrupt log groups. */

			if (scanned_lsn > *contiguous_lsn) {
				*contiguous_lsn = scanned_lsn;
			}
		}

		data_len = log_block_get_data_len(log_block);

		if (scanned_lsn + data_len > recv_sys.scanned_lsn
		    && log_block_get_checkpoint_no(log_block)
		    < recv_sys.scanned_checkpoint_no
		    && (recv_sys.scanned_checkpoint_no
			- log_block_get_checkpoint_no(log_block)
			> 0x80000000UL)) {

			/* Garbage from a log buffer flush which was made
			before the most recent database recovery */
			finished = true;
			break;
		}

		if (!recv_sys.parse_start_lsn
		    && (log_block_get_first_rec_group(log_block) > 0)) {

			/* We found a point from which to start the parsing
			of log records */

			recv_sys.parse_start_lsn = scanned_lsn
				+ log_block_get_first_rec_group(log_block);
			recv_sys.scanned_lsn = recv_sys.parse_start_lsn;
			recv_sys.recovered_lsn = recv_sys.parse_start_lsn;
		}

		scanned_lsn += data_len;

		if (data_len == LOG_BLOCK_HDR_SIZE + sizeof_checkpoint
		    && scanned_lsn == checkpoint_lsn + sizeof_checkpoint
		    && log_block[LOG_BLOCK_HDR_SIZE]
		    == (FILE_CHECKPOINT | (SIZE_OF_FILE_CHECKPOINT - 2))
		    && checkpoint_lsn == mach_read_from_8(
			    (LOG_BLOCK_HDR_SIZE + 1 + 2)
			    + log_block)) {
			/* The redo log is logically empty:
			the only record after the checkpoint is the
			FILE_CHECKPOINT record itself. */
			ut_ad(recv_sys.mlog_checkpoint_lsn == 0
			      || recv_sys.mlog_checkpoint_lsn
			      == checkpoint_lsn);
			recv_sys.mlog_checkpoint_lsn = checkpoint_lsn;
			DBUG_PRINT("ib_log", ("found empty log; LSN=" LSN_PF,
					      scanned_lsn));
			finished = true;
			break;
		}

		if (scanned_lsn > recv_sys.scanned_lsn) {
			ut_ad(!srv_log_file_created);
			if (!recv_needed_recovery) {
				recv_needed_recovery = true;

				if (srv_read_only_mode) {
					ib::warn() << "innodb_read_only"
						" prevents crash recovery";
					return(true);
				}

				ib::info() << "Starting crash recovery from"
					" checkpoint LSN=" << checkpoint_lsn
					<< "," << recv_sys.scanned_lsn;
			}

			/* We were able to find more log data: add it to the
			parsing buffer if parse_start_lsn is already
			non-zero */

			DBUG_EXECUTE_IF(
				"reduce_recv_parsing_buf",
				recv_parsing_buf_size = RECV_SCAN_SIZE * 2;
				);

			if (recv_sys.len + 4 * OS_FILE_LOG_BLOCK_SIZE
			    >= recv_parsing_buf_size) {
				ib::error() << "Log parsing buffer overflow."
					" Recovery may have failed!";

				recv_sys.found_corrupt_log = true;

				if (!srv_force_recovery) {
					ib::error()
						<< "Set innodb_force_recovery"
						" to ignore this error.";
					return(true);
				}
			} else if (!recv_sys.found_corrupt_log) {
				more_data = recv_sys_add_to_parsing_buf(
					log_block, scanned_lsn);
			}

			recv_sys.scanned_lsn = scanned_lsn;
			recv_sys.scanned_checkpoint_no
				= log_block_get_checkpoint_no(log_block);
		}

		/* During last phase of scanning, there can be redo logs
		left in recv_sys.buf to parse & store it in recv_sys.heap */
		if (last_phase
		    && recv_sys.recovered_lsn < recv_sys.scanned_lsn) {
			more_data = true;
		}

		if (data_len < OS_FILE_LOG_BLOCK_SIZE) {
			/* Log data for this group ends here */
			finished = true;
			break;
		} else {
			log_block += OS_FILE_LOG_BLOCK_SIZE;
		}
	} while (log_block < log_end);

	*group_scanned_lsn = scanned_lsn;

	mutex_enter(&recv_sys.mutex);

	if (more_data && !recv_sys.found_corrupt_log) {
		/* Try to parse more log records */
		if (recv_sys.parse(checkpoint_lsn, store, apply)) {
			ut_ad(recv_sys.found_corrupt_log
			      || recv_sys.found_corrupt_fs
			      || recv_sys.mlog_checkpoint_lsn
			      == recv_sys.recovered_lsn);
			finished = true;
			goto func_exit;
		}

		recv_sys.is_memory_exhausted(store);

		if (recv_sys.recovered_offset > recv_parsing_buf_size / 4
		    || (recv_sys.recovered_offset
			&& recv_sys.len
			>= recv_parsing_buf_size - RECV_SCAN_SIZE)) {
			/* Move parsing buffer data to the buffer start */
			recv_sys_justify_left_parsing_buf();
		}

		/* Need to re-parse the redo log which're stored
		in recv_sys.buf */
		if (last_phase && *store == STORE_NO) {
			finished = false;
		}
	}

func_exit:
	mutex_exit(&recv_sys.mutex);
	return(finished);
}

/** Scans log from a buffer and stores new log data to the parsing buffer.
Parses and hashes the log records if new data found.
@param[in]	checkpoint_lsn		latest checkpoint log sequence number
@param[in,out]	contiguous_lsn		log sequence number
until which all redo log has been scanned
@param[in]	last_phase		whether changes
can be applied to the tablespaces
@return whether rescan is needed (not everything was stored) */
static
bool
recv_group_scan_log_recs(
	lsn_t		checkpoint_lsn,
	lsn_t*		contiguous_lsn,
	bool		last_phase)
{
	DBUG_ENTER("recv_group_scan_log_recs");
	DBUG_ASSERT(!last_phase || recv_sys.mlog_checkpoint_lsn > 0);

	/* Reset the parsing state to start from *contiguous_lsn. */
	mutex_enter(&recv_sys.mutex);
	recv_sys.len = 0;
	recv_sys.recovered_offset = 0;
	recv_sys.clear();
	recv_sys.parse_start_lsn = *contiguous_lsn;
	recv_sys.scanned_lsn = *contiguous_lsn;
	recv_sys.recovered_lsn = *contiguous_lsn;
	recv_sys.scanned_checkpoint_no = 0;
	ut_ad(recv_max_page_lsn == 0);
	mutex_exit(&recv_sys.mutex);

	lsn_t	start_lsn;
	lsn_t	end_lsn;
	/* Before FILE_CHECKPOINT is located, only validate (STORE_NO). */
	store_t	store	= recv_sys.mlog_checkpoint_lsn == 0
		? STORE_NO : (last_phase ?
STORE_IF_EXISTS : STORE_YES);

	log_sys.log.scanned_lsn = end_lsn = *contiguous_lsn =
		ut_uint64_align_down(*contiguous_lsn, OS_FILE_LOG_BLOCK_SIZE);
	ut_d(recv_sys.after_apply = last_phase);

	do {
		if (last_phase && store == STORE_NO) {
			/* Memory ran out earlier; apply what was stored
			and resume storing from the last stored LSN. */
			store = STORE_IF_EXISTS;
			recv_sys.apply(false);
			/* Rescan the redo logs from last stored lsn */
			end_lsn = recv_sys.recovered_lsn;
		}

		start_lsn = ut_uint64_align_down(end_lsn,
						 OS_FILE_LOG_BLOCK_SIZE);
		end_lsn = start_lsn;
		log_sys.log.read_log_seg(&end_lsn, start_lsn + RECV_SCAN_SIZE);
	} while (end_lsn != start_lsn
		 && !recv_scan_log_recs(&store, log_sys.buf, checkpoint_lsn,
					start_lsn, end_lsn, contiguous_lsn,
					&log_sys.log.scanned_lsn));

	if (recv_sys.found_corrupt_log || recv_sys.found_corrupt_fs) {
		DBUG_RETURN(false);
	}

	DBUG_PRINT("ib_log", ("%s " LSN_PF " completed",
			      last_phase ? "rescan" : "scan",
			      log_sys.log.scanned_lsn));

	/* store == STORE_NO here means storing was switched off because
	memory ran out, i.e. a rescan will be needed. */
	DBUG_RETURN(store == STORE_NO);
}

/** Report a missing tablespace for which page-redo log exists.
@param[in]	err	previous error code
@param[in]	i	tablespace descriptor
@return new error code */
static
dberr_t
recv_init_missing_space(dberr_t err, const recv_spaces_t::const_iterator& i)
{
	if (srv_operation == SRV_OPERATION_RESTORE
	    || srv_operation == SRV_OPERATION_RESTORE_EXPORT) {
		/* Mariabackup restore: missing intermediate (#sql*) files
		are expected and only warned about. */
		if (i->second.name.find(TEMP_TABLE_PATH_PREFIX)
		    != std::string::npos) {
			ib::warn() << "Tablespace " << i->first << " was not"
				" found at " << i->second.name << " when"
				" restoring a (partial?) backup. All redo log"
				" for this file will be ignored!";
		}
		return(err);
	}

	if (srv_force_recovery == 0) {
		ib::error() << "Tablespace " << i->first << " was not"
			" found at " << i->second.name << ".";

		if (err == DB_SUCCESS) {
			ib::error() << "Set innodb_force_recovery=1 to"
				" ignore this and to permanently lose"
				" all changes to the tablespace.";
			err = DB_TABLESPACE_NOT_FOUND;
		}
	} else {
		ib::warn() << "Tablespace " << i->first << " was not"
			" found at " << i->second.name << ", and"
			" innodb_force_recovery was set. All redo log"
			" for this tablespace will be ignored!";
	}

	return(err);
}

/** Report the missing tablespace and discard the redo logs for the deleted
tablespace.
@param[in]	rescan			rescan of redo logs is needed
					if hash table ran out of memory
@param[out]	missing_tablespace	missing tablespace exists or not
@return error code or DB_SUCCESS. */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
recv_validate_tablespace(bool rescan, bool& missing_tablespace)
{
	dberr_t err = DB_SUCCESS;

	mutex_enter(&recv_sys.mutex);

	/* Drop buffered records for tablespaces that are known to be
	deleted or that were reported missing. */
	for (recv_sys_t::map::iterator p = recv_sys.pages.begin();
	     p != recv_sys.pages.end();) {
		ut_ad(!p->second.log.empty());
		const ulint space = p->first.space();
		if (is_predefined_tablespace(space)) {
next:
			p++;
			continue;
		}

		recv_spaces_t::iterator i = recv_spaces.find(space);
		ut_ad(i != recv_spaces.end());

		switch (i->second.status) {
		case file_name_t::NORMAL:
			goto next;
		case file_name_t::MISSING:
			err = recv_init_missing_space(err, i);
			i->second.status = file_name_t::DELETED;
			/* fall through */
		case file_name_t::DELETED:
			recv_sys_t::map::iterator r = p++;
			r->second.log.clear();
			recv_sys.pages.erase(r);
			continue;
		}
		ut_ad(0);
	}

	if (err != DB_SUCCESS) {
func_exit:
		mutex_exit(&recv_sys.mutex);
		return(err);
	}

	/* When rescan is not needed, recv_sys.pages will contain the
	entire redo log. If rescan is needed or innodb_force_recovery
	is set, we can ignore missing tablespaces. */
	for (const recv_spaces_t::value_type& rs : recv_spaces) {
		if (UNIV_LIKELY(rs.second.status != file_name_t::MISSING)) {
			continue;
		}

		missing_tablespace = true;

		if (srv_force_recovery > 0) {
			ib::warn() << "Tablespace " << rs.first
				<<" was not found at " << rs.second.name
				<<", and innodb_force_recovery was set."
				<<" All redo log for this tablespace"
				<<" will be ignored!";
			continue;
		}

		if (!rescan) {
			ib::info() << "Tablespace " << rs.first
				<< " was not found at '"
				<< rs.second.name << "', but there"
				<<" were no modifications either.";
		}
	}

	if (!rescan || srv_force_recovery > 0) {
		missing_tablespace = false;
	}

	err = DB_SUCCESS;
	goto func_exit;
}

/** Check if all tablespaces were found for crash recovery.
@param[in]	rescan			rescan of redo logs is needed
@param[out]	missing_tablespace	missing table exists
@return error code or DB_SUCCESS */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
{
	bool		flag_deleted	= false;

	ut_ad(!srv_read_only_mode);
	ut_ad(recv_needed_recovery);

	for (recv_spaces_t::value_type& rs : recv_spaces) {
		ut_ad(!is_predefined_tablespace(rs.first));
		ut_ad(rs.second.status != file_name_t::DELETED
		      || !rs.second.space);

		if (rs.second.status == file_name_t::DELETED) {
			/* The tablespace was deleted,
			so we can ignore any redo log for it. */
			flag_deleted = true;
		} else if (rs.second.space != NULL) {
			/* The tablespace was found, and there
			are some redo log records for it.
*/ + fil_names_dirty(rs.second.space); + + /* Add the freed page ranges in the respective + tablespace */ + if (!rs.second.freed_ranges.empty() + && (srv_immediate_scrub_data_uncompressed + || rs.second.space->is_compressed())) { + + rs.second.space->add_free_ranges( + std::move(rs.second.freed_ranges)); + } + } else if (rs.second.name == "") { + ib::error() << "Missing FILE_CREATE, FILE_DELETE" + " or FILE_MODIFY before FILE_CHECKPOINT" + " for tablespace " << rs.first; + recv_sys.found_corrupt_log = true; + return(DB_CORRUPTION); + } else { + rs.second.status = file_name_t::MISSING; + flag_deleted = true; + } + + ut_ad(rs.second.status == file_name_t::DELETED + || rs.second.name != ""); + } + + if (flag_deleted) { + return recv_validate_tablespace(rescan, missing_tablespace); + } + + return DB_SUCCESS; +} + +/** Start recovering from a redo log checkpoint. +@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN +of first system tablespace page +@return error code or DB_SUCCESS */ +dberr_t +recv_recovery_from_checkpoint_start(lsn_t flush_lsn) +{ + ulint max_cp_field; + lsn_t checkpoint_lsn; + bool rescan = false; + ib_uint64_t checkpoint_no; + lsn_t contiguous_lsn; + byte* buf; + dberr_t err = DB_SUCCESS; + + ut_ad(srv_operation == SRV_OPERATION_NORMAL + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex)); + ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0); + ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); + ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex)); + + if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { + + ib::info() << "innodb_force_recovery=6 skips redo log apply"; + + return(DB_SUCCESS); + } + + recv_sys.recovery_on = true; + + mysql_mutex_lock(&log_sys.mutex); + + err = recv_find_max_checkpoint(&max_cp_field); + + if (err != DB_SUCCESS) { + + recv_sys.recovered_lsn = log_sys.get_lsn(); + mysql_mutex_unlock(&log_sys.mutex); + return(err); + } + + buf = 
log_sys.checkpoint_buf; + log_sys.log.read(max_cp_field, {buf, OS_FILE_LOG_BLOCK_SIZE}); + + checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN); + checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO); + + /* Start reading the log from the checkpoint lsn. The variable + contiguous_lsn contains an lsn up to which the log is known to + be contiguously written. */ + recv_sys.mlog_checkpoint_lsn = 0; + + ut_ad(RECV_SCAN_SIZE <= srv_log_buffer_size); + + const lsn_t end_lsn = mach_read_from_8( + buf + LOG_CHECKPOINT_END_LSN); + + ut_ad(recv_sys.pages.empty()); + contiguous_lsn = checkpoint_lsn; + switch (log_sys.log.format) { + case 0: + mysql_mutex_unlock(&log_sys.mutex); + return DB_SUCCESS; + default: + if (end_lsn == 0) { + break; + } + if (end_lsn >= checkpoint_lsn) { + contiguous_lsn = end_lsn; + break; + } + recv_sys.found_corrupt_log = true; + mysql_mutex_unlock(&log_sys.mutex); + return(DB_ERROR); + } + + size_t sizeof_checkpoint; + + if (!log_sys.is_physical()) { + sizeof_checkpoint = 9/* size of MLOG_CHECKPOINT */; + goto completed; + } + + /* Look for FILE_CHECKPOINT. */ + recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false); + /* The first scan should not have stored or applied any records. 
*/ + ut_ad(recv_sys.pages.empty()); + ut_ad(!recv_sys.found_corrupt_fs); + + if (srv_read_only_mode && recv_needed_recovery) { + mysql_mutex_unlock(&log_sys.mutex); + return(DB_READ_ONLY); + } + + if (recv_sys.found_corrupt_log && !srv_force_recovery) { + mysql_mutex_unlock(&log_sys.mutex); + ib::warn() << "Log scan aborted at LSN " << contiguous_lsn; + return(DB_ERROR); + } + + if (recv_sys.mlog_checkpoint_lsn == 0) { + lsn_t scan_lsn = log_sys.log.scanned_lsn; + if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) { + mysql_mutex_unlock(&log_sys.mutex); + ib::error err; + err << "Missing FILE_CHECKPOINT"; + if (end_lsn) { + err << " at " << end_lsn; + } + err << " between the checkpoint " << checkpoint_lsn + << " and the end " << scan_lsn << "."; + return(DB_ERROR); + } + + log_sys.log.scanned_lsn = checkpoint_lsn; + } else { + contiguous_lsn = checkpoint_lsn; + rescan = recv_group_scan_log_recs( + checkpoint_lsn, &contiguous_lsn, false); + + if ((recv_sys.found_corrupt_log && !srv_force_recovery) + || recv_sys.found_corrupt_fs) { + mysql_mutex_unlock(&log_sys.mutex); + return(DB_ERROR); + } + } + + /* NOTE: we always do a 'recovery' at startup, but only if + there is something wrong we will print a message to the + user about recovery: */ + sizeof_checkpoint= SIZE_OF_FILE_CHECKPOINT; + +completed: + if (flush_lsn == checkpoint_lsn + sizeof_checkpoint + && recv_sys.mlog_checkpoint_lsn == checkpoint_lsn) { + /* The redo log is logically empty. */ + } else if (checkpoint_lsn != flush_lsn) { + ut_ad(!srv_log_file_created); + + if (checkpoint_lsn + sizeof_checkpoint < flush_lsn) { + ib::warn() + << "Are you sure you are using the right " + << LOG_FILE_NAME + << " to start up the database? 
Log sequence " + "number in the " + << LOG_FILE_NAME << " is " << checkpoint_lsn + << ", less than the log sequence number in " + "the first system tablespace file header, " + << flush_lsn << "."; + } + + if (!recv_needed_recovery) { + + ib::info() + << "The log sequence number " << flush_lsn + << " in the system tablespace does not match" + " the log sequence number " + << checkpoint_lsn << " in the " + << LOG_FILE_NAME << "!"; + + if (srv_read_only_mode) { + ib::error() << "innodb_read_only" + " prevents crash recovery"; + mysql_mutex_unlock(&log_sys.mutex); + return(DB_READ_ONLY); + } + + recv_needed_recovery = true; + } + } + + log_sys.set_lsn(recv_sys.recovered_lsn); + if (UNIV_LIKELY(log_sys.get_flushed_lsn() < recv_sys.recovered_lsn)) { + /* This may already have been set by create_log_file() + if no logs existed when the server started up. */ + log_sys.set_flushed_lsn(recv_sys.recovered_lsn); + } + + if (recv_needed_recovery) { + bool missing_tablespace = false; + + err = recv_init_crash_recovery_spaces( + rescan, missing_tablespace); + + if (err != DB_SUCCESS) { + mysql_mutex_unlock(&log_sys.mutex); + return(err); + } + + /* If there is any missing tablespace and rescan is needed + then there is a possiblity that hash table will not contain + all space ids redo logs. Rescan the remaining unstored + redo logs for the validation of missing tablespace. */ + ut_ad(rescan || !missing_tablespace); + + while (missing_tablespace) { + DBUG_PRINT("ib_log", ("Rescan of redo log to validate " + "the missing tablespace. Scan " + "from last stored LSN " LSN_PF, + recv_sys.last_stored_lsn)); + + lsn_t recent_stored_lsn = recv_sys.last_stored_lsn; + rescan = recv_group_scan_log_recs( + checkpoint_lsn, &recent_stored_lsn, false); + + ut_ad(!recv_sys.found_corrupt_fs); + + missing_tablespace = false; + + err = recv_sys.found_corrupt_log + ? 
DB_ERROR + : recv_validate_tablespace( + rescan, missing_tablespace); + + if (err != DB_SUCCESS) { + mysql_mutex_unlock(&log_sys.mutex); + return err; + } + + rescan = true; + } + + recv_sys.parse_start_lsn = checkpoint_lsn; + + if (srv_operation == SRV_OPERATION_NORMAL) { + buf_dblwr.recover(); + } + + ut_ad(srv_force_recovery <= SRV_FORCE_NO_UNDO_LOG_SCAN); + + if (rescan) { + contiguous_lsn = checkpoint_lsn; + + recv_group_scan_log_recs( + checkpoint_lsn, &contiguous_lsn, true); + + if ((recv_sys.found_corrupt_log + && !srv_force_recovery) + || recv_sys.found_corrupt_fs) { + mysql_mutex_unlock(&log_sys.mutex); + return(DB_ERROR); + } + } + } else { + ut_ad(!rescan || recv_sys.pages.empty()); + } + + if (log_sys.is_physical() + && (log_sys.log.scanned_lsn < checkpoint_lsn + || log_sys.log.scanned_lsn < recv_max_page_lsn)) { + + ib::error() << "We scanned the log up to " + << log_sys.log.scanned_lsn + << ". A checkpoint was at " << checkpoint_lsn << " and" + " the maximum LSN on a database page was " + << recv_max_page_lsn << ". It is possible that the" + " database is now corrupt!"; + } + + if (recv_sys.recovered_lsn < checkpoint_lsn) { + mysql_mutex_unlock(&log_sys.mutex); + + ib::error() << "Recovered only to lsn:" + << recv_sys.recovered_lsn + << " checkpoint_lsn: " << checkpoint_lsn; + + return(DB_ERROR); + } + + log_sys.next_checkpoint_lsn = checkpoint_lsn; + log_sys.next_checkpoint_no = checkpoint_no + 1; + + recv_synchronize_groups(); + + ut_ad(recv_needed_recovery + || checkpoint_lsn == recv_sys.recovered_lsn); + + log_sys.write_lsn = log_sys.get_lsn(); + log_sys.buf_free = log_sys.write_lsn % OS_FILE_LOG_BLOCK_SIZE; + log_sys.buf_next_to_write = log_sys.buf_free; + + log_sys.last_checkpoint_lsn = checkpoint_lsn; + + if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL) { + /* Write a FILE_CHECKPOINT marker as the first thing, + before generating any other redo log. 
This ensures + that subsequent crash recovery will be possible even + if the server were killed soon after this. */ + fil_names_clear(log_sys.last_checkpoint_lsn, true); + } + + log_sys.next_checkpoint_no = ++checkpoint_no; + + mutex_enter(&recv_sys.mutex); + + recv_sys.apply_log_recs = true; + recv_no_ibuf_operations = false; + ut_d(recv_no_log_write = srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + + mutex_exit(&recv_sys.mutex); + + mysql_mutex_unlock(&log_sys.mutex); + + recv_lsn_checks_on = true; + + /* The database is now ready to start almost normal processing of user + transactions: transaction rollbacks and the application of the log + records in the hash table can be run in background. */ + + return(DB_SUCCESS); +} + +bool recv_dblwr_t::validate_page(const page_id_t page_id, + const byte *page, + const fil_space_t *space, + byte *tmp_buf) +{ + if (page_id.page_no() == 0) + { + ulint flags= fsp_header_get_flags(page); + if (!fil_space_t::is_valid_flags(flags, page_id.space())) + { + ulint cflags= fsp_flags_convert_from_101(flags); + if (cflags == ULINT_UNDEFINED) + { + ib::warn() << "Ignoring a doublewrite copy of page " << page_id + << "due to invalid flags " << ib::hex(flags); + return false; + } + + flags= cflags; + } + + /* Page 0 is never page_compressed or encrypted. 
*/ + return !buf_page_is_corrupted(true, page, flags); + } + + ut_ad(tmp_buf); + byte *tmp_frame= tmp_buf; + byte *tmp_page= tmp_buf + srv_page_size; + const uint16_t page_type= mach_read_from_2(page + FIL_PAGE_TYPE); + const bool expect_encrypted= space->crypt_data && + space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED; + + if (space->full_crc32()) + return !buf_page_is_corrupted(true, page, space->flags); + + if (expect_encrypted && + mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)) + { + if (!fil_space_verify_crypt_checksum(page, space->zip_size())) + return false; + if (page_type != FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) + return true; + if (space->zip_size()) + return false; + memcpy(tmp_page, page, space->physical_size()); + if (!fil_space_decrypt(space, tmp_frame, tmp_page)) + return false; + } + + switch (page_type) { + case FIL_PAGE_PAGE_COMPRESSED: + memcpy(tmp_page, page, space->physical_size()); + /* fall through */ + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + if (space->zip_size()) + return false; /* ROW_FORMAT=COMPRESSED cannot be page_compressed */ + ulint decomp= fil_page_decompress(tmp_frame, tmp_page, space->flags); + if (!decomp) + return false; /* decompression failed */ + if (decomp == srv_page_size) + return false; /* the page was not compressed (invalid page type) */ + return !buf_page_is_corrupted(true, tmp_page, space->flags); + } + + return !buf_page_is_corrupted(true, page, space->flags); +} + +byte *recv_dblwr_t::find_page(const page_id_t page_id, + const fil_space_t *space, byte *tmp_buf) +{ + byte *result= NULL; + lsn_t max_lsn= 0; + + for (byte *page : pages) + { + if (page_get_page_no(page) != page_id.page_no() || + page_get_space_id(page) != page_id.space()) + continue; + const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN); + if (lsn <= max_lsn || + !validate_page(page_id, page, space, tmp_buf)) + { + /* Mark processed for subsequent iterations in buf_dblwr_t::recover() */ + memset(page + FIL_PAGE_LSN, 0, 8); + 
continue; + } + max_lsn= lsn; + result= page; + } + + return result; +} |