diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
commit | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch) | |
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/include/log0recv.h | |
parent | Initial commit. (diff) | |
download | mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.tar.xz mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.zip |
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/include/log0recv.h')
-rw-r--r-- | storage/innobase/include/log0recv.h | 491 |
1 files changed, 491 insertions, 0 deletions
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h new file mode 100644 index 00000000..6d75e15a --- /dev/null +++ b/storage/innobase/include/log0recv.h @@ -0,0 +1,491 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0recv.h +Recovery + +Created 9/20/1997 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "ut0new.h" +#include "buf0types.h" +#include "log0log.h" +#include "mtr0types.h" + +#include <deque> +#include <map> + +/** @return whether recovery is currently running. */ +#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on) + +ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Apply any buffered redo log to a page. +@param space tablespace +@param bpage buffer pool page +@return whether the page was recovered correctly */ +bool recv_recover_page(fil_space_t* space, buf_page_t* bpage); + +/** Start recovering from a redo log checkpoint. 
+@return error code or DB_SUCCESS */ +dberr_t recv_recovery_from_checkpoint_start(); + +/** Report an operation to create, delete, or rename a file during backup. +@param[in] space_id tablespace identifier +@param[in] type file operation redo log type +@param[in] name file name (not NUL-terminated) +@param[in] len length of name, in bytes +@param[in] new_name new file name (NULL if not rename) +@param[in] new_len length of new_name, in bytes (0 if NULL) */ +extern void (*log_file_op)(uint32_t space_id, int type, + const byte* name, ulint len, + const byte* new_name, ulint new_len); + +/** Report an operation which does undo log tablespace truncation +during backup +@param space_id undo tablespace identifier */ +extern void (*undo_space_trunc)(uint32_t space_id); + +/** Report an operation which does INIT_PAGE for page0 during backup. +@param space_id tablespace identifier */ +extern void (*first_page_init)(uint32_t space_id); + +/** Stored redo log record; an element of a singly linked list +(linked via next). Default construction, copy construction and +copy assignment are deleted. */ +struct log_rec_t +{ + log_rec_t(lsn_t lsn) : next(nullptr), lsn(lsn) { ut_ad(lsn); } + log_rec_t()= delete; + log_rec_t(const log_rec_t&)= delete; + log_rec_t &operator=(const log_rec_t&)= delete; + + /** next record (initialized to nullptr by the constructor) */ + log_rec_t *next; + /** mtr_t::commit_lsn() of the mini-transaction */ + const lsn_t lsn; +}; + +struct recv_dblwr_t +{ + /** Add a page frame to the front of the doublewrite recovery buffer. + @param page page frame to add to pages */ + void add(byte *page) { pages.push_front(page); } + + /** Validate the page. + @param page_id page identifier + @param page page contents + @param space the tablespace of the page (not available for page 0) + @param tmp_buf 2*srv_page_size for decrypting and decompressing any + page_compressed or encrypted pages + @return whether the page is valid */ + bool validate_page(const page_id_t page_id, const byte *page, + const fil_space_t *space, byte *tmp_buf); + + /** Find a doublewrite copy of a page. 
+ @param page_id page identifier + @param space tablespace (not available for page_id.page_no()==0) + @param tmp_buf 2*srv_page_size for decrypting and decompressing any + page_compressed or encrypted pages + @return page frame + @retval NULL if no valid page for page_id was found */ + byte* find_page(const page_id_t page_id, const fil_space_t *space= NULL, + byte *tmp_buf= NULL); + + /** Restore the first page of the given tablespace from + doublewrite buffer. + @param space_id tablespace identifier + @param name tablespace filepath + @param file tablespace file handle + @return whether the operation failed (note: true means failure) */ + bool restore_first_page(uint32_t space_id, const char *name, os_file_t file); + + typedef std::deque<byte*, ut_allocator<byte*> > list; + + /** Recovered doublewrite buffer page frames; add() inserts at the front */ + list pages; +}; + +/** recv_sys.pages entry; protected by recv_sys.mutex */ +struct page_recv_t +{ + /** Recovery status: 0=not in progress, 1=log is being applied, + -1=log has been applied and the entry may be erased. + Transitions from 1 to -1 are NOT protected by recv_sys.mutex. */ + Atomic_relaxed<int8_t> being_processed{0}; + /** Whether reading the page will be skipped */ + bool skip_read= false; + /** Latest written byte offset when applying the log records. 
+ @see mtr_t::m_last_offset */ + uint16_t last_offset= 1; + /** log records for a page, kept as a singly linked list of log_rec_t + that runs from head to tail via log_rec_t::next */ + class recs_t + { + /** The first log record, or nullptr if none has been appended */ + log_rec_t *head= nullptr; + /** The last log record, or nullptr if none has been appended */ + log_rec_t *tail= nullptr; + friend struct page_recv_t; + public: + /** Append a redo log snippet for the page. + The snippet becomes the new tail; if there was no tail, + it also becomes the head. + @param recs log snippet */ + void append(log_rec_t* recs) + { + if (tail) + tail->next= recs; + else + head= recs; + tail= recs; + } + /** Remove the last records for the page + @param start_lsn start of the removed log */ + ATTRIBUTE_COLD void rewind(lsn_t start_lsn); + + /** @return the last log snippet (tail); const overload */ + const log_rec_t* last() const { return tail; } + /** @return the last log snippet (tail); non-const overload */ + log_rec_t* last() { return tail; } + + class iterator + { + log_rec_t *cur; + public: + iterator(log_rec_t* rec) : cur(rec) {} + log_rec_t* operator*() const { return cur; } + iterator &operator++() { cur= cur->next; return *this; } + bool operator!=(const iterator& i) const { return cur != i.cur; } + }; + iterator begin() { return head; } + iterator end() { return NULL; } + bool empty() const { ut_ad(!head == !tail); return !head; } + /** Clear and free the records; @see recv_sys_t::add() */ + void clear(); + } log; + + /** Trim old log records for a page. + @param start_lsn oldest log sequence number to preserve + @return whether all the log for the page was trimmed */ + inline bool trim(lsn_t start_lsn); + /** Ignore any earlier redo log records for this page. */ + inline void will_not_read(); +}; + +/** A page initialization operation that was parsed from the redo log */ +struct recv_init +{ + /** log sequence number of the page initialization */ + lsn_t lsn; + /** Whether btr_page_create() avoided a read of the page. + At the end of the last recovery batch, mark_ibuf_exist() + will mark pages for which this flag is set. 
+ */ + bool created; +}; + +/** Recovery system data structure */ +struct recv_sys_t +{ + using init= recv_init; + + /** mutex protecting this as well as some of page_recv_t */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex; +private: + /** set when finding a corrupt log block or record, or there is a + log parsing buffer overflow; + @see set_corrupt_log(), is_corrupt_log() */ + bool found_corrupt_log; + /** set when an inconsistency with the file system contents is detected + during log scan or apply; + @see set_corrupt_fs(), is_corrupt_fs() */ + bool found_corrupt_fs; +public: + /** maximum guaranteed size of a mini-transaction on recovery */ + static constexpr size_t MTR_SIZE_MAX{1U << 20}; + + /** whether we are applying redo log records during crash recovery */ + bool recovery_on; + /** whether recv_recover_page(), invoked from buf_page_t::read_complete(), + should apply log records */ + bool apply_log_recs; + /** number of bytes in log_sys.buf */ + size_t len; + /** start offset of non-parsed log records in log_sys.buf */ + size_t offset; + /** log sequence number of the first non-parsed record */ + lsn_t lsn; + /** log sequence number of the last parsed mini-transaction; + is_initialised() reports whether this is nonzero */ + lsn_t scanned_lsn; + /** log sequence number at the end of the FILE_CHECKPOINT record, or 0 */ + lsn_t file_checkpoint; + /** the time when progress was last reported */ + time_t progress_time; + + using map = std::map<const page_id_t, page_recv_t, + std::less<const page_id_t>, + ut_allocator<std::pair<const page_id_t, page_recv_t>>>; + /** buffered records waiting to be applied to pages */ + map pages; + +private: + /** iterator to pages, used by parse() */ + map::iterator pages_it; + + /** Process a record that indicates that a tablespace size is being shrunk. 
+ @param page_id first page that is not in the file + @param lsn log sequence number of the shrink operation */ + inline void trim(const page_id_t page_id, lsn_t lsn); + + /** Undo tablespaces for which truncate has been logged + (indexed by page_id_t::space() - srv_undo_space_id_start) */ + struct trunc + { + /** log sequence number of FILE_CREATE, or 0 if none */ + lsn_t lsn; + /** truncated size of the tablespace, or 0 if not truncated */ + unsigned pages; + } truncated_undo_spaces[127]; + +public: + /** The contents of the doublewrite buffer */ + recv_dblwr_t dblwr; + + __attribute__((warn_unused_result)) + inline dberr_t read(os_offset_t offset, span<byte> buf); + inline size_t files_size(); + void close_files(); + + /** Advance pages_it to the next element if it matches the iterator. + The caller must hold mutex (this is asserted). + @param p iterator to compare pages_it against */ + void pages_it_invalidate(const map::iterator &p) + { + mysql_mutex_assert_owner(&mutex); + if (pages_it == p) + pages_it++; + } + /** Invalidate pages_it (set it to pages.end()) if it points to + the given tablespace. The caller must hold mutex (this is asserted). + @param space_id tablespace identifier */ + void pages_it_invalidate(uint32_t space_id) + { + mysql_mutex_assert_owner(&mutex); + if (pages_it != pages.end() && pages_it->first.space() == space_id) + pages_it= pages.end(); + } + +private: + /** Attempt to initialize a page based on redo log records. + @param p iterator + @param mtr mini-transaction + @param b pre-allocated buffer pool block + @param init page initialization + @return the recovered block + @retval nullptr if the page cannot be initialized based on log records + @retval -1 if the page cannot be recovered due to corruption */ + inline buf_block_t *recover_low(const map::iterator &p, mtr_t &mtr, + buf_block_t *b, init &init); + /** Attempt to initialize a page based on redo log records. 
+ @param page_id page identifier + @return the recovered block + @retval nullptr if the page cannot be initialized based on log records + @retval -1 if the page cannot be recovered due to corruption */ + ATTRIBUTE_COLD buf_block_t *recover_low(const page_id_t page_id); + + /** All found log files (multiple ones are possible if we are upgrading + from before MariaDB Server 10.5.1) */ + std::vector<log_file_t> files; + + /** Base node of the redo block list. + List elements are linked via buf_block_t::unzip_LRU. */ + UT_LIST_BASE_NODE_T(buf_block_t) blocks; + + /** Allocate a block from the buffer pool for recv_sys.pages */ + ATTRIBUTE_COLD buf_block_t *add_block(); + + /** Wait for buffer pool to become available. + @param pages number of buffer pool pages needed */ + ATTRIBUTE_COLD void wait_for_pool(size_t pages); + + /** Free log for processed pages. */ + void garbage_collect(); + + /** Apply a recovery batch. + @param space_id current tablespace identifier + @param space current tablespace (passed by reference) + @param free_block spare buffer block (passed by reference) + @param last_batch whether it is possible to write more redo log + @return whether the caller must provide a new free_block */ + bool apply_batch(uint32_t space_id, fil_space_t *&space, + buf_block_t *&free_block, bool last_batch); + +public: + /** Apply buffered log to persistent data pages. + @param last_batch whether it is possible to write more redo log */ + void apply(bool last_batch); + +#ifdef UNIV_DEBUG + /** whether all redo log in the current batch has been applied + (this member exists only when UNIV_DEBUG is defined) */ + bool after_apply= false; +#endif + /** Initialize the redo log recovery subsystem. */ + void create(); + + /** Free most recovery data structures. */ + void debug_free(); + + /** Clean up after create() */ + void close(); + + bool is_initialised() const { return scanned_lsn != 0; } + + /** Find the latest checkpoint. + @return error code or DB_SUCCESS */ + dberr_t find_checkpoint(); + + /** Register a redo log snippet for a page. 
+ @param it page iterator + @param start_lsn start LSN of the mini-transaction + @param lsn @see mtr_t::commit_lsn() + @param l redo log snippet + @param len length of l, in bytes + @return whether we ran out of memory */ + bool add(map::iterator it, lsn_t start_lsn, lsn_t lsn, + const byte *l, size_t len); + + /** Parsing result, as returned by parse(), parse_mtr() and parse_pmem() */ + enum parse_mtr_result { + /** a record was successfully parsed */ + OK, + /** the log ended prematurely (need to read more) */ + PREMATURE_EOF, + /** the end of the log was reached */ + GOT_EOF, + /** parse<true>(l, false) ran out of memory */ + GOT_OOM + }; + +private: + /** Parse and register one log_t::FORMAT_10_8 mini-transaction. + @tparam source type of the log data source + @tparam store whether to store the records + @param l log data source + @param if_exists if store: whether to check if the tablespace exists + @return parsing result */ + template<typename source,bool store> + inline parse_mtr_result parse(source &l, bool if_exists) noexcept; + + /** Rewind a mini-transaction when parse() runs out of memory. + @tparam source type of the log data source + @param l log data source + @param begin start of the mini-transaction */ + template<typename source> + ATTRIBUTE_COLD void rewind(source &l, source &begin) noexcept; + + /** Report progress in terms of LSN or pages remaining */ + ATTRIBUTE_COLD void report_progress() const; +public: + /** Parse and register one log_t::FORMAT_10_8 mini-transaction, + handling log_sys.is_pmem() buffer wrap-around. + @tparam store whether to store the records + @param if_exists if store: whether to check if the tablespace exists + @return parsing result */ + template<bool store> + static parse_mtr_result parse_mtr(bool if_exists) noexcept; + + /** Parse and register one log_t::FORMAT_10_8 mini-transaction, + handling log_sys.is_pmem() buffer wrap-around. 
+ When HAVE_PMEM is not defined, this is defined inline and simply + forwards to parse_mtr<store>(if_exists); otherwise it is only + declared here. + @tparam store whether to store the records + @param if_exists if store: whether to check if the tablespace exists + @return parsing result */ + template<bool store> + static parse_mtr_result parse_pmem(bool if_exists) noexcept +#ifdef HAVE_PMEM + ; +#else + { return parse_mtr<store>(if_exists); } +#endif + + /** Erase log records for a page. + @param p iterator to the page entry */ + void erase(map::iterator p); + + /** Clear a fully processed set of stored redo log records. */ + void clear(); + + /** Determine whether redo log recovery progress should be reported. + @param time the current time + @return whether progress should be reported + (the last report was at least 15 seconds ago) */ + bool report(time_t time); + + /** The alloc() memory alignment, in bytes */ + static constexpr size_t ALIGNMENT= sizeof(size_t); + + /** Free a redo log snippet. + @param data buffer allocated in add() */ + inline void free(const void *data); + + /** Remove records for a corrupted page. + This function should only be called when innodb_force_recovery is set. + @param page_id corrupted page identifier */ + ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id); + + /** Flag data file corruption during recovery. */ + ATTRIBUTE_COLD void set_corrupt_fs(); + /** Flag log file corruption during recovery. */ + ATTRIBUTE_COLD void set_corrupt_log(); + + /** @return whether data file corruption was found */ + bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); } + /** @return whether log file corruption was found */ + bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); } + + /** Attempt to initialize a page based on redo log records. + Unless recovery_on is set, this returns nullptr without calling + recover_low(). + @param page_id page identifier + @return the recovered block + @retval nullptr if the page cannot be initialized based on log records + @retval -1 if the page cannot be recovered due to corruption */ + buf_block_t *recover(const page_id_t page_id) + { + return UNIV_UNLIKELY(recovery_on) ? 
recover_low(page_id) : nullptr; + } + + /** Try to recover a tablespace that was not readable earlier + @param p iterator + @param name tablespace file name + @param free_block spare buffer block + @return recovered tablespace + @retval nullptr if recovery failed */ + fil_space_t *recover_deferred(const map::iterator &p, + const std::string &name, + buf_block_t *&free_block); +}; + +/** The recovery system (global instance) */ +extern recv_sys_t recv_sys; + +/** If the following is TRUE, the buffer pool file pages must be invalidated +after recovery and no ibuf operations are allowed; this will be set if +recv_sys.pages becomes too full, and log records must be merged +to file pages already before the recovery is finished: in this case no +ibuf operations are allowed, as they could modify the pages read in the +buffer pool before the pages have been recovered to the up-to-date state. + +TRUE means that recovery is running and no operations on the log files +are allowed yet: the variable name is misleading. */ +extern bool recv_no_ibuf_operations; +/** TRUE when recv_init_crash_recovery() has been called. */ +extern bool recv_needed_recovery; +#ifdef UNIV_DEBUG +/** whether writing to the redo log is forbidden; +protected by exclusive log_sys.latch. */ +extern bool recv_no_log_write; +#endif /* UNIV_DEBUG */ + +/** TRUE if buf_page_is_corrupted() should check if the log sequence +number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by +recv_recovery_from_checkpoint_start(). */ +extern bool recv_lsn_checks_on; |