/* Copyright (C) 2007 MySQL AB & Sanja Belkin. 2010 Monty Program Ab. Copyright (c) 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ #include "maria_def.h" #include "trnman.h" #include "ma_blockrec.h" /* for some constants and in-write hooks */ #include "ma_key_recover.h" /* For some in-write hooks */ #include "ma_checkpoint.h" #include "ma_servicethread.h" #include "ma_recovery.h" #include "ma_loghandler_lsn.h" #include "ma_recovery_util.h" /* On Windows, neither my_open() nor mysql_file_sync() work for directories. Also there is no need to flush filesystem changes ,i.e to sync() directories. 
*/
#ifdef _WIN32
#define sync_dir(A,B) 0
#else
#define sync_dir(A,B) mysql_file_sync(A,B)
#endif

/**
   @file
   @brief Module which writes to and reads from a transaction log
*/

/* 0xFF can never be a valid first byte of a chunk */
#define TRANSLOG_FILLER 0xFF

/* number of opened log files in the pagecache (should be at least 2) */
#define OPENED_FILES_NUM 3
#define CACHED_FILES_NUM 5
#define CACHED_FILES_NUM_DIRECT_SEARCH_LIMIT 7

#if CACHED_FILES_NUM > CACHED_FILES_NUM_DIRECT_SEARCH_LIMIT
/*
  NOTE(review): the header names of the two #include directives below are
  missing in this copy of the file.  The branch is inactive with the values
  defined above (5 > 7 is false), but the names must be restored before the
  limits are changed.
*/
#include
#include
#endif

/**
   @brief mutex handed to soft_sync_control below

   NOTE(review): the previous wording ("protects checkpoint_in_progress")
   mentions a name that does not appear in this part of the file; it looks
   copied from another module - confirm what this mutex really protects.
*/
static mysql_mutex_t LOCK_soft_sync;
/**
   @brief condition variable handed to soft_sync_control below

   NOTE(review): previously described as "for killing the background
   checkpoint thread"; judging by the name it belongs to the soft sync
   service thread - confirm.
*/
static mysql_cond_t COND_soft_sync;
/**
   @brief control structure built on LOCK_soft_sync and COND_soft_sync

   NOTE(review): previously described as the control structure "for
   checkpoint background thread"; presumably it controls the soft sync
   service thread - confirm.
*/
static MA_SERVICE_THREAD_CONTROL soft_sync_control=
  {0, FALSE, FALSE, &LOCK_soft_sync, &COND_soft_sync};

uint log_purge_disabled= 0;


/* transaction log file descriptor */
typedef struct st_translog_file
{
  /* number of the log file (get_logfile_by_number() checks it) */
  uint32 number;
  /* page cache file; handler.file is the OS-level file handler */
  PAGECACHE_FILE handler;
  my_bool was_recovered;
  my_bool is_sync;
} TRANSLOG_FILE;

/* records buffer size (should be TRANSLOG_PAGE_SIZE * n) */
#define TRANSLOG_WRITE_BUFFER (1024*1024)

/*
  pagecache_read/write/inject() use bmove512() on their buffers so those must
  be long-aligned, which we guarantee by using the type below:
*/
typedef union
{
  ulonglong dummy;
  uchar buffer[TRANSLOG_PAGE_SIZE];
} TRANSLOG_PAGE_SIZE_BUFF;

#define MAX_TRUNSLOG_USED_BUFFERS 3

/*
  Small fixed-size list of buffers with two indexes into it: wrt_ptr and
  unlck_ptr (both are reset to 0 by used_buffs_init()).
*/
typedef struct
{
  struct st_translog_buffer *buff[MAX_TRUNSLOG_USED_BUFFERS];
  uint8 wrt_ptr;
  uint8 unlck_ptr;
} TRUNSLOG_USED_BUFFERS;

/*
  Reset a used-buffers list: both indexes are set to 0.  The buff[] slots
  themselves are left untouched.
*/
static void
used_buffs_init(TRUNSLOG_USED_BUFFERS *buffs)
{
  buffs->unlck_ptr= buffs->wrt_ptr= 0;
}

/* The remaining used-buffers helpers are only declared here */
static void
used_buffs_add(TRUNSLOG_USED_BUFFERS *buffs,
               struct st_translog_buffer *buff);

static void
used_buffs_register_unlock(TRUNSLOG_USED_BUFFERS *buffs,
                           struct st_translog_buffer *buff);

static void
used_buffs_urgent_unlock(TRUNSLOG_USED_BUFFERS *buffs);

/* min chunk length */
#define TRANSLOG_MIN_CHUNK 3
/* Number of buffers
used by loghandler Should be at least 4, because one thread can block up to
2 buffers in normal circumstances (less then half of one and full other, or
just switched one and other), But if we met end of the file in the middle and
have to switch buffer it will be 3. + 1 buffer for flushing/writing. We have
a bigger number here for higher concurrency and to make division faster. The
number should be power of 2 to be fast. */
#define TRANSLOG_BUFFERS_NO 8
/* number of bytes (+ header) which can be unused on first page in sequence */
#define TRANSLOG_MINCHUNK_CONTENT 1
/* version of log file */
#define TRANSLOG_VERSION_ID 10000 /* 1.00.00 */

#define TRANSLOG_PAGE_FLAGS 6 /* transaction log page flags offset */

/* Maximum length of compressed LSNs (the worst case of whole LSN storing) */
#define COMPRESSED_LSN_MAX_STORE_SIZE (2 + LSN_STORE_SIZE)
#define MAX_NUMBER_OF_LSNS_PER_RECORD 2


/*
  Max LSN calculation for a buffer: its own last_lsn, or prev_last_lsn when
  no LSN ends in the buffer (last_lsn == LSN_IMPOSSIBLE).
*/
#define BUFFER_MAX_LSN(B) \
  ((B)->last_lsn == LSN_IMPOSSIBLE ? (B)->prev_last_lsn : (B)->last_lsn)

/* log write buffer descriptor */
struct st_translog_buffer
{
  /*
    Cache for the current log. Comes first so that it is aligned for
    bmove512() in pagecache_inject()
  */
  uchar buffer[TRANSLOG_WRITE_BUFFER];
  /*
    Maximum LSN of the records which end in this buffer (or LSN_IMPOSSIBLE
    if no LSN ends here)
  */
  LSN last_lsn;
  /*
    last_lsn of the previous buffer, or LSN_IMPOSSIBLE if this is the very
    first one
  */
  LSN prev_last_lsn;
  /* This buffer's offset in the file */
  TRANSLOG_ADDRESS offset;
  /*
    Next buffer's offset in the file (it is not always offset + size;
    in case of flush by LSN it can be offset + size - TRANSLOG_PAGE_SIZE)
  */
  TRANSLOG_ADDRESS next_buffer_offset;
  /* Previous buffer's offset, used to detect that its flush has finished */
  TRANSLOG_ADDRESS prev_buffer_offset;
  /*
    If the buffer was forced to close, the value of its horizon is saved
    here; otherwise LSN_IMPOSSIBLE
  */
  TRANSLOG_ADDRESS pre_force_close_horizon;
  /*
    How much is written (or will be written when copy_to_buffer_in_progress
    becomes 0) to this buffer
  */
  translog_size_t size;
  /*
    When moving from one log buffer to another, we write the last of the
    previous buffer to file and then start using the new log buffer. In the
    case of a partly filled last page, this page is not moved to the start
    of the new buffer; instead 'skipped_data' tells how much data at the
    beginning of the buffer is not relevant.
  */
  uint skipped_data;
  /* File handler for this buffer */
  TRANSLOG_FILE *file;
  /* Threads which are waiting for buffer filling/freeing */
  mysql_cond_t waiting_filling_buffer;
  /*
    Number of records which are being copied into the buffer.
    Controlled via translog_buffer_increase_writers() and
    translog_buffer_decrease_writers().

    1 Simple case: translog_force_current_buffer_to_finish: both are called
      in the same procedure.

    2 Simple case: translog_write_variable_record_1group:
      translog_advance_pointer() increases the writers of the buffer and
      translog_buffer_decrease_writers() decreases them.

    Usual case:
      1) translog_advance_pointer() (i.e. reserving place for future
         writing) increases the writers of all buffers where place is
         reserved.
         Simplest case: all space is reserved in one buffer.
         Complex case: end of the first buffer, all of the second buffer,
         beginning of the third buffer.
      2) When we finish writing, translog_chaser_page_next() is called and
         unlocks the buffer by decreasing the number of writers.
  */
  uint copy_to_buffer_in_progress;
  /* list of threads waiting for the buffer to become ready */
  struct st_my_thread_var *waiting_flush;
  /*
    If true then the previous buffer overlaps with this one (due to a flush
    of the loghandler, the last page of that buffer is the same as the first
    page of this buffer) and has to be written first (because it contains
    the old content of the page which is present in both buffers)
  */
  my_bool overlay;
  /*
    Number of this buffer (translog_check_cursor() expects it to be equal
    to the buffer_no of a cursor positioned on this buffer)
  */
  uint buffer_no;
  /*
    Lock for the buffer.

    The current buffer's lock also locks the whole handler (whoever wants to
    lock the handler should lock the current buffer).

    Buffers are locked only in one direction (with overflow, beginning from
    the first buffer). If we keep the lock on buffer N we can lock only
    buffer N+1 (never N-1).

    One thread does not lock more than 2 buffers at a time, so to get a
    dead lock N threads (where N is the number of buffers) would each have
    to take one buffer and try to lock the next one. That is impossible
    because there are only 2 cases when a thread takes 2 buffers:
      1) one thread finishes the current buffer (where the horizon is) and
         starts the next one (to which the horizon moves);
      2) flush starts from the buffer after the current (oldest) one and
         goes on till the current one, crabbing along the buffer sequence.
         There is only one flush at a time (they are serialised).

    Because of the above we can't get a dead lock: it is impossible to have
    all buffers locked simultaneously.
    NOTE(review): this comment used to say "number of buffers equal 5",
    while TRANSLOG_BUFFERS_NO is 8 in this file.
  */
  mysql_mutex_t mutex;
  /*
    Some thread is going to close the buffer and it should be
    done only by that thread
  */
  my_bool is_closing_buffer;
  /*
    Version of the buffer; increases every time the buffer is flushed.
    Together with file and offset it allows detecting buffer changes
  */
  uint8 ver;

  /*
    When the previous buffer is sent to disk it sets its address here to
    allow detecting when that is done
    (we have to keep it in this buffer to lock buffers only in one direction).
  */
  TRANSLOG_ADDRESS prev_sent_to_disk;
  mysql_cond_t prev_sent_to_disk_cond;
};


struct st_buffer_cursor
{
  TRUNSLOG_USED_BUFFERS buffs;
  /* pointer into the buffer */
  uchar *ptr;
  /* current buffer */
  struct st_translog_buffer *buffer;
  /* How many bytes we wrote on the current page */
  uint16 current_page_fill;
  /*
    How many times we wrote the page to disk during the flushing process
    (for sector protection).
  */
  uint16 write_counter;
  /* previous write offset */
  uint16 previous_offset;
  /* Number of current buffer */
  uint8 buffer_no;
  /*
    True if it is just filling the buffer after advancing the pointer to
    the horizon.
  */
  my_bool chaser;
  /*
    Is the current page of the cursor already finished (sector protection
    should be applied if it is needed)
  */
  my_bool protected;
};


/* One bit per log buffer, see dirty_buffer_mask in the descriptor below */
typedef uint8 dirty_buffer_mask_t;

struct st_translog_descriptor
{
  /* *** Parameters of the log handler *** */

  /* Page cache for the log reads */
  PAGECACHE *pagecache;
  uint flags;
  /* File open flags */
  uint open_flags;
  /* max size of one log file (for creation of new logs) */
  uint32 log_file_max_size;
  uint32 server_version;
  /* server ID (used for replication) */
  uint32 server_id;
  /* Loghandler's buffer capacity in case of chunk 2 filling */
  uint32 buffer_capacity_chunk_2;
  /*
    Half of the buffer capacity in case of chunk 2 filling,
    used to decide whether we write a record in one group or in many.
    It is stored in a variable just to avoid a division every
    time we need it.
  */
  uint32 half_buffer_capacity_chunk_2;
  /* Page overhead calculated by flags (whether CRC is enabled, etc) */
  uint16 page_overhead;
  /*
    Page capacity ("useful load") calculated by flags
    (TRANSLOG_PAGE_SIZE - page_overhead-1)
  */
  uint16 page_capacity_chunk_2;
  /* Path to the directory where we store the log files */
  char directory[FN_REFLEN];

  /* *** Current state of the log handler *** */
  /* list of opened files */
  DYNAMIC_ARRAY open_files;
  /* min/max number of file in the array */
  uint32 max_file, min_file;
  /* the opened files list guard */
  mysql_rwlock_t open_files_lock;

  /*
    File descriptor of the directory where we store log files, used for
    syncing it.
  */
  File directory_fd;
  /* buffers for log writing */
  struct st_translog_buffer buffers[TRANSLOG_BUFFERS_NO];
  /* Mask where 1 in position N means that buffer N is not flushed */
  dirty_buffer_mask_t dirty_buffer_mask;
  /* Protection of the above variable */
  mysql_mutex_t dirty_buffer_mask_lock;
  /*
     horizon - visible end of the log (here it is the absolute end of the
     log: the position where the next chunk can start)
  */
  TRANSLOG_ADDRESS horizon;
  /* horizon buffer cursor */
  struct st_buffer_cursor bc;
  /* maximum LSN of the current (not finished) file */
  LSN max_lsn;

  /*
    Last flushed LSN (protected by log_flush_lock).
    Pointers in the log are ordered like this:
    last_lsn_checked <= flushed <= sent_to_disk <= in_buffers_only <=
    max_lsn <= horizon
  */
  LSN flushed;
  /* Last LSN sent to the disk (but maybe not written yet) */
  LSN sent_to_disk;
  /* Horizon from which the log started after initialization */
  TRANSLOG_ADDRESS log_start;
  TRANSLOG_ADDRESS previous_flush_horizon;
  /* Everything after this address is not sent to disk yet */
  TRANSLOG_ADDRESS in_buffers_only;
  /* protection of sent_to_disk and in_buffers_only */
  mysql_mutex_t sent_to_disk_lock;
  /*
    Protects flushed (see above) and is used for flush serialization (will
    be removed in v1.5)
  */
  mysql_mutex_t log_flush_lock;
  mysql_cond_t log_flush_cond;
  mysql_cond_t new_goal_cond;

  /* Protects changing of headers of finished files (max_lsn) */
  mysql_mutex_t file_header_lock;

  /*
    Sorted array (with protection) of files where we started the writing
    process and so can't give the last LSN yet
  */
  mysql_mutex_t unfinished_files_lock;
  DYNAMIC_ARRAY unfinished_files;

  /*
    minimum number of a still needed file, calculated during the last
    translog_purge call
  */
  uint32 min_need_file;
  /* Purger data: minimum file in the log (or 0 if unknown) */
  uint32 min_file_number;
  /* Protects the purger from concurrent calls and protects its data */
  mysql_mutex_t purger_lock;
  /* last low water mark checked */
  LSN last_lsn_checked;
  /**
    Must be set to 0 under the loghandler lock every time a new LSN
    is generated.
  */
  my_bool is_everything_flushed;
  /* True when a flush pass is in progress */
  my_bool flush_in_progress;
  /* The flush number (used to distinguish two flushes that go one by one) */
  volatile int flush_no;
  /* Next flush pass variables */
  TRANSLOG_ADDRESS next_pass_max_lsn;
  pthread_t max_lsn_requester;
};

/* The single log handler instance of this module */
static struct st_translog_descriptor log_descriptor;

ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
ulong log_file_size= TRANSLOG_FILE_SIZE;
/* sync() of log files directory mode */
ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
ulong maria_group_commit_interval= 0;

/* Marker for end of log */
static uchar end_of_log= 0;
#define END_OF_LOG &end_of_log
/**
  Switch for "soft" sync (no real sync() but periodical sync by service
  thread)
*/
static volatile my_bool soft_sync= FALSE;
/**
  Switch for "hard" group commit mode
*/
static volatile my_bool hard_group_commit= FALSE;
/**
  Interval of file numbers which have to be sync()ed
*/
static uint32 soft_sync_min= 0;
static uint32 soft_sync_max= 0;
static uint32 soft_need_sync= 1;
/**
  stores the interval in microseconds
*/
static uint32 group_commit_wait= 0;

enum enum_translog_status translog_status= TRANSLOG_UNINITED;
ulonglong translog_syncs= 0; /* Number of sync()s */

/* time of last flush */
static ulonglong flush_start= 0;

/* chunk types */
#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk referred to as LSN (head or tail) */
#define TRANSLOG_CHUNK_FIXED (1 << 6) /* 1 (pseudo)fixed record (also LSN) */
#define TRANSLOG_CHUNK_NOHDR (2 << 6) /* 2 no head chunk (till page end) */
#define TRANSLOG_CHUNK_LNGTH (3 << 6) /* 3 chunk with chunk length */
#define TRANSLOG_CHUNK_TYPE (3 << 6) /* Mask to get chunk type */
#define TRANSLOG_REC_TYPE 0x3F /* Mask to get record type */
#define TRANSLOG_CHUNK_0_CONT 0x3F /* the type to mark chunk 0 continue */

/* compressed (relative) LSN constants */
#define TRANSLOG_CLSN_LEN_BITS 0xC0 /* Mask to get compressed LSN length */

/* an array that maps id of a MARIA_SHARE
to this MARIA_SHARE */
static MARIA_SHARE **id_to_share= NULL;

/* Forward declarations of functions whose definitions are not in this part */
static my_bool translog_page_validator(int res, PAGECACHE_IO_HOOK_ARGS *args);

static my_bool translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner);
static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected);
LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon);


/*
  Table of log record type descriptors, indexed by record type.
  It is filled in by translog_table_init().
*/
LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES];


#ifndef DBUG_OFF

/* Debug build: assert that the calling thread owns the given buffer's mutex */
#define translog_buffer_lock_assert_owner(B) \
  mysql_mutex_assert_owner(&(B)->mutex)
/*
  Debug build: assert that the calling thread owns the mutex of the buffer
  the horizon cursor (log_descriptor.bc) is positioned on
*/
#define translog_lock_assert_owner() \
  mysql_mutex_assert_owner(&log_descriptor.bc.buffer->mutex)
/* Function wrapper around translog_lock_assert_owner() (not static) */
void translog_lock_handler_assert_owner()
{
  translog_lock_assert_owner();
}

/*
  Last record type accepted by the most recent
  check_translog_description_table() call (debug builds only)
*/
static uint max_allowed_translog_type= 0;

/**
  @brief check the description table validity

  Exists only when DBUG_OFF is not defined.  Everything is checked with
  DBUG_ASSERT:
   - 0 < num < LOGREC_NUMBER_OF_TYPES - 1;
   - entry 0 has class LOGRECTYPE_NOT_ALLOWED and is the only such entry
     in [0, num];
   - every entry in [1, num] satisfies the per-class rules on fixed_length,
     read_header_len and compressed_LSN coded in the switch below;
   - every entry above num has class LOGRECTYPE_NOT_ALLOWED.
  As a side effect num is remembered in max_allowed_translog_type.

  @param num             how many records should be filled
*/
void check_translog_description_table(int num)
{
  int i;
  DBUG_ENTER("check_translog_description_table");
  DBUG_PRINT("enter", ("last record: %d", num));
  DBUG_ASSERT(num > 0);
  /* last is reserved for extending the table */
  DBUG_ASSERT(num < LOGREC_NUMBER_OF_TYPES - 1);
  DBUG_ASSERT(log_record_type_descriptor[0].rclass == LOGRECTYPE_NOT_ALLOWED);
  max_allowed_translog_type= num;

  for (i= 0; i <= num; i++)
  {
    DBUG_PRINT("info",
               ("record type: %d class: %d fixed: %u header: %u LSNs: %u "
                "name: %s",
                i, log_record_type_descriptor[i].rclass,
                (uint)log_record_type_descriptor[i].fixed_length,
                (uint)log_record_type_descriptor[i].read_header_len,
                (uint)log_record_type_descriptor[i].compressed_LSN,
                log_record_type_descriptor[i].name));
    switch (log_record_type_descriptor[i].rclass) {
    case LOGRECTYPE_NOT_ALLOWED:
      /* only the reserved entry 0 may be "not allowed" inside [0, num] */
      DBUG_ASSERT(i == 0);
      break;
    case LOGRECTYPE_VARIABLE_LENGTH:
      DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == 0);
      /*
        With 1 or 2 compressed LSNs the read header has to be at least as
        long as that many stored LSNs
      */
      DBUG_ASSERT((log_record_type_descriptor[i].compressed_LSN == 0) ||
                  ((log_record_type_descriptor[i].compressed_LSN == 1) &&
                   (log_record_type_descriptor[i].read_header_len >=
                    LSN_STORE_SIZE)) ||
                  ((log_record_type_descriptor[i].compressed_LSN == 2) &&
                   (log_record_type_descriptor[i].read_header_len >=
                    LSN_STORE_SIZE * 2)));
      break;
    case LOGRECTYPE_PSEUDOFIXEDLENGTH:
      /* the whole record is its header; it carries 1 or 2 compressed LSNs */
      DBUG_ASSERT(log_record_type_descriptor[i].fixed_length ==
                  log_record_type_descriptor[i].read_header_len);
      DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN > 0);
      DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN <= 2);
      break;
    case LOGRECTYPE_FIXEDLENGTH:
      /* the whole record is its header; no compressed LSNs */
      DBUG_ASSERT(log_record_type_descriptor[i].fixed_length ==
                  log_record_type_descriptor[i].read_header_len);
      DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN == 0);
      break;
    default:
      /* unknown record class */
      DBUG_ASSERT(0);
    }
  }
  /* everything after the last filled record has to be unused */
  for (i= num + 1; i < LOGREC_NUMBER_OF_TYPES; i++)
  {
    DBUG_ASSERT(log_record_type_descriptor[i].rclass ==
                LOGRECTYPE_NOT_ALLOWED);
  }
  DBUG_VOID_RETURN;
}
#else
/* Non-debug build: the ownership checks expand to empty blocks */
#define translog_buffer_lock_assert_owner(B) {}
#define translog_lock_assert_owner() {}
#endif

/*
  Templates of the record type descriptors; translog_table_init() copies
  them into log_record_type_descriptor[].
*/
static LOG_DESC INIT_LOGREC_RESERVED_FOR_CHUNKS23=
{LOGRECTYPE_NOT_ALLOWED, 0, 0, NULL, NULL, NULL, 0,
 "reserved", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL };

static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_HEAD=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
 write_hook_for_redo, NULL, 0,
 "redo_insert_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_TAIL=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
 write_hook_for_redo, NULL, 0,
 "redo_insert_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_NEW_ROW_HEAD=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
 write_hook_for_redo, NULL, 0,
 "redo_new_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_NEW_ROW_TAIL=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
 write_hook_for_redo, NULL, 0,
 "redo_new_row_tail", LOGREC_NOT_LAST_IN_GROUP,
NULL, NULL};

/*
  How to read the LOG_DESC initializers in this table.  The declaration of
  LOG_DESC is not visible here, so the member names below are inferred from
  the checks in check_translog_description_table() - TODO confirm against
  the declaration:
    1st value  record class (LOGRECTYPE_*), apparently member 'rclass'
    2nd value  apparently 'fixed_length' (0 for variable length records)
    3rd value  apparently 'read_header_len' (equal to the 2nd value for
               fixed and pseudo-fixed length records)
    5th value  a write_hook_* function or NULL
    7th value  apparently 'compressed_LSN' (0 or 1 in the entries below)
    8th value  the record name string
    9th value  LOGREC_NOT_LAST_IN_GROUP, LOGREC_LAST_IN_GROUP or
               LOGREC_IS_GROUP_ITSELF
*/

static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOBS=
{LOGRECTYPE_VARIABLE_LENGTH, 0, FILEID_STORE_SIZE, NULL,
 write_hook_for_redo, NULL, 0,
 "redo_insert_row_blobs", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_HEAD=
{LOGRECTYPE_FIXEDLENGTH,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
 NULL, write_hook_for_redo, NULL, 0,
 "redo_purge_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_TAIL=
{LOGRECTYPE_FIXEDLENGTH,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
 NULL, write_hook_for_redo, NULL, 0,
 "redo_purge_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_FREE_BLOCKS=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
 NULL, write_hook_for_redo, NULL, 0,
 "redo_free_blocks", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL=
{LOGRECTYPE_FIXEDLENGTH,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE,
 NULL, write_hook_for_redo, NULL, 0,
 "redo_free_head_or_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

/* not yet used; for when we have versioning */
static LOG_DESC INIT_LOGREC_REDO_DELETE_ROW=
{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0,
 "redo_delete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

/** @todo RECOVERY BUG unused, remove? */
static LOG_DESC INIT_LOGREC_REDO_UPDATE_ROW_HEAD=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0,
 "redo_update_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_INDEX=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0,
 "redo_index", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_INDEX_NEW_PAGE=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1,
 NULL, write_hook_for_redo, NULL, 0,
 "redo_index_new_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_INDEX_FREE_PAGE=
{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
 NULL, write_hook_for_redo, NULL, 0,
 "redo_index_free_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_UNDELETE_ROW=
{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0,
 "redo_undelete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_CLR_END=
{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE + FILEID_STORE_SIZE +
 CLR_TYPE_STORE_SIZE, NULL, write_hook_for_clr_end, NULL, 1,
 "clr_end", LOGREC_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_PURGE_END=
{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, NULL, NULL, 1,
 "purge_end", LOGREC_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_UNDO_ROW_INSERT=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
 NULL, write_hook_for_undo_row_insert, NULL, 1,
 "undo_row_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_UNDO_ROW_DELETE=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
 NULL, write_hook_for_undo_row_delete, NULL, 1,
 "undo_row_delete", LOGREC_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_UNDO_ROW_UPDATE=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
 NULL, write_hook_for_undo_row_update, NULL, 1,
 "undo_row_update", LOGREC_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE,
 NULL, write_hook_for_undo_key_insert, NULL, 1,
 "undo_key_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};

/* This will never be in the log, only in the clr */
static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE,
 NULL, write_hook_for_undo_key, NULL, 1,
 "undo_key_insert_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE,
 NULL, write_hook_for_undo_key_delete, NULL, 1,
 "undo_key_delete", LOGREC_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE,
 NULL, write_hook_for_undo_key_delete, NULL, 1,
 "undo_key_delete_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_PREPARE=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
 "prepare", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_PREPARE_WITH_UNDO_PURGE=
{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE, NULL, NULL, NULL, 1,
 "prepare_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_COMMIT=
{LOGRECTYPE_FIXEDLENGTH, 0, 0, NULL,
 write_hook_for_commit, NULL, 0, "commit", LOGREC_IS_GROUP_ITSELF, NULL,
 NULL};

static LOG_DESC INIT_LOGREC_COMMIT_WITH_UNDO_PURGE=
{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, write_hook_for_commit, NULL, 1,
 "commit_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_CHECKPOINT=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
 "checkpoint",
LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_CREATE_TABLE=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 1 + 2, NULL, NULL, NULL, 0,
"redo_create_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_RENAME_TABLE=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
 "redo_rename_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_DROP_TABLE=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
 "redo_drop_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_DELETE_ALL=
{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE,
 NULL, write_hook_for_redo_delete_all, NULL, 0,
 "redo_delete_all", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_REPAIR_TABLE=
{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + 8 + 8, FILEID_STORE_SIZE + 8 + 8,
 NULL, NULL, NULL, 0,
 "redo_repair_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_FILE_ID=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 2, NULL, write_hook_for_file_id, NULL, 0,
 "file_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_LONG_TRANSACTION_ID=
{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0,
 "long_transaction_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_INCOMPLETE_LOG=
{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE,
 NULL, NULL, NULL, 0,
 "incomplete_log", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_INCOMPLETE_GROUP=
{LOGRECTYPE_FIXEDLENGTH, 0, 0,
 NULL, NULL, NULL, 0,
 "incomplete_group", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_UNDO_BULK_INSERT=
{LOGRECTYPE_VARIABLE_LENGTH, 0,
 LSN_STORE_SIZE + FILEID_STORE_SIZE,
 NULL, write_hook_for_undo_bulk_insert, NULL, 1,
 "undo_bulk_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};

static LOG_DESC INIT_LOGREC_REDO_BITMAP_NEW_PAGE=
{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
 FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
 NULL, NULL, NULL, 0,
 "redo_create_bitmap", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_IMPORTED_TABLE=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
 "imported_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

static LOG_DESC INIT_LOGREC_DEBUG_INFO=
{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
 "info", LOGREC_IS_GROUP_ITSELF, NULL, NULL};

/*
  Flag set MY_WME | MY_NABP | MY_WAIT_IF_FULL.
  NOTE(review): presumably the flags passed when writing log files; the
  users of this constant are not in this part of the file - confirm.
*/
const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL;

/*
  Fill log_record_type_descriptor[] from the INIT_LOGREC_* templates.

  Each template is copied into the slot of its LOGREC_* record type.  After
  the copies, every slot from LOGREC_FIRST_FREE up to the end of the table
  gets the class LOGRECTYPE_NOT_ALLOWED and, unless DBUG_OFF is defined, the
  result is verified with check_translog_description_table().
*/
void translog_table_init()
{
  int i;
  log_record_type_descriptor[LOGREC_RESERVED_FOR_CHUNKS23]=
    INIT_LOGREC_RESERVED_FOR_CHUNKS23;
  log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_HEAD]=
    INIT_LOGREC_REDO_INSERT_ROW_HEAD;
  log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_TAIL]=
    INIT_LOGREC_REDO_INSERT_ROW_TAIL;
  log_record_type_descriptor[LOGREC_REDO_NEW_ROW_HEAD]=
    INIT_LOGREC_REDO_NEW_ROW_HEAD;
  log_record_type_descriptor[LOGREC_REDO_NEW_ROW_TAIL]=
    INIT_LOGREC_REDO_NEW_ROW_TAIL;
  log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_BLOBS]=
    INIT_LOGREC_REDO_INSERT_ROW_BLOBS;
  log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_HEAD]=
    INIT_LOGREC_REDO_PURGE_ROW_HEAD;
  log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_TAIL]=
    INIT_LOGREC_REDO_PURGE_ROW_TAIL;
  log_record_type_descriptor[LOGREC_REDO_FREE_BLOCKS]=
    INIT_LOGREC_REDO_FREE_BLOCKS;
  log_record_type_descriptor[LOGREC_REDO_FREE_HEAD_OR_TAIL]=
    INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL;
  log_record_type_descriptor[LOGREC_REDO_DELETE_ROW]=
    INIT_LOGREC_REDO_DELETE_ROW;
  log_record_type_descriptor[LOGREC_REDO_UPDATE_ROW_HEAD]=
    INIT_LOGREC_REDO_UPDATE_ROW_HEAD;
  log_record_type_descriptor[LOGREC_REDO_INDEX]=
    INIT_LOGREC_REDO_INDEX;
  log_record_type_descriptor[LOGREC_REDO_INDEX_NEW_PAGE]=
    INIT_LOGREC_REDO_INDEX_NEW_PAGE;
  log_record_type_descriptor[LOGREC_REDO_INDEX_FREE_PAGE]=
    INIT_LOGREC_REDO_INDEX_FREE_PAGE;
  log_record_type_descriptor[LOGREC_REDO_UNDELETE_ROW]=
    INIT_LOGREC_REDO_UNDELETE_ROW;
  log_record_type_descriptor[LOGREC_CLR_END]=
    INIT_LOGREC_CLR_END;
  log_record_type_descriptor[LOGREC_PURGE_END]=
    INIT_LOGREC_PURGE_END;
  log_record_type_descriptor[LOGREC_UNDO_ROW_INSERT]=
    INIT_LOGREC_UNDO_ROW_INSERT;
  log_record_type_descriptor[LOGREC_UNDO_ROW_DELETE]=
    INIT_LOGREC_UNDO_ROW_DELETE;
  log_record_type_descriptor[LOGREC_UNDO_ROW_UPDATE]=
    INIT_LOGREC_UNDO_ROW_UPDATE;
  log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT]=
    INIT_LOGREC_UNDO_KEY_INSERT;
  log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT_WITH_ROOT]=
    INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT;
  log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE]=
    INIT_LOGREC_UNDO_KEY_DELETE;
  log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE_WITH_ROOT]=
    INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT;
  log_record_type_descriptor[LOGREC_PREPARE]=
    INIT_LOGREC_PREPARE;
  log_record_type_descriptor[LOGREC_PREPARE_WITH_UNDO_PURGE]=
    INIT_LOGREC_PREPARE_WITH_UNDO_PURGE;
  log_record_type_descriptor[LOGREC_COMMIT]=
    INIT_LOGREC_COMMIT;
  log_record_type_descriptor[LOGREC_COMMIT_WITH_UNDO_PURGE]=
    INIT_LOGREC_COMMIT_WITH_UNDO_PURGE;
  log_record_type_descriptor[LOGREC_CHECKPOINT]=
    INIT_LOGREC_CHECKPOINT;
  log_record_type_descriptor[LOGREC_REDO_CREATE_TABLE]=
    INIT_LOGREC_REDO_CREATE_TABLE;
  log_record_type_descriptor[LOGREC_REDO_RENAME_TABLE]=
    INIT_LOGREC_REDO_RENAME_TABLE;
  log_record_type_descriptor[LOGREC_REDO_DROP_TABLE]=
    INIT_LOGREC_REDO_DROP_TABLE;
  log_record_type_descriptor[LOGREC_REDO_DELETE_ALL]=
    INIT_LOGREC_REDO_DELETE_ALL;
  log_record_type_descriptor[LOGREC_REDO_REPAIR_TABLE]=
    INIT_LOGREC_REDO_REPAIR_TABLE;
  log_record_type_descriptor[LOGREC_FILE_ID]=
    INIT_LOGREC_FILE_ID;
  log_record_type_descriptor[LOGREC_LONG_TRANSACTION_ID]=
    INIT_LOGREC_LONG_TRANSACTION_ID;
  log_record_type_descriptor[LOGREC_INCOMPLETE_LOG]=
    INIT_LOGREC_INCOMPLETE_LOG;
  log_record_type_descriptor[LOGREC_INCOMPLETE_GROUP]=
    INIT_LOGREC_INCOMPLETE_GROUP;
  log_record_type_descriptor[LOGREC_UNDO_BULK_INSERT]=
    INIT_LOGREC_UNDO_BULK_INSERT;
  log_record_type_descriptor[LOGREC_REDO_BITMAP_NEW_PAGE]=
    INIT_LOGREC_REDO_BITMAP_NEW_PAGE;
  log_record_type_descriptor[LOGREC_IMPORTED_TABLE]=
    INIT_LOGREC_IMPORTED_TABLE;
log_record_type_descriptor[LOGREC_DEBUG_INFO]= INIT_LOGREC_DEBUG_INFO; for (i= LOGREC_FIRST_FREE; i < LOGREC_NUMBER_OF_TYPES; i++) log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED; #ifndef DBUG_OFF check_translog_description_table(LOGREC_FIRST_FREE -1); #endif } /* all possible flags page overheads */ static uint page_overhead[TRANSLOG_FLAGS_NUM]; typedef struct st_translog_validator_data { TRANSLOG_ADDRESS *addr; my_bool was_recovered; } TRANSLOG_VALIDATOR_DATA; /* Check cursor/buffer consistence SYNOPSIS translog_check_cursor cursor cursor which will be checked */ static void translog_check_cursor(struct st_buffer_cursor *cursor __attribute__((unused))) { DBUG_ASSERT(cursor->chaser || ((ulong) (cursor->ptr - cursor->buffer->buffer) == cursor->buffer->size)); DBUG_ASSERT(cursor->buffer->buffer_no == cursor->buffer_no); DBUG_ASSERT((cursor->ptr -cursor->buffer->buffer) %TRANSLOG_PAGE_SIZE == cursor->current_page_fill % TRANSLOG_PAGE_SIZE); DBUG_ASSERT(cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); } /** @brief switch the loghandler in read only mode in case of write error */ void translog_stop_writing() { DBUG_ENTER("translog_stop_writing"); DBUG_PRINT("error", ("errno: %d my_errno: %d", errno, my_errno)); translog_status= (translog_status == TRANSLOG_SHUTDOWN ? 
TRANSLOG_UNINITED : TRANSLOG_READONLY); log_descriptor.is_everything_flushed= 1; log_descriptor.open_flags= O_BINARY | O_RDONLY; DBUG_ASSERT(0); DBUG_VOID_RETURN; } /* @brief Get file name of the log by log number @param file_no Number of the log we want to open @param path Pointer to buffer where file name will be stored (must be FN_REFLEN bytes at least) @return pointer to path */ char *translog_filename_by_fileno(uint32 file_no, char *path) { char buff[11], *end; uint length; DBUG_ENTER("translog_filename_by_fileno"); DBUG_ASSERT(file_no <= 0xfffffff); /* log_descriptor.directory is already formatted */ end= strxmov(path, log_descriptor.directory, "aria_log.0000000", NullS); length= (uint) (int10_to_str(file_no, buff, 10) - buff); strmov(end - length +1, buff); DBUG_PRINT("info", ("Path: '%s' path: %p", path, path)); DBUG_RETURN(path); } /** @brief Create log file with given number without cache @param file_no Number of the log we want to open retval -1 error retval # file descriptor number */ static File create_logfile_by_number_no_cache(uint32 file_no) { File file; char path[FN_REFLEN]; DBUG_ENTER("create_logfile_by_number_no_cache"); if (translog_status != TRANSLOG_OK) DBUG_RETURN(-1); /* TODO: add O_DIRECT to open flags (when buffer is aligned) */ if ((file= mysql_file_create(key_file_translog, translog_filename_by_fileno(file_no, path), 0, O_BINARY | O_RDWR | O_CLOEXEC, MYF(MY_WME))) < 0) { DBUG_PRINT("error", ("Error %d during creating file '%s'", errno, path)); translog_stop_writing(); DBUG_RETURN(-1); } if (sync_log_dir >= TRANSLOG_SYNC_DIR_NEWFILE && sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))) { DBUG_PRINT("error", ("Error %d during syncing directory '%s'", errno, log_descriptor.directory)); mysql_file_close(file, MYF(0)); translog_stop_writing(); DBUG_RETURN(-1); } DBUG_PRINT("info", ("File: '%s' handler: %d", path, file)); DBUG_RETURN(file); } /** @brief Open (not create) log file with given number without cache @param 
file_no Number of the log we want to open retval -1 error retval # file descriptor number */ static File open_logfile_by_number_no_cache(uint32 file_no) { File file; char path[FN_REFLEN]; DBUG_ENTER("open_logfile_by_number_no_cache"); /* TODO: add O_DIRECT to open flags (when buffer is aligned) */ /* TODO: use mysql_file_create() */ if ((file= mysql_file_open(key_file_translog, translog_filename_by_fileno(file_no, path), log_descriptor.open_flags | O_CLOEXEC, MYF(MY_WME))) < 0) { DBUG_PRINT("error", ("Error %d during opening file '%s'", errno, path)); DBUG_RETURN(-1); } DBUG_PRINT("info", ("File: '%s' handler: %d", path, file)); DBUG_RETURN(file); } /** @brief get file descriptor by given number using cache @param file_no Number of the log we want to open retval # file descriptor retval NULL file is not opened */ static TRANSLOG_FILE *get_logfile_by_number(uint32 file_no) { TRANSLOG_FILE *file; DBUG_ENTER("get_logfile_by_number"); mysql_rwlock_rdlock(&log_descriptor.open_files_lock); if (log_descriptor.max_file - file_no >= log_descriptor.open_files.elements) { DBUG_PRINT("info", ("File #%u is not opened", file_no)); mysql_rwlock_unlock(&log_descriptor.open_files_lock); DBUG_RETURN(NULL); } DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == log_descriptor.open_files.elements); DBUG_ASSERT(log_descriptor.max_file >= file_no); DBUG_ASSERT(log_descriptor.min_file <= file_no); file= *dynamic_element(&log_descriptor.open_files, log_descriptor.max_file - file_no, TRANSLOG_FILE **); mysql_rwlock_unlock(&log_descriptor.open_files_lock); DBUG_PRINT("info", ("File %p File no: %u, File handler: %d", file, file_no, (file ? 
file->handler.file : -1))); DBUG_ASSERT(!file || file->number == file_no); DBUG_RETURN(file); } /** @brief get current file descriptor retval # file descriptor */ static TRANSLOG_FILE *get_current_logfile() { TRANSLOG_FILE *file; DBUG_ENTER("get_current_logfile"); mysql_rwlock_rdlock(&log_descriptor.open_files_lock); DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu", (ulong) log_descriptor.max_file, (ulong) log_descriptor.min_file, (ulong) log_descriptor.open_files.elements)); DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == log_descriptor.open_files.elements); file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **); mysql_rwlock_unlock(&log_descriptor.open_files_lock); DBUG_RETURN(file); } uchar maria_trans_file_magic[]= { (uchar) 254, (uchar) 254, (uchar) 11, '\001', 'M', 'A', 'R', 'I', 'A', 'L', 'O', 'G' }; #define LOG_HEADER_DATA_SIZE (sizeof(maria_trans_file_magic) + \ 8 + 4 + 4 + 4 + 2 + 3 + \ LSN_STORE_SIZE) /* Write log file page header in the just opened new log file SYNOPSIS translog_write_file_header(); NOTES First page is just a marker page; We don't store any real log data in it. 
RETURN 0 OK 1 ERROR */ static my_bool translog_write_file_header() { TRANSLOG_FILE *file; ulonglong timestamp; uchar page_buff[TRANSLOG_PAGE_SIZE], *page= page_buff; my_bool rc; DBUG_ENTER("translog_write_file_header"); /* file tag */ memcpy(page, maria_trans_file_magic, sizeof(maria_trans_file_magic)); page+= sizeof(maria_trans_file_magic); /* timestamp */ timestamp= my_hrtime().val; int8store(page, timestamp); page+= 8; /* maria version */ int4store(page, TRANSLOG_VERSION_ID); page+= 4; /* mysql version (MYSQL_VERSION_ID) */ int4store(page, log_descriptor.server_version); page+= 4; /* server ID */ int4store(page, log_descriptor.server_id); page+= 4; /* loghandler page_size */ int2store(page, TRANSLOG_PAGE_SIZE - 1); page+= 2; /* file number */ int3store(page, LSN_FILE_NO(log_descriptor.horizon)); page+= 3; lsn_store(page, LSN_IMPOSSIBLE); page+= LSN_STORE_SIZE; memset(page, TRANSLOG_FILLER, sizeof(page_buff) - (page- page_buff)); file= get_current_logfile(); rc= my_pwrite(file->handler.file, page_buff, sizeof(page_buff), 0, log_write_flags) != 0; /* Dropping the flag in such way can make false alarm: signalling than the file in not sync when it is sync, but the situation is quite rare and protections with mutexes give much more overhead to the whole engine */ file->is_sync= 0; DBUG_RETURN(rc); } /* @brief write the new LSN on the given file header @param file The file descriptor @param lsn That LSN which should be written @retval 0 OK @retval 1 Error */ static my_bool translog_max_lsn_to_header(File file, LSN lsn) { uchar lsn_buff[LSN_STORE_SIZE]; my_bool rc; DBUG_ENTER("translog_max_lsn_to_header"); DBUG_PRINT("enter", ("File descriptor: %ld " "lsn: " LSN_FMT, (long) file, LSN_IN_PARTS(lsn))); lsn_store(lsn_buff, lsn); rc= (my_pwrite(file, lsn_buff, LSN_STORE_SIZE, (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE), log_write_flags) != 0 || mysql_file_sync(file, MYF(MY_WME)) != 0); /* We should not increase counter in case of error above, but it is so unlikely that we can 
ignore this case */ translog_syncs++; DBUG_RETURN(rc); } /* @brief Extract hander file information from loghandler file page @param desc header information descriptor to be filled with information @param page_buff buffer with the page content */ void translog_interpret_file_header(LOGHANDLER_FILE_INFO *desc, uchar *page_buff) { uchar *ptr; ptr= page_buff + sizeof(maria_trans_file_magic); desc->timestamp= uint8korr(ptr); ptr+= 8; desc->maria_version= uint4korr(ptr); ptr+= 4; desc->mysql_version= uint4korr(ptr); ptr+= 4; desc->server_id= uint4korr(ptr); ptr+= 4; desc->page_size= uint2korr(ptr) + 1; ptr+= 2; desc->file_number= uint3korr(ptr); ptr+= 3; desc->max_lsn= lsn_korr(ptr); } /* @brief Read hander file information from loghandler file @param desc header information descriptor to be filled with information @param file file descriptor to read @retval 0 OK @retval 1 Error */ my_bool translog_read_file_header(LOGHANDLER_FILE_INFO *desc, File file) { uchar page_buff[LOG_HEADER_DATA_SIZE]; DBUG_ENTER("translog_read_file_header"); if (mysql_file_pread(file, page_buff, sizeof(page_buff), 0, MYF(MY_FNABP | MY_WME))) { DBUG_PRINT("info", ("log read fail error: %d", my_errno)); DBUG_RETURN(1); } translog_interpret_file_header(desc, page_buff); DBUG_PRINT("info", ("timestamp: %llu aria ver: %lu mysql ver: %lu " "server id %lu page size %lu file number %lu " "max lsn: " LSN_FMT, (ulonglong) desc->timestamp, (ulong) desc->maria_version, (ulong) desc->mysql_version, (ulong) desc->server_id, desc->page_size, (ulong) desc->file_number, LSN_IN_PARTS(desc->max_lsn))); DBUG_RETURN(0); } /* @brief set the lsn to the files from_file - to_file if it is greater then written in the file @param from_file first file number (min) @param to_file last file number (max) @param lsn the lsn for writing @param is_locked true if current thread locked the log handler @retval 0 OK @retval 1 Error */ static my_bool translog_set_lsn_for_files(uint32 from_file, uint32 to_file, LSN lsn, my_bool 
is_locked) { uint32 file; DBUG_ENTER("translog_set_lsn_for_files"); DBUG_PRINT("enter", ("From: %lu to: %lu lsn: " LSN_FMT " locked: %d", (ulong) from_file, (ulong) to_file, LSN_IN_PARTS(lsn), is_locked)); DBUG_ASSERT(from_file <= to_file); DBUG_ASSERT(from_file > 0); /* we have not file 0 */ /* Checks the current file (not finished yet file) */ if (!is_locked) translog_lock(); if (to_file == (uint32) LSN_FILE_NO(log_descriptor.horizon)) { if (likely(cmp_translog_addr(lsn, log_descriptor.max_lsn) > 0)) log_descriptor.max_lsn= lsn; to_file--; } if (!is_locked) translog_unlock(); /* Checks finished files if they are */ mysql_mutex_lock(&log_descriptor.file_header_lock); for (file= from_file; file <= to_file; file++) { LOGHANDLER_FILE_INFO info; File fd; fd= open_logfile_by_number_no_cache(file); if ((fd < 0) || ((translog_read_file_header(&info, fd) || (cmp_translog_addr(lsn, info.max_lsn) > 0 && translog_max_lsn_to_header(fd, lsn))) | mysql_file_close(fd, MYF(MY_WME)))) { translog_stop_writing(); mysql_mutex_unlock(&log_descriptor.file_header_lock); DBUG_RETURN(1); } } mysql_mutex_unlock(&log_descriptor.file_header_lock); DBUG_RETURN(0); } /* descriptor of file in unfinished_files */ struct st_file_counter { uint32 file; /* file number */ uint32 counter; /* counter for started writes */ }; /* @brief mark file "in progress" (for multi-group records) @param file log file number */ static void translog_mark_file_unfinished(uint32 file) { ssize_t place, i; struct st_file_counter fc, *fc_ptr; DBUG_ENTER("translog_mark_file_unfinished"); DBUG_PRINT("enter", ("file: %lu", (ulong) file)); fc.file= file; fc.counter= 1; mysql_mutex_lock(&log_descriptor.unfinished_files_lock); if (log_descriptor.unfinished_files.elements == 0) { insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc); DBUG_PRINT("info", ("The first element inserted")); goto end; } for (place= log_descriptor.unfinished_files.elements - 1; place >= 0; place--) { fc_ptr= 
dynamic_element(&log_descriptor.unfinished_files, place, struct st_file_counter *); if (fc_ptr->file <= file) break; } if (place >= 0 && fc_ptr->file == file) { fc_ptr->counter++; DBUG_PRINT("info", ("counter increased")); goto end; } if (place == (ssize_t)log_descriptor.unfinished_files.elements) { insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc); DBUG_PRINT("info", ("The last element inserted")); goto end; } /* shift and assign new element */ insert_dynamic(&log_descriptor.unfinished_files, (uchar*) dynamic_element(&log_descriptor.unfinished_files, log_descriptor.unfinished_files.elements- 1, struct st_file_counter *)); for(i= log_descriptor.unfinished_files.elements - 1; i > place; i--) { /* we do not use set_dynamic() to avoid unneeded checks */ memcpy(dynamic_element(&log_descriptor.unfinished_files, i, struct st_file_counter *), dynamic_element(&log_descriptor.unfinished_files, i + 1, struct st_file_counter *), sizeof(struct st_file_counter)); } memcpy(dynamic_element(&log_descriptor.unfinished_files, place + 1, struct st_file_counter *), &fc, sizeof(struct st_file_counter)); end: mysql_mutex_unlock(&log_descriptor.unfinished_files_lock); DBUG_VOID_RETURN; } /* @brief remove file mark "in progress" (for multi-group records) @param file log file number */ static void translog_mark_file_finished(uint32 file) { int i; struct st_file_counter *UNINIT_VAR(fc_ptr); DBUG_ENTER("translog_mark_file_finished"); DBUG_PRINT("enter", ("file: %lu", (ulong) file)); mysql_mutex_lock(&log_descriptor.unfinished_files_lock); DBUG_ASSERT(log_descriptor.unfinished_files.elements > 0); for (i= 0; i < (int) log_descriptor.unfinished_files.elements; i++) { fc_ptr= dynamic_element(&log_descriptor.unfinished_files, i, struct st_file_counter *); if (fc_ptr->file == file) { break; } } DBUG_ASSERT(i < (int) log_descriptor.unfinished_files.elements); if (! 
--fc_ptr->counter) delete_dynamic_element(&log_descriptor.unfinished_files, i); mysql_mutex_unlock(&log_descriptor.unfinished_files_lock); DBUG_VOID_RETURN; } /* @brief get max LSN of the record which parts stored in this file @param file file number @return requested LSN or LSN_IMPOSSIBLE/LSN_ERROR @retval LSN_IMPOSSIBLE File is still not finished @retval LSN_ERROR Error opening file @retval # LSN of the record which parts stored in this file */ LSN translog_get_file_max_lsn_stored(uint32 file) { uint32 limit= FILENO_IMPOSSIBLE; DBUG_ENTER("translog_get_file_max_lsn_stored"); DBUG_PRINT("enter", ("file: %lu", (ulong)file)); DBUG_ASSERT(translog_status == TRANSLOG_OK || translog_status == TRANSLOG_READONLY); mysql_mutex_lock(&log_descriptor.unfinished_files_lock); /* find file with minimum file number "in progress" */ if (log_descriptor.unfinished_files.elements > 0) { struct st_file_counter *fc_ptr; fc_ptr= dynamic_element(&log_descriptor.unfinished_files, 0, struct st_file_counter *); limit= fc_ptr->file; /* minimal file number "in progress" */ } mysql_mutex_unlock(&log_descriptor.unfinished_files_lock); /* if there is no "in progress file" then unfinished file is in progress for sure */ if (limit == FILENO_IMPOSSIBLE) { TRANSLOG_ADDRESS horizon= translog_get_horizon(); limit= LSN_FILE_NO(horizon); } if (file >= limit) { DBUG_PRINT("info", ("The file in in progress")); DBUG_RETURN(LSN_IMPOSSIBLE); } { LOGHANDLER_FILE_INFO info; File fd; fd= open_logfile_by_number_no_cache(file); if(fd < 0) { DBUG_PRINT("error", ("Can't open file")); DBUG_RETURN(LSN_ERROR); } if (translog_read_file_header(&info, fd)) { DBUG_PRINT("error", ("Can't read file header")); info.max_lsn= LSN_ERROR; } if (mysql_file_close(fd, MYF(MY_WME))) { DBUG_PRINT("error", ("Can't close file")); info.max_lsn= LSN_ERROR; } DBUG_PRINT("info", ("Max lsn: " LSN_FMT, LSN_IN_PARTS(info.max_lsn))); DBUG_RETURN(info.max_lsn); } } /* Initialize transaction log file buffer SYNOPSIS translog_buffer_init() 
buffer The buffer to initialize num Number of this buffer RETURN 0 OK 1 Error */ static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num) { DBUG_ENTER("translog_buffer_init"); buffer->pre_force_close_horizon= buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE; DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: %p", buffer)); buffer->buffer_no= (uint8) num; /* This Buffer File */ buffer->file= NULL; buffer->overlay= 0; /* cache for current log */ memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER); /* Buffer size */ buffer->size= 0; buffer->skipped_data= 0; /* cond of thread which is waiting for buffer filling */ if (mysql_cond_init(key_TRANSLOG_BUFFER_waiting_filling_buffer, &buffer->waiting_filling_buffer, 0)) DBUG_RETURN(1); /* Number of records which are in copy progress */ buffer->copy_to_buffer_in_progress= 0; /* list of waiting buffer ready threads */ buffer->waiting_flush= 0; /* Buffers locked by the following mutex. As far as buffers create logical circle (after last buffer goes first) it trigger false alarm of deadlock detect system, so we remove check of deadlock for this buffers. Indeed all mutex locks concentrated around current buffer except flushing thread (but it is only one thread). One thread can't take more then 2 buffer locks at once. So deadlock is impossible here. To prevent false alarm of dead lock detection we switch dead lock detection for one buffer in the middle of the buffers chain. Excluding only one of eight buffers from deadlock detection hardly can hide other possible problems which include this mutexes. 
*/ if (mysql_mutex_init(key_TRANSLOG_BUFFER_mutex, &buffer->mutex, MY_MUTEX_INIT_FAST) || mysql_cond_init(key_TRANSLOG_BUFFER_prev_sent_to_disk_cond, &buffer->prev_sent_to_disk_cond, 0)) DBUG_RETURN(1); mysql_mutex_setflags(&buffer->mutex, MYF_NO_DEADLOCK_DETECTION); buffer->is_closing_buffer= 0; buffer->prev_sent_to_disk= LSN_IMPOSSIBLE; buffer->prev_buffer_offset= LSN_IMPOSSIBLE; buffer->ver= 0; DBUG_RETURN(0); } /* @brief close transaction log file by descriptor @param file pagegecache file descriptor reference @return Operation status @retval 0 OK @retval 1 Error */ static my_bool translog_close_log_file(TRANSLOG_FILE *file) { int rc= 0; flush_pagecache_blocks(log_descriptor.pagecache, &file->handler, FLUSH_RELEASE); /* Sync file when we close it TODO: sync only we have changed the log */ if (!file->is_sync) { rc= mysql_file_sync(file->handler.file, MYF(MY_WME)); translog_syncs++; } rc|= mysql_file_close(file->handler.file, MYF(MY_WME)); my_free(file); return MY_TEST(rc); } /** @brief Initializes TRANSLOG_FILE structure @param file reference on the file to initialize @param number file number @param is_sync is file synced on disk */ static void translog_file_init(TRANSLOG_FILE *file, uint32 number, my_bool is_sync) { pagecache_file_set_null_hooks(&file->handler); file->handler.post_read_hook= translog_page_validator; file->handler.flush_log_callback= maria_flush_log_for_page_none; file->handler.callback_data= (uchar*)file; file->number= number; file->was_recovered= 0; file->is_sync= is_sync; } /** @brief Create and fill header of new file. 
@note the caller must call it right after it has increased log_descriptor.horizon to the new file (log_descriptor.horizon+= LSN_ONE_FILE) @retval 0 OK @retval 1 Error */ static my_bool translog_create_new_file() { TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_FILE), MYF(0)); TRANSLOG_FILE *old= get_current_logfile(); uint32 file_no= LSN_FILE_NO(log_descriptor.horizon); DBUG_ENTER("translog_create_new_file"); if (file == NULL) goto error; /* Writes max_lsn to the file header before finishing it (there is no need to lock file header buffer because it is still unfinished file, so only one thread can finish the file and nobody interested of LSN of current (unfinished) file, because no one can purge it). */ if (translog_max_lsn_to_header(old->handler.file, log_descriptor.max_lsn)) goto error; mysql_rwlock_wrlock(&log_descriptor.open_files_lock); DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == log_descriptor.open_files.elements); DBUG_ASSERT(file_no == log_descriptor.max_file + 1); if (allocate_dynamic(&log_descriptor.open_files, log_descriptor.max_file - log_descriptor.min_file + 2)) goto error_lock; /* this call just expand the array */ if (insert_dynamic(&log_descriptor.open_files, (uchar*)&file)) goto error_lock; if ((file->handler.file= create_logfile_by_number_no_cache(file_no)) == -1) goto error_lock; translog_file_init(file, file_no, 0); log_descriptor.max_file++; { char *start= (char*) dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE**); memmove(start + sizeof(TRANSLOG_FILE*), start, sizeof(TRANSLOG_FILE*) * (log_descriptor.max_file - log_descriptor.min_file + 1 - 1)); } /* can't fail we because we expanded array */ set_dynamic(&log_descriptor.open_files, (uchar*)&file, 0); DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == log_descriptor.open_files.elements); mysql_rwlock_unlock(&log_descriptor.open_files_lock); DBUG_PRINT("info", ("file_no: %lu", (ulong)file_no)); if 
(translog_write_file_header()) goto error; if (ma_control_file_write_and_force(last_checkpoint_lsn, file_no, max_trid_in_control_file, recovery_failures)) goto error; DBUG_RETURN(0); error_lock: mysql_rwlock_unlock(&log_descriptor.open_files_lock); error: translog_stop_writing(); my_free(file); DBUG_RETURN(1); } /** @brief Locks the loghandler buffer. @param buffer This buffer which should be locked @note See comment before buffer 'mutex' variable. @retval 0 OK @retval 1 Error */ static void translog_buffer_lock(struct st_translog_buffer *buffer) { DBUG_ENTER("translog_buffer_lock"); DBUG_PRINT("enter", ("Lock buffer #%u: %p", buffer->buffer_no, buffer)); mysql_mutex_lock(&buffer->mutex); DBUG_VOID_RETURN; } /* Unlock the loghandler buffer SYNOPSIS translog_buffer_unlock() buffer This buffer which should be unlocked */ static void translog_buffer_unlock(struct st_translog_buffer *buffer) { DBUG_ENTER("translog_buffer_unlock"); DBUG_PRINT("enter", ("Unlock buffer... #%u (%p)", (uint) buffer->buffer_no, buffer)); mysql_mutex_unlock(&buffer->mutex); DBUG_VOID_RETURN; } /* Write a header on the page SYNOPSIS translog_new_page_header() horizon Where to write the page cursor Where to write the page NOTE - space for page header should be checked before */ static uchar translog_sector_random; static void translog_new_page_header(TRANSLOG_ADDRESS *horizon, struct st_buffer_cursor *cursor) { uchar *ptr; DBUG_ENTER("translog_new_page_header"); DBUG_ASSERT(cursor->ptr); cursor->protected= 0; ptr= cursor->ptr; /* Page number */ int3store(ptr, LSN_OFFSET(*horizon) / TRANSLOG_PAGE_SIZE); ptr+= 3; /* File number */ int3store(ptr, LSN_FILE_NO(*horizon)); ptr+= 3; DBUG_ASSERT(TRANSLOG_PAGE_FLAGS == (ptr - cursor->ptr)); cursor->ptr[TRANSLOG_PAGE_FLAGS]= (uchar) log_descriptor.flags; ptr++; if (log_descriptor.flags & TRANSLOG_PAGE_CRC) { #ifndef DBUG_OFF DBUG_PRINT("info", ("write 0x11223344 CRC to " LSN_FMT, LSN_IN_PARTS(*horizon))); /* This will be overwritten by real CRC; This is 
just for debugging */ int4store(ptr, 0x11223344); #endif /* CRC will be put when page is finished */ ptr+= CRC_SIZE; } if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION) { /* translog_sector_randmo works like "random" values producer because it is enough to have such "random" for this purpose and it will not interfere with higher level pseudo random value generator */ ptr[0]= translog_sector_random++; ptr+= TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; } { size_t len= (ptr - cursor->ptr); (*horizon)+= len; /* increasing the offset part of the address */ cursor->current_page_fill= (uint16)len; if (!cursor->chaser) cursor->buffer->size+= (translog_size_t)len; } cursor->ptr= ptr; DBUG_PRINT("info", ("NewP buffer #%u: %p chaser: %d Size: %lu (%lu) " "Horizon: " LSN_FMT, (uint) cursor->buffer->buffer_no, cursor->buffer, cursor->chaser, (ulong) cursor->buffer->size, (ulong) (cursor->ptr - cursor->buffer->buffer), LSN_IN_PARTS(*horizon))); translog_check_cursor(cursor); DBUG_VOID_RETURN; } /* Put sector protection on the page image SYNOPSIS translog_put_sector_protection() page reference on the page content cursor cursor of the buffer NOTES We put a sector protection on all following sectors on the page, except the first sector that is protected by page header. */ static void translog_put_sector_protection(uchar *page, struct st_buffer_cursor *cursor) { uchar *table= page + log_descriptor.page_overhead - TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; uint i, offset; uint16 last_protected_sector= ((cursor->previous_offset - 1) / DISK_DRIVE_SECTOR_SIZE); uint16 start_sector= cursor->previous_offset / DISK_DRIVE_SECTOR_SIZE; uint8 value= table[0] + cursor->write_counter; DBUG_ENTER("translog_put_sector_protection"); if (start_sector == 0) { /* First sector is protected by file & page numbers in the page header. 
*/ start_sector= 1; } DBUG_PRINT("enter", ("Write counter:%u value:%u offset:%u, " "last protected:%u start sector:%u", (uint) cursor->write_counter, (uint) value, (uint) cursor->previous_offset, (uint) last_protected_sector, (uint) start_sector)); if (last_protected_sector == start_sector) { i= last_protected_sector; offset= last_protected_sector * DISK_DRIVE_SECTOR_SIZE; /* restore data, because we modified sector which was protected */ if (offset < cursor->previous_offset) page[offset]= table[i]; } for (i= start_sector, offset= start_sector * DISK_DRIVE_SECTOR_SIZE; i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; i++, (offset+= DISK_DRIVE_SECTOR_SIZE)) { DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x", i, offset, (uint) page[offset])); table[i]= page[offset]; page[offset]= value; DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x", i, offset, (uint) page[offset])); } DBUG_VOID_RETURN; } /* Calculate CRC32 of given area SYNOPSIS translog_crc() area Pointer of the area beginning length The Area length RETURN CRC32 */ static uint32 translog_crc(uchar *area, uint length) { DBUG_ENTER("translog_crc"); DBUG_RETURN(my_checksum(0L, area, length)); } /* Finish current page with zeros SYNOPSIS translog_finish_page() horizon \ horizon & buffer pointers cursor / */ static void translog_finish_page(TRANSLOG_ADDRESS *horizon, struct st_buffer_cursor *cursor) { uint16 left= TRANSLOG_PAGE_SIZE - cursor->current_page_fill; uchar *page= cursor->ptr - cursor->current_page_fill; DBUG_ENTER("translog_finish_page"); DBUG_PRINT("enter", ("Buffer: #%u %p " "Buffer addr: " LSN_FMT " " "Page addr: " LSN_FMT " " "size:%u (%u) Pg:%u left:%u", (uint) cursor->buffer_no, cursor->buffer, LSN_IN_PARTS(cursor->buffer->offset), (uint)LSN_FILE_NO(*horizon), (uint)(LSN_OFFSET(*horizon) - cursor->current_page_fill), (uint) cursor->buffer->size, (uint) (cursor->ptr -cursor->buffer->buffer), (uint) cursor->current_page_fill, (uint) left)); DBUG_ASSERT(LSN_FILE_NO(*horizon) == 
LSN_FILE_NO(cursor->buffer->offset) || translog_status == TRANSLOG_UNINITED); if ((LSN_FILE_NO(*horizon) != LSN_FILE_NO(cursor->buffer->offset))) DBUG_VOID_RETURN; // everything wrong do not write to awoid more problems translog_check_cursor(cursor); if (cursor->protected) { DBUG_PRINT("info", ("Already protected and finished")); DBUG_VOID_RETURN; } cursor->protected= 1; DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE); if (left != 0) { DBUG_PRINT("info", ("left: %u", (uint) left)); memset(cursor->ptr, TRANSLOG_FILLER, left); cursor->ptr+= left; (*horizon)+= left; /* offset increasing */ if (!cursor->chaser) cursor->buffer->size+= left; /* We are finishing the page so reset the counter */ cursor->current_page_fill= 0; DBUG_PRINT("info", ("Finish Page buffer #%u: %p " "chaser: %d Size: %lu (%lu)", (uint) cursor->buffer->buffer_no, cursor->buffer, cursor->chaser, (ulong) cursor->buffer->size, (ulong) (cursor->ptr - cursor->buffer->buffer))); translog_check_cursor(cursor); } /* When we are finishing the page other thread might not finish the page header yet (in case if we started from the middle of the page) so we have to read log_descriptor.flags but not the flags from the page. */ if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION) { translog_put_sector_protection(page, cursor); DBUG_PRINT("info", ("drop write_counter")); cursor->write_counter= 0; cursor->previous_offset= 0; } if (log_descriptor.flags & TRANSLOG_PAGE_CRC) { uint32 crc= translog_crc(page + log_descriptor.page_overhead, TRANSLOG_PAGE_SIZE - log_descriptor.page_overhead); DBUG_PRINT("info", ("CRC: %lx", (ulong) crc)); /* We have page number, file number and flag before crc */ int4store(page + 3 + 3 + 1, crc); } DBUG_VOID_RETURN; } /* @brief Wait until all threads have finished closing this buffer. 
@param buffer          This buffer should be checked

  @note The buffer mutex must be held by the caller (asserted); it is
  released while waiting on the condition and re-acquired before return.
*/

static void translog_wait_for_closing(struct st_translog_buffer *buffer)
{
  DBUG_ENTER("translog_wait_for_closing");
  DBUG_PRINT("enter", ("Buffer #%u %p copies in progress: %u "
                       "is closing %u File: %d size: %lu",
                       (uint) buffer->buffer_no, buffer,
                       (uint) buffer->copy_to_buffer_in_progress,
                       (uint) buffer->is_closing_buffer,
                       (buffer->file ? buffer->file->handler.file : -1),
                       (ulong) buffer->size));
  translog_buffer_lock_assert_owner(buffer);

  /* Loop: re-check the flag after every wake-up */
  while (buffer->is_closing_buffer)
  {
    DBUG_PRINT("info", ("wait for writers... buffer: #%u %p",
                        (uint) buffer->buffer_no, buffer));
    DBUG_ASSERT(buffer->file != NULL);
    mysql_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex);
    DBUG_PRINT("info", ("wait for writers done buffer: #%u %p",
                        (uint) buffer->buffer_no, buffer));
  }

  DBUG_VOID_RETURN;
}


/*
  @brief Wait until all threads have finished filling this buffer.

  @param buffer          This buffer should be checked

  @note The buffer mutex must be held by the caller (asserted). Waits
  until buffer->copy_to_buffer_in_progress drops to zero.
*/

static void translog_wait_for_writers(struct st_translog_buffer *buffer)
{
  DBUG_ENTER("translog_wait_for_writers");
  DBUG_PRINT("enter", ("Buffer #%u %p copies in progress: %u "
                       "is closing %u File: %d size: %lu",
                       (uint) buffer->buffer_no, buffer,
                       (uint) buffer->copy_to_buffer_in_progress,
                       (uint) buffer->is_closing_buffer,
                       (buffer->file ? buffer->file->handler.file : -1),
                       (ulong) buffer->size));
  translog_buffer_lock_assert_owner(buffer);

  /* Loop: re-check the counter after every wake-up */
  while (buffer->copy_to_buffer_in_progress)
  {
    DBUG_PRINT("info", ("wait for writers... buffer: #%u %p",
                        (uint) buffer->buffer_no, buffer));
    DBUG_ASSERT(buffer->file != NULL);
    mysql_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex);
    DBUG_PRINT("info", ("wait for writers done buffer: #%u %p",
                        (uint) buffer->buffer_no, buffer));
  }

  DBUG_VOID_RETURN;
}


/*

  Wait for buffer to become free

  SYNOPSIS
    translog_wait_for_buffer_free()
    buffer               The buffer we are waiting for

  NOTE
    - this buffer should be locked
    - "free" here means buffer->file == NULL
*/

static void translog_wait_for_buffer_free(struct st_translog_buffer *buffer)
{
  /* Snapshot of the buffer identity, to detect reuse while we waited */
  TRANSLOG_ADDRESS offset= buffer->offset;
  TRANSLOG_FILE *file= buffer->file;
  uint8 ver= buffer->ver;
  DBUG_ENTER("translog_wait_for_buffer_free");
  DBUG_PRINT("enter", ("Buffer #%u %p copies in progress: %u "
                       "is closing %u File: %d size: %lu",
                       (uint) buffer->buffer_no, buffer,
                       (uint) buffer->copy_to_buffer_in_progress,
                       (uint) buffer->is_closing_buffer,
                       (buffer->file ? buffer->file->handler.file : -1),
                       (ulong) buffer->size));

  translog_wait_for_writers(buffer);

  /* The wait above released the mutex, so the buffer may have changed */
  if (offset != buffer->offset || file != buffer->file || ver != buffer->ver)
    DBUG_VOID_RETURN; /* the buffer is already freed */

  while (buffer->file != NULL)
  {
    DBUG_PRINT("info", ("wait for writers... buffer: #%u %p",
                        (uint) buffer->buffer_no, buffer));
    mysql_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex);
    DBUG_PRINT("info", ("wait for writers done. buffer: #%u %p",
                        (uint) buffer->buffer_no, buffer));
  }
  DBUG_ASSERT(buffer->copy_to_buffer_in_progress == 0);
  DBUG_VOID_RETURN;
}


/*
  Initialize the cursor for a buffer

  SYNOPSIS
    translog_cursor_init()
    cursor               The cursor to initialize
    buffer               The buffer the cursor will point to
    buffer_no            Number of buffer
*/

static void translog_cursor_init(struct st_buffer_cursor *cursor,
                                 struct st_translog_buffer *buffer,
                                 uint8 buffer_no)
{
  DBUG_ENTER("translog_cursor_init");
  cursor->ptr= buffer->buffer;
  cursor->buffer= buffer;
  cursor->buffer_no= buffer_no;
  cursor->current_page_fill= 0;
  /* Every cursor except the loghandler's own one (log_descriptor.bc) chases */
  cursor->chaser= (cursor != &log_descriptor.bc);
  cursor->write_counter= 0;
  cursor->previous_offset= 0;
  cursor->protected= 0;
  DBUG_VOID_RETURN;
}


/*
  @brief Initialize buffer for the current file, and a cursor for this buffer.

  The buffer is attached to the current log file at log_descriptor.horizon
  and its bit is set in log_descriptor.dirty_buffer_mask.

  @param buffer          The buffer
  @param cursor          It's cursor
  @param buffer_no       Number of buffer
*/

static void translog_start_buffer(struct st_translog_buffer *buffer,
                                  struct st_buffer_cursor *cursor,
                                  uint buffer_no)
{
  DBUG_ENTER("translog_start_buffer");
  DBUG_PRINT("enter",
             ("Assign buffer: #%u (%p) offset: 0x%x(%u)",
              (uint) buffer->buffer_no, buffer,
              (uint) LSN_OFFSET(log_descriptor.horizon),
              (uint) LSN_OFFSET(log_descriptor.horizon)));
  DBUG_ASSERT(buffer_no == buffer->buffer_no);
  buffer->pre_force_close_horizon=
    buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
  DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: %p",
                      buffer));
  buffer->offset= log_descriptor.horizon;
  buffer->next_buffer_offset= LSN_IMPOSSIBLE;
  buffer->file= get_current_logfile();
  buffer->overlay= 0;
  buffer->size= 0;
  buffer->skipped_data= 0;
  translog_cursor_init(cursor, buffer, buffer_no);
  DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: %p "
                      "chaser: %d Size: %lu (%lu)",
                      (long) (buffer->file ? buffer->file->number : 0),
                      (buffer->file ? buffer->file->handler.file : -1),
                      (uint) cursor->buffer->buffer_no, cursor->buffer,
                      cursor->chaser, (ulong) cursor->buffer->size,
                      (ulong) (cursor->ptr - cursor->buffer->buffer)));
  translog_check_cursor(cursor);
  mysql_mutex_lock(&log_descriptor.dirty_buffer_mask_lock);
  log_descriptor.dirty_buffer_mask|= (1 << buffer->buffer_no);
  mysql_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock);

  DBUG_VOID_RETURN;
}


/*
  @brief Switch to the next buffer in a chain.

  @param horizon         \ Pointers on current position in file and buffer
  @param cursor          /
  @param new_file        Also start new file

  @note
   - loghandler should be locked
   - after return new and old buffer still are locked

  @retval 0 OK
  @retval 1 Error (creating the new log file failed)
*/

static my_bool translog_buffer_next(TRANSLOG_ADDRESS *horizon,
                                    struct st_buffer_cursor *cursor,
                                    my_bool new_file)
{
  uint old_buffer_no= cursor->buffer_no;
  /* Buffers form a ring: the first one follows the last one */
  uint new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
  struct st_translog_buffer *new_buffer= log_descriptor.buffers + new_buffer_no;
  my_bool chasing= cursor->chaser;
  DBUG_ENTER("translog_buffer_next");

  DBUG_PRINT("info", ("horizon: " LSN_FMT " chasing: %d",
                      LSN_IN_PARTS(log_descriptor.horizon), chasing));

  DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, *horizon) >= 0);

  translog_finish_page(horizon, cursor);

  if (!chasing)
  {
    translog_buffer_lock(new_buffer);
    /*
      In debug builds the wait below is wrapped in a block that snapshots
      the buffer state before it and checks it afterwards; the braces are
      opened and closed inside the #ifndef regions on purpose.
    */
#ifndef DBUG_OFF
    {
      TRANSLOG_ADDRESS offset= new_buffer->offset;
      TRANSLOG_FILE *file= new_buffer->file;
      uint8 ver= new_buffer->ver;
      translog_lock_assert_owner();
#endif
      translog_wait_for_buffer_free(new_buffer);
#ifndef DBUG_OFF
      /* We keep the handler locked so nobody can start this new buffer */
      DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
                  (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
    }
#endif
  }
  else
    DBUG_ASSERT(new_buffer->file != NULL);

  if (new_file)
  {
    /* move the horizon to the next file and its header page */
    (*horizon)+= LSN_ONE_FILE;
    (*horizon)= LSN_REPLACE_OFFSET(*horizon, TRANSLOG_PAGE_SIZE);
    /* Only the non-chasing caller creates the file on disk */
    if (!chasing && translog_create_new_file())
    {
      /* NOTE(review): new_buffer is still locked here - confirm callers unlock it */
      DBUG_RETURN(1);
    }
  }

  /* prepare next page */
  if (chasing)
    translog_cursor_init(cursor, new_buffer, new_buffer_no);
  else
  {
    translog_lock_assert_owner();
    translog_start_buffer(new_buffer, cursor, new_buffer_no);
    /* Link the new buffer back to the one we are leaving */
    new_buffer->prev_buffer_offset=
      log_descriptor.buffers[old_buffer_no].offset;
    new_buffer->prev_last_lsn=
      BUFFER_MAX_LSN(log_descriptor.buffers + old_buffer_no);
  }
  /* ... and the old buffer forward to the new one */
  log_descriptor.buffers[old_buffer_no].next_buffer_offset= new_buffer->offset;
  DBUG_PRINT("info", ("prev_last_lsn set to " LSN_FMT " buffer:%p",
                      LSN_IN_PARTS(new_buffer->prev_last_lsn),
                      new_buffer));
  translog_new_page_header(horizon, cursor);
  DBUG_RETURN(0);
}


/*
  Sets max LSN sent to file, and address from which data is only in the buffer

  SYNOPSIS
    translog_set_sent_to_disk()
    buffer               buffer which we have sent to disk

  TODO: use atomic operations if possible (64bit architectures?)
*/ static void translog_set_sent_to_disk(struct st_translog_buffer *buffer) { LSN lsn= buffer->last_lsn; TRANSLOG_ADDRESS in_buffers= buffer->next_buffer_offset; DBUG_ENTER("translog_set_sent_to_disk"); mysql_mutex_lock(&log_descriptor.sent_to_disk_lock); DBUG_PRINT("enter", ("lsn: " LSN_FMT " in_buffers: " LSN_FMT " " "in_buffers_only: " LSN_FMT " start: " LSN_FMT " " "sent_to_disk: " LSN_FMT, LSN_IN_PARTS(lsn), LSN_IN_PARTS(in_buffers), LSN_IN_PARTS(log_descriptor.log_start), LSN_IN_PARTS(log_descriptor.in_buffers_only), LSN_IN_PARTS(log_descriptor.sent_to_disk))); /* We write sequentially (first part of following assert) but we rewrite the same page in case we started mysql and shut it down immediately (second part of the following assert) */ DBUG_ASSERT(cmp_translog_addr(lsn, log_descriptor.sent_to_disk) >= 0 || cmp_translog_addr(lsn, log_descriptor.log_start) < 0); log_descriptor.sent_to_disk= lsn; /* LSN_IMPOSSIBLE == 0 => it will work for very first time */ if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0) { log_descriptor.in_buffers_only= in_buffers; DBUG_PRINT("info", ("set new in_buffers_only")); } mysql_mutex_unlock(&log_descriptor.sent_to_disk_lock); DBUG_VOID_RETURN; } /* Sets address from which data is only in the buffer SYNOPSIS translog_set_only_in_buffers() lsn LSN to assign in_buffers to assign to in_buffers_only */ static void translog_set_only_in_buffers(TRANSLOG_ADDRESS in_buffers) { DBUG_ENTER("translog_set_only_in_buffers"); mysql_mutex_lock(&log_descriptor.sent_to_disk_lock); DBUG_PRINT("enter", ("in_buffers: " LSN_FMT " " "in_buffers_only: " LSN_FMT, LSN_IN_PARTS(in_buffers), LSN_IN_PARTS(log_descriptor.in_buffers_only))); /* LSN_IMPOSSIBLE == 0 => it will work for very first time */ if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0) { if (translog_status != TRANSLOG_OK) goto end; log_descriptor.in_buffers_only= in_buffers; DBUG_PRINT("info", ("set new in_buffers_only")); } end: 
mysql_mutex_unlock(&log_descriptor.sent_to_disk_lock); DBUG_VOID_RETURN; } /* Gets address from which data is only in the buffer SYNOPSIS translog_only_in_buffers() RETURN address from which data is only in the buffer */ static TRANSLOG_ADDRESS translog_only_in_buffers() { register TRANSLOG_ADDRESS addr; DBUG_ENTER("translog_only_in_buffers"); mysql_mutex_lock(&log_descriptor.sent_to_disk_lock); addr= log_descriptor.in_buffers_only; mysql_mutex_unlock(&log_descriptor.sent_to_disk_lock); DBUG_RETURN(addr); } /* Get max LSN sent to file SYNOPSIS translog_get_sent_to_disk() RETURN max LSN send to file */ static LSN translog_get_sent_to_disk() { register LSN lsn; DBUG_ENTER("translog_get_sent_to_disk"); mysql_mutex_lock(&log_descriptor.sent_to_disk_lock); lsn= log_descriptor.sent_to_disk; DBUG_PRINT("info", ("sent to disk up to " LSN_FMT, LSN_IN_PARTS(lsn))); mysql_mutex_unlock(&log_descriptor.sent_to_disk_lock); DBUG_RETURN(lsn); } /* Get first chunk address on the given page SYNOPSIS translog_get_first_chunk_offset() page The page where to find first chunk RETURN first chunk offset */ static my_bool translog_get_first_chunk_offset(uchar *page) { DBUG_ENTER("translog_get_first_chunk_offset"); DBUG_ASSERT(page[TRANSLOG_PAGE_FLAGS] < TRANSLOG_FLAGS_NUM); DBUG_RETURN(page_overhead[page[TRANSLOG_PAGE_FLAGS]]); } /* Write coded length of record SYNOPSIS translog_write_variable_record_1group_code_len dst Destination buffer pointer length Length which should be coded header_len Calculated total header length */ static void translog_write_variable_record_1group_code_len(uchar *dst, translog_size_t length, uint16 header_len) { switch (header_len) { case 6: /* (5 + 1) */ DBUG_ASSERT(length <= 250); *dst= (uint8) length; return; case 8: /* (5 + 3) */ DBUG_ASSERT(length <= 0xFFFF); *dst= 251; int2store(dst + 1, length); return; case 9: /* (5 + 4) */ DBUG_ASSERT(length <= (ulong) 0xFFFFFF); *dst= 252; int3store(dst + 1, length); return; case 10: /* (5 + 5) */ *dst= 253; 
int4store(dst + 1, length); return; default: DBUG_ASSERT(0); } return; } /* Decode record data length and advance given pointer to the next field SYNOPSIS translog_variable_record_1group_decode_len() src The pointer to the pointer to the length beginning RETURN decoded length */ static translog_size_t translog_variable_record_1group_decode_len(uchar **src) { uint8 first= (uint8) (**src); switch (first) { case 251: (*src)+= 3; return (uint2korr((*src) - 2)); case 252: (*src)+= 4; return (uint3korr((*src) - 3)); case 253: (*src)+= 5; return (uint4korr((*src) - 4)); case 254: case 255: DBUG_ASSERT(0); /* reserved for future use */ return (0); default: (*src)++; return (first); } } /* Get total length of this chunk (not only body) SYNOPSIS translog_get_total_chunk_length() page The page where chunk placed offset Offset of the chunk on this place RETURN total length of the chunk */ static uint16 translog_get_total_chunk_length(uchar *page, uint16 offset) { DBUG_ENTER("translog_get_total_chunk_length"); switch (page[offset] & TRANSLOG_CHUNK_TYPE) { case TRANSLOG_CHUNK_LSN: { /* 0 chunk referred as LSN (head or tail) */ translog_size_t rec_len; uchar *start= page + offset; uchar *ptr= start + 1 + 2; /* chunk type and short trid */ uint16 chunk_len, header_len, page_rest; DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN")); rec_len= translog_variable_record_1group_decode_len(&ptr); chunk_len= uint2korr(ptr); header_len= (uint16) (ptr -start) + 2; DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u", (ulong) rec_len, (uint) chunk_len, (uint) header_len)); if (chunk_len) { DBUG_PRINT("info", ("chunk len: %u + %u = %u", (uint) header_len, (uint) chunk_len, (uint) (chunk_len + header_len))); DBUG_RETURN(chunk_len + header_len); } page_rest= TRANSLOG_PAGE_SIZE - offset; DBUG_PRINT("info", ("page_rest %u", (uint) page_rest)); if (rec_len + header_len < page_rest) DBUG_RETURN(rec_len + header_len); DBUG_RETURN(page_rest); } case TRANSLOG_CHUNK_FIXED: { uchar *ptr; uint type= 
page[offset] & TRANSLOG_REC_TYPE; uint length; int i; /* 1 (pseudo)fixed record (also LSN) */ DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED")); DBUG_ASSERT(log_record_type_descriptor[type].rclass == LOGRECTYPE_FIXEDLENGTH || log_record_type_descriptor[type].rclass == LOGRECTYPE_PSEUDOFIXEDLENGTH); if (log_record_type_descriptor[type].rclass == LOGRECTYPE_FIXEDLENGTH) { DBUG_PRINT("info", ("Fixed length: %u", (uint) (log_record_type_descriptor[type].fixed_length + 3))); DBUG_RETURN(log_record_type_descriptor[type].fixed_length + 3); } ptr= page + offset + 3; /* first compressed LSN */ length= log_record_type_descriptor[type].fixed_length + 3; for (i= 0; i < log_record_type_descriptor[type].compressed_LSN; i++) { /* first 2 bits is length - 2 */ uint len= (((uint8) (*ptr)) >> 6) + 2; if (ptr[0] == 0 && ((uint8) ptr[1]) == 1) len+= LSN_STORE_SIZE; /* case of full LSN storing */ ptr+= len; /* subtract saved bytes */ length-= (LSN_STORE_SIZE - len); } DBUG_PRINT("info", ("Pseudo-fixed length: %u", length)); DBUG_RETURN(length); } case TRANSLOG_CHUNK_NOHDR: /* 2 no header chunk (till page end) */ DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR length: %u", (uint) (TRANSLOG_PAGE_SIZE - offset))); DBUG_RETURN(TRANSLOG_PAGE_SIZE - offset); case TRANSLOG_CHUNK_LNGTH: /* 3 chunk with chunk length */ DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH")); DBUG_ASSERT(TRANSLOG_PAGE_SIZE - offset >= 3); DBUG_PRINT("info", ("length: %u", uint2korr(page + offset + 1) + 3)); DBUG_RETURN(uint2korr(page + offset + 1) + 3); default: DBUG_ASSERT(0); DBUG_RETURN(0); } } /* @brief Waits previous buffer flush finish @param buffer buffer for check @retval 0 previous buffer flushed and this thread have to flush this one @retval 1 previous buffer flushed and this buffer flushed by other thread too */ my_bool translog_prev_buffer_flush_wait(struct st_translog_buffer *buffer) { TRANSLOG_ADDRESS offset= buffer->offset; TRANSLOG_FILE *file= buffer->file; uint8 ver= buffer->ver; 
DBUG_ENTER("translog_prev_buffer_flush_wait"); DBUG_PRINT("enter", ("buffer: %p #%u offset: " LSN_FMT " " "prev sent: " LSN_FMT " prev offset: " LSN_FMT, buffer, (uint) buffer->buffer_no, LSN_IN_PARTS(buffer->offset), LSN_IN_PARTS(buffer->prev_sent_to_disk), LSN_IN_PARTS(buffer->prev_buffer_offset))); translog_buffer_lock_assert_owner(buffer); if (buffer->prev_buffer_offset != buffer->prev_sent_to_disk) { do { mysql_cond_wait(&buffer->prev_sent_to_disk_cond, &buffer->mutex); if (buffer->file != file || buffer->offset != offset || buffer->ver != ver) DBUG_RETURN(1); /* some the thread flushed the buffer already */ } while(buffer->prev_buffer_offset != buffer->prev_sent_to_disk); } DBUG_RETURN(0); } /* Flush given buffer SYNOPSIS translog_buffer_flush() buffer This buffer should be flushed RETURN 0 OK 1 Error */ static my_bool translog_buffer_flush(struct st_translog_buffer *buffer) { uint32 i, pg; TRANSLOG_ADDRESS offset= buffer->offset; TRANSLOG_FILE *file= buffer->file; uint8 ver= buffer->ver; uint skipped_data; DBUG_ENTER("translog_buffer_flush"); DBUG_PRINT("enter", ("Buffer: #%u %p file: %d offset: " LSN_FMT " size: %lu", (uint) buffer->buffer_no, buffer, buffer->file->handler.file, LSN_IN_PARTS(buffer->offset), (ulong) buffer->size)); translog_buffer_lock_assert_owner(buffer); if (buffer->file == NULL) DBUG_RETURN(0); translog_wait_for_writers(buffer); if (buffer->file != file || buffer->offset != offset || buffer->ver != ver) DBUG_RETURN(0); /* some the thread flushed the buffer already */ if (buffer->is_closing_buffer) { /* some other flush in progress */ translog_wait_for_closing(buffer); if (buffer->file != file || buffer->offset != offset || buffer->ver != ver) DBUG_RETURN(0); /* some the thread flushed the buffer already */ } if (buffer->overlay && translog_prev_buffer_flush_wait(buffer)) DBUG_RETURN(0); /* some the thread flushed the buffer already */ /* Send page by page in the pagecache what we are going to write on the disk */ file= buffer->file; 
skipped_data= buffer->skipped_data; DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE); for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE; i < buffer->size; i+= TRANSLOG_PAGE_SIZE, pg++) { #ifdef DBUG_TRACE TRANSLOG_ADDRESS addr= (buffer->offset + i); #endif DBUG_PRINT("info", ("send log form %lu till %lu address: " LSN_FMT " " "page #: %lu buffer size: %lu buffer: %p", (ulong) i, (ulong) (i + TRANSLOG_PAGE_SIZE), LSN_IN_PARTS(addr), (ulong) pg, (ulong) buffer->size, buffer)); DBUG_ASSERT(log_descriptor.pagecache->block_size == TRANSLOG_PAGE_SIZE); DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size); if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN) DBUG_RETURN(1); if (pagecache_write_part(log_descriptor.pagecache, &file->handler, pg, 3, buffer->buffer + i, PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_LEFT_UNLOCKED, PAGECACHE_PIN_LEFT_UNPINNED, PAGECACHE_WRITE_DONE, 0, LSN_IMPOSSIBLE, skipped_data, TRANSLOG_PAGE_SIZE - skipped_data)) { DBUG_PRINT("error", ("Can't write page " LSN_FMT " to pagecache, error: %d", buffer->file->number, (uint)(LSN_OFFSET(buffer->offset)+ i), my_errno)); translog_stop_writing(); DBUG_RETURN(1); } skipped_data= 0; } file->is_sync= 0; if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data, buffer->size - buffer->skipped_data, LSN_OFFSET(buffer->offset) + buffer->skipped_data, log_write_flags)) { DBUG_PRINT("error", ("Can't write buffer " LSN_FMT " size %lu " "to the disk (%d)", (uint) file->handler.file, (uint) LSN_OFFSET(buffer->offset), (ulong) buffer->size, errno)); translog_stop_writing(); DBUG_RETURN(1); } /* Dropping the flag in such way can make false alarm: signalling than the file in not sync when it is sync, but the situation is quite rare and protections with mutexes give much more overhead to the whole engine */ file->is_sync= 0; if (LSN_OFFSET(buffer->last_lsn) != 0) /* if buffer->last_lsn is set */ { if (translog_prev_buffer_flush_wait(buffer)) DBUG_RETURN(0); /* some the thread 
flushed the buffer already */ translog_set_sent_to_disk(buffer); } else translog_set_only_in_buffers(buffer->next_buffer_offset); /* say to next buffer that we are finished */ { struct st_translog_buffer *next_buffer= log_descriptor.buffers + ((buffer->buffer_no + 1) % TRANSLOG_BUFFERS_NO); if (likely(translog_status == TRANSLOG_OK)){ translog_buffer_lock(next_buffer); next_buffer->prev_sent_to_disk= buffer->offset; translog_buffer_unlock(next_buffer); mysql_cond_broadcast(&next_buffer->prev_sent_to_disk_cond); } else { /* It is shutdown => 1) there is only one thread 2) mutexes of other buffers can be destroyed => we can't use them */ next_buffer->prev_sent_to_disk= buffer->offset; } } /* Free buffer */ buffer->file= NULL; buffer->overlay= 0; buffer->ver++; mysql_mutex_lock(&log_descriptor.dirty_buffer_mask_lock); log_descriptor.dirty_buffer_mask&= ~(1 << buffer->buffer_no); mysql_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock); mysql_cond_broadcast(&buffer->waiting_filling_buffer); DBUG_RETURN(0); } /* Recover page with sector protection (wipe out failed chunks) SYNOPSYS translog_recover_page_up_to_sector() page reference on the page offset offset of failed sector RETURN 0 OK 1 Error */ static my_bool translog_recover_page_up_to_sector(uchar *page, uint16 offset) { uint16 chunk_offset= translog_get_first_chunk_offset(page), valid_chunk_end; DBUG_ENTER("translog_recover_page_up_to_sector"); DBUG_PRINT("enter", ("offset: %u first chunk: %u", (uint) offset, (uint) chunk_offset)); while (chunk_offset < offset && page[chunk_offset] != TRANSLOG_FILLER) { uint16 chunk_length; if ((chunk_length= translog_get_total_chunk_length(page, chunk_offset)) == 0) { DBUG_PRINT("error", ("cant get chunk length (offset %u)", (uint) chunk_offset)); DBUG_RETURN(1); } DBUG_PRINT("info", ("chunk: offset: %u length %u", (uint) chunk_offset, (uint) chunk_length)); if (((ulong) chunk_offset) + ((ulong) chunk_length) > TRANSLOG_PAGE_SIZE) { DBUG_PRINT("error", ("damaged chunk (offset 
%u) in trusted area", (uint) chunk_offset)); DBUG_RETURN(1); } chunk_offset+= chunk_length; } valid_chunk_end= chunk_offset; /* end of trusted area - sector parsing */ while (page[chunk_offset] != TRANSLOG_FILLER) { uint16 chunk_length; if ((chunk_length= translog_get_total_chunk_length(page, chunk_offset)) == 0) break; DBUG_PRINT("info", ("chunk: offset: %u length %u", (uint) chunk_offset, (uint) chunk_length)); if (((ulong) chunk_offset) + ((ulong) chunk_length) > (uint) (offset + DISK_DRIVE_SECTOR_SIZE)) break; chunk_offset+= chunk_length; valid_chunk_end= chunk_offset; } DBUG_PRINT("info", ("valid chunk end offset: %u", (uint) valid_chunk_end)); memset(page + valid_chunk_end, TRANSLOG_FILLER, TRANSLOG_PAGE_SIZE - valid_chunk_end); DBUG_RETURN(0); } /** @brief Checks and removes sector protection. @param page reference on the page content. @param file transaction log descriptor. @retvat 0 OK @retval 1 Error */ static my_bool translog_check_sector_protection(uchar *page, TRANSLOG_FILE *file) { uint i, offset; uchar *table= page + page_overhead[page[TRANSLOG_PAGE_FLAGS]] - TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; uint8 current= table[0]; DBUG_ENTER("translog_check_sector_protection"); for (i= 1, offset= DISK_DRIVE_SECTOR_SIZE; i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; i++, offset+= DISK_DRIVE_SECTOR_SIZE) { /* TODO: add chunk counting for "suspecting" sectors (difference is more than 1-2), if difference more then present chunks then it is the problem. */ uint8 test= page[offset]; DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx " "read: 0x%x stored: 0x%x%x", i, offset, (ulong) current, (uint) uint2korr(page + offset), (uint) table[i], (uint) table[i + 1])); /* 3 is minimal possible record length. So we can have "distance" between 2 sectors value more then DISK_DRIVE_SECTOR_SIZE / 3 only if it is old value, i.e. the sector was not written. 
*/ if (((test < current) && ((uint)(0xFFL - current + test) > DISK_DRIVE_SECTOR_SIZE / 3)) || ((test >= current) && ((uint)(test - current) > DISK_DRIVE_SECTOR_SIZE / 3))) { if (translog_recover_page_up_to_sector(page, offset)) DBUG_RETURN(1); file->was_recovered= 1; DBUG_RETURN(0); } /* Restore value on the page */ page[offset]= table[i]; current= test; DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx " "read: 0x%x stored: 0x%x", i, offset, (ulong) current, (uint) page[offset], (uint) table[i])); } DBUG_RETURN(0); } /** @brief Log page validator (read callback) @param page The page data to check @param page_no The page number (/) @param data_ptr Read callback data pointer (pointer to TRANSLOG_FILE) @todo: add turning loghandler to read-only mode after merging with that patch. @retval 0 OK @retval 1 Error */ static my_bool translog_page_validator(int res, PAGECACHE_IO_HOOK_ARGS *args) { uchar *page= args->page; pgcache_page_no_t page_no= args->pageno; uint this_page_page_overhead; uint flags; uchar *page_pos; TRANSLOG_FILE *data= (TRANSLOG_FILE *) args->data; #ifdef DBUG_TRACE pgcache_page_no_t offset= page_no * TRANSLOG_PAGE_SIZE; #endif DBUG_ENTER("translog_page_validator"); data->was_recovered= 0; if (res) { DBUG_RETURN(1); } if ((pgcache_page_no_t) uint3korr(page) != page_no || (uint32) uint3korr(page + 3) != data->number) { DBUG_PRINT("error", ("Page " LSN_FMT ": " "page address written in the page is incorrect: " "File %lu instead of %lu or page %lu instead of %lu", (uint)data->number, (uint)offset, (ulong) uint3korr(page + 3), (ulong) data->number, (ulong) uint3korr(page), (ulong) page_no)); DBUG_RETURN(1); } flags= (uint)(page[TRANSLOG_PAGE_FLAGS]); this_page_page_overhead= page_overhead[flags]; if (flags & ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | TRANSLOG_RECORD_CRC)) { DBUG_PRINT("error", ("Page " LSN_FMT ": " "Garbage in the page flags field detected : %x", (uint) data->number, (uint) offset, (uint) flags)); DBUG_RETURN(1); } page_pos= 
page + (3 + 3 + 1); if (flags & TRANSLOG_PAGE_CRC) { uint32 crc= translog_crc(page + this_page_page_overhead, TRANSLOG_PAGE_SIZE - this_page_page_overhead); if (crc != uint4korr(page_pos)) { DBUG_PRINT("error", ("Page " LSN_FMT ": " "CRC mismatch: calculated: %lx on the page %lx", (uint) data->number, (uint) offset, (ulong) crc, (ulong) uint4korr(page_pos))); DBUG_RETURN(1); } page_pos+= CRC_SIZE; /* Skip crc */ } if (flags & TRANSLOG_SECTOR_PROTECTION && translog_check_sector_protection(page, data)) { DBUG_RETURN(1); } DBUG_RETURN(0); } /** @brief Locks the loghandler. */ void translog_lock() { uint8 current_buffer; DBUG_ENTER("translog_lock"); /* Locking the loghandler mean locking current buffer, but it can change during locking, so we should check it */ for (;;) { /* log_descriptor.bc.buffer_no is only one byte so its reading is an atomic operation */ current_buffer= log_descriptor.bc.buffer_no; translog_buffer_lock(log_descriptor.buffers + current_buffer); if (log_descriptor.bc.buffer_no == current_buffer) break; translog_buffer_unlock(log_descriptor.buffers + current_buffer); } DBUG_VOID_RETURN; } /* Unlock the loghandler SYNOPSIS translog_unlock() RETURN 0 OK 1 Error */ void translog_unlock() { translog_buffer_unlock(log_descriptor.bc.buffer); } /** @brief Get log page by file number and offset of the beginning of the page @param data validator data, which contains the page address @param buffer buffer for page placing (might not be used in some cache implementations) @param direct_link if it is not NULL then caller can accept direct link to the page cache @retval NULL Error @retval # pointer to the page cache which should be used to read this page */ static uchar *translog_get_page(TRANSLOG_VALIDATOR_DATA *data, uchar *buffer, PAGECACHE_BLOCK_LINK **direct_link) { TRANSLOG_ADDRESS addr= *(data->addr), in_buffers; uint32 file_no= LSN_FILE_NO(addr); TRANSLOG_FILE *file; DBUG_ENTER("translog_get_page"); DBUG_PRINT("enter", ("File: %u Offset: %u(0x%x)", 
file_no, (uint) LSN_OFFSET(addr), (uint) LSN_OFFSET(addr))); /* it is really page address */ DBUG_ASSERT(LSN_OFFSET(addr) % TRANSLOG_PAGE_SIZE == 0); if (direct_link) *direct_link= NULL; restart: in_buffers= translog_only_in_buffers(); DBUG_PRINT("info", ("in_buffers: " LSN_FMT, LSN_IN_PARTS(in_buffers))); if (in_buffers != LSN_IMPOSSIBLE && cmp_translog_addr(addr, in_buffers) >= 0) { translog_lock(); DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0); /* recheck with locked loghandler */ in_buffers= translog_only_in_buffers(); if (cmp_translog_addr(addr, in_buffers) >= 0) { uint16 buffer_no= log_descriptor.bc.buffer_no; #ifdef DBUG_ASSERT_EXISTS uint16 buffer_start= buffer_no; #endif struct st_translog_buffer *buffer_unlock= log_descriptor.bc.buffer; struct st_translog_buffer *curr_buffer= log_descriptor.bc.buffer; for (;;) { /* if the page is in the buffer and it is the last version of the page (in case of division the page by buffer flush) */ if (curr_buffer->file != NULL && cmp_translog_addr(addr, curr_buffer->offset) >= 0 && cmp_translog_addr(addr, (curr_buffer->next_buffer_offset ? 
curr_buffer->next_buffer_offset: curr_buffer->offset + curr_buffer->size)) < 0) { TRANSLOG_ADDRESS offset= curr_buffer->offset; TRANSLOG_FILE *fl= curr_buffer->file; uchar *from, *table= NULL; int is_last_unfinished_page; uint last_protected_sector= 0; uint skipped_data= curr_buffer->skipped_data; TRANSLOG_FILE file_copy; uint8 ver= curr_buffer->ver; translog_wait_for_writers(curr_buffer); if (offset != curr_buffer->offset || fl != curr_buffer->file || ver != curr_buffer->ver) { DBUG_ASSERT(buffer_unlock == curr_buffer); translog_buffer_unlock(buffer_unlock); goto restart; } DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset)); from= curr_buffer->buffer + (addr - curr_buffer->offset); if (skipped_data && addr == curr_buffer->offset) { /* We read page part of which is not present in buffer, so we should read absent part from file (page cache actually) */ file= get_logfile_by_number(file_no); DBUG_ASSERT(file != NULL); /* it's ok to not lock the page because: - The log handler has it's own page cache. - There is only one thread that can access the log cache at a time */ if (!(buffer= pagecache_read(log_descriptor.pagecache, &file->handler, LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE, 3, buffer, PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_LEFT_UNLOCKED, NULL))) DBUG_RETURN(NULL); } else skipped_data= 0; /* Read after skipped in buffer data */ /* Now we have correct data in buffer up to 'skipped_data'. The following memcpy() will move the data from the internal buffer that was not yet on disk. */ memcpy(buffer + skipped_data, from + skipped_data, TRANSLOG_PAGE_SIZE - skipped_data); /* We can use copy then in translog_page_validator() because it do not put it permanently somewhere. 
We have to use copy because after releasing log lock we can't guaranty that the file still be present (in real life it will be present but theoretically possible that it will be released already from last files cache); */ file_copy= *(curr_buffer->file); file_copy.handler.callback_data= (uchar*) &file_copy; is_last_unfinished_page= ((log_descriptor.bc.buffer == curr_buffer) && (log_descriptor.bc.ptr >= from) && (log_descriptor.bc.ptr < from + TRANSLOG_PAGE_SIZE)); if (is_last_unfinished_page && (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)) { last_protected_sector= ((log_descriptor.bc.previous_offset - 1) / DISK_DRIVE_SECTOR_SIZE); table= buffer + log_descriptor.page_overhead - TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; } DBUG_ASSERT(buffer_unlock == curr_buffer); translog_buffer_unlock(buffer_unlock); if (is_last_unfinished_page) { uint i; /* This is last unfinished page => we should not check CRC and remove only that protection which already installed (no need to check it) We do not check the flag of sector protection, because if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) is not set then last_protected_sector will be 0 so following loop will be never executed */ DBUG_PRINT("info", ("This is last unfinished page, " "last protected sector %u", last_protected_sector)); for (i= 1; i <= last_protected_sector; i++) { uint offset= i * DISK_DRIVE_SECTOR_SIZE; DBUG_PRINT("info", ("Sector %u: 0x%02x <- 0x%02x", i, buffer[offset], table[i])); buffer[offset]= table[i]; } } else { /* This IF should be true because we use in-memory data which supposed to be correct. 
*/ PAGECACHE_IO_HOOK_ARGS args; args.page= buffer; args.pageno= LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE; args.data= (uchar*) &file_copy; if (translog_page_validator(0, &args)) { DBUG_ASSERT(0); buffer= NULL; } } DBUG_RETURN(buffer); } buffer_no= (buffer_no + 1) % TRANSLOG_BUFFERS_NO; curr_buffer= log_descriptor.buffers + buffer_no; translog_buffer_lock(curr_buffer); translog_buffer_unlock(buffer_unlock); buffer_unlock= curr_buffer; /* we can't make a full circle */ DBUG_ASSERT(buffer_start != buffer_no); } } translog_unlock(); } file= get_logfile_by_number(file_no); DBUG_ASSERT(file != NULL); buffer= pagecache_read(log_descriptor.pagecache, &file->handler, LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE, 3, (direct_link ? NULL : buffer), PAGECACHE_PLAIN_PAGE, (direct_link ? PAGECACHE_LOCK_READ : PAGECACHE_LOCK_LEFT_UNLOCKED), direct_link); DBUG_PRINT("info", ("Direct link is assigned to : %p * %p", direct_link, (direct_link ? *direct_link : NULL))); data->was_recovered= file->was_recovered; DBUG_RETURN(buffer); } /** @brief free direct log page link @param direct_link the direct log page link to be freed */ static void translog_free_link(PAGECACHE_BLOCK_LINK *direct_link) { DBUG_ENTER("translog_free_link"); DBUG_PRINT("info", ("Direct link: %p", direct_link)); if (direct_link) pagecache_unlock_by_link(log_descriptor.pagecache, direct_link, PAGECACHE_LOCK_READ_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, 0, FALSE); DBUG_VOID_RETURN; } /** @brief Finds last full page of the given log file. @param addr address structure to fill with data, which contain file number of the log file @param last_page_ok Result of the check whether last page OK. (for now only we check only that file length divisible on page length). 
@param no_errors       suppress messages about non-critical errors

  @note On success *addr is replaced by the address of the last full page
        of the file (offset 0 if the file is not longer than one page).

  @retval 0 OK
  @retval 1 Error
*/

static my_bool translog_get_last_page_addr(TRANSLOG_ADDRESS *addr,
                                           my_bool *last_page_ok,
                                           my_bool no_errors)
{
  char path[FN_REFLEN];
  uint32 rec_offset;
  my_off_t file_size;
  uint32 file_no= LSN_FILE_NO(*addr);
  TRANSLOG_FILE *file;
#ifdef DBUG_TRACE
  char buff[21];
#endif
  DBUG_ENTER("translog_get_last_page_addr");

  if (likely((file= get_logfile_by_number(file_no)) != NULL))
  {
    /*
      This function used only during initialization of loghandler or in
      scanner (which mean we need read that part of the log), so the
      requested log file have to be opened and can't be freed after
      returning pointer on it (file_size).
    */
    file_size= mysql_file_seek(file->handler.file, 0, SEEK_END, MYF(0));
  }
  else
  {
    /*
      This branch is used only during very early initialization
      when files are not opened.
    */
    File fd;
    if ((fd= mysql_file_open(key_file_translog,
                             translog_filename_by_fileno(file_no, path),
                             O_RDONLY | O_CLOEXEC, (no_errors ? MYF(0) : MYF(MY_WME)))) < 0)
    {
      my_errno= errno;
      DBUG_PRINT("error", ("Error %d during opening file #%d",
                           errno, file_no));
      DBUG_RETURN(1);
    }
    /* Seeking to the end yields the file size */
    file_size= mysql_file_seek(fd, 0, SEEK_END, MYF(0));
    mysql_file_close(fd, MYF(0));
  }
  DBUG_PRINT("info", ("File size: %s", llstr(file_size, buff)));
  if (file_size == MY_FILEPOS_ERROR)
    DBUG_RETURN(1);
  DBUG_ASSERT(file_size < 0xffffffffULL);
  if (((uint32)file_size) > TRANSLOG_PAGE_SIZE)
  {
    /* Start of the last full page; a trailing partial page is ignored */
    rec_offset= (((((uint32)file_size) / TRANSLOG_PAGE_SIZE) - 1) *
                       TRANSLOG_PAGE_SIZE);
    /* OK only if the file ends exactly at the end of that page */
    *last_page_ok= (((uint32)file_size) == rec_offset + TRANSLOG_PAGE_SIZE);
  }
  else
  {
    *last_page_ok= 0;
    rec_offset= 0;
  }
  *addr= MAKE_LSN(file_no, rec_offset);
  DBUG_PRINT("info", ("Last page: 0x%lx ok: %d", (ulong) rec_offset,
                      *last_page_ok));
  DBUG_RETURN(0);
}


/**
  @brief Get number bytes for record length storing

  @param length          Record length which will be encoded

  @return 1,3,4,5 - number of bytes to store given length
*/

static uint translog_variable_record_length_bytes(translog_size_t length)
{
  /* Thresholds are strict: the boundary values use the next larger form */
  if (length < 250)
    return 1;
  if (length < 0xFFFF)
    return 3;
  if (length < (ulong) 0xFFFFFF)
    return 4;
  return 5;
}


/**
  @brief Gets header of this chunk.
@param chunk The pointer to the chunk beginning @retval # total length of the chunk @retval 0 Error */ static uint16 translog_get_chunk_header_length(uchar *chunk) { DBUG_ENTER("translog_get_chunk_header_length"); switch (*chunk & TRANSLOG_CHUNK_TYPE) { case TRANSLOG_CHUNK_LSN: { /* 0 chunk referred as LSN (head or tail) */ translog_size_t rec_len __attribute__((unused)); uchar *start= chunk; uchar *ptr= start + 1 + 2; uint16 chunk_len, header_len; DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN")); rec_len= translog_variable_record_1group_decode_len(&ptr); chunk_len= uint2korr(ptr); header_len= (uint16) (ptr - start) +2; DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u", (ulong) rec_len, (uint) chunk_len, (uint) header_len)); if (chunk_len) { /* TODO: fine header end */ /* The last chunk of multi-group record can be base for it header calculation (we skip to the first group to read the header) so if we stuck here something is wrong. */ DBUG_ASSERT(0); DBUG_RETURN(0); /* Keep compiler happy */ } DBUG_RETURN(header_len); } case TRANSLOG_CHUNK_FIXED: { /* 1 (pseudo)fixed record (also LSN) */ DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED = 3")); DBUG_RETURN(3); } case TRANSLOG_CHUNK_NOHDR: /* 2 no header chunk (till page end) */ DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR = 1")); DBUG_RETURN(1); break; case TRANSLOG_CHUNK_LNGTH: /* 3 chunk with chunk length */ DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH = 3")); DBUG_RETURN(3); break; } DBUG_ASSERT(0); DBUG_RETURN(0); /* Keep compiler happy */ } /** @brief Truncate the log to the given address. Used during the startup if the end of log if corrupted. 
@param addr new horizon @retval 0 OK @retval 1 Error */ static my_bool translog_truncate_log(TRANSLOG_ADDRESS addr) { uchar *page; TRANSLOG_ADDRESS current_page; uint32 next_page_offset, page_rest; uint32 i; File fd; int rc; TRANSLOG_VALIDATOR_DATA data; char path[FN_REFLEN]; uchar page_buff[TRANSLOG_PAGE_SIZE]; DBUG_ENTER("translog_truncate_log"); /* TODO: write warning to the client */ DBUG_PRINT("warning", ("removing all records from " LSN_FMT " " "till " LSN_FMT, LSN_IN_PARTS(addr), LSN_IN_PARTS(log_descriptor.horizon))); DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0); /* remove files between the address and horizon */ for (i= LSN_FILE_NO(addr) + 1; i <= LSN_FILE_NO(log_descriptor.horizon); i++) if (mysql_file_delete(key_file_translog, translog_filename_by_fileno(i, path), MYF(MY_WME))) { translog_unlock(); DBUG_RETURN(1); } /* truncate the last file up to the last page */ next_page_offset= LSN_OFFSET(addr); next_page_offset= (next_page_offset - ((next_page_offset - 1) % TRANSLOG_PAGE_SIZE + 1) + TRANSLOG_PAGE_SIZE); page_rest= next_page_offset - LSN_OFFSET(addr); memset(page_buff, TRANSLOG_FILLER, page_rest); rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 || ((mysql_file_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) || (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr), log_write_flags)) || mysql_file_sync(fd, MYF(MY_WME))))); translog_syncs++; rc|= (fd > 0 && mysql_file_close(fd, MYF(MY_WME))); if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS) { rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD)); translog_syncs++; } if (rc) DBUG_RETURN(1); /* fix the horizon */ log_descriptor.horizon= addr; /* fix the buffer data */ current_page= MAKE_LSN(LSN_FILE_NO(addr), (next_page_offset - TRANSLOG_PAGE_SIZE)); data.addr= ¤t_page; if ((page= translog_get_page(&data, log_descriptor.buffers->buffer, NULL)) == NULL) DBUG_RETURN(1); if (page != log_descriptor.buffers->buffer) 
memcpy(log_descriptor.buffers->buffer, page, TRANSLOG_PAGE_SIZE); log_descriptor.bc.buffer->offset= current_page; log_descriptor.bc.buffer->size= LSN_OFFSET(addr) - LSN_OFFSET(current_page); log_descriptor.bc.ptr= log_descriptor.buffers->buffer + log_descriptor.bc.buffer->size; log_descriptor.bc.current_page_fill= log_descriptor.bc.buffer->size; DBUG_RETURN(0); } /** Applies function 'callback' to all files (in a directory) which name looks like a log's name (aria_log.[0-9]{7}). If 'callback' returns TRUE this interrupts the walk and returns TRUE. Otherwise FALSE is returned after processing all log files. It cannot just use log_descriptor.directory because that may not yet have been initialized. @param directory directory to scan @param callback function to apply; is passed directory and base name of found file */ my_bool translog_walk_filenames(const char *directory, my_bool (*callback)(const char *, const char *)) { MY_DIR *dirp; size_t i; my_bool rc= FALSE; /* Finds and removes transaction log files */ if (!(dirp = my_dir(directory, MYF(MY_DONT_SORT)))) return FALSE; for (i= 0; i < dirp->number_of_files; i++) { char *file= dirp->dir_entry[i].name; if (strncmp(file, "aria_log.", 10) == 0 && file[10] >= '0' && file[10] <= '9' && file[11] >= '0' && file[11] <= '9' && file[12] >= '0' && file[12] <= '9' && file[13] >= '0' && file[13] <= '9' && file[14] >= '0' && file[14] <= '9' && file[15] >= '0' && file[15] <= '9' && file[16] >= '0' && file[16] <= '9' && file[17] >= '0' && file[17] <= '9' && file[18] == '\0' && (*callback)(directory, file)) { rc= TRUE; break; } } my_dirend(dirp); return rc; } /** @brief Fills table of dependence length of page header from page flags */ void translog_fill_overhead_table() { uint i; for (i= 0; i < TRANSLOG_FLAGS_NUM; i++) { page_overhead[i]= 7; if (i & TRANSLOG_PAGE_CRC) page_overhead[i]+= CRC_SIZE; if (i & TRANSLOG_SECTOR_PROTECTION) page_overhead[i]+= TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; } } /** Callback to find first log 
in directory. */ static my_bool translog_callback_search_first(const char *directory __attribute__((unused)), const char *filename __attribute__((unused))) { return TRUE; } /** @brief Checks that chunk is LSN one @param type type of the chunk @retval 1 the chunk is LNS @retval 0 the chunk is not LSN */ static my_bool translog_is_LSN_chunk(uchar type) { DBUG_ENTER("translog_is_LSN_chunk"); DBUG_PRINT("info", ("byte: %x chunk type: %u record type: %u", type, type >> 6, type & TRANSLOG_REC_TYPE)); DBUG_RETURN(((type & TRANSLOG_CHUNK_TYPE) == TRANSLOG_CHUNK_FIXED) || (((type & TRANSLOG_CHUNK_TYPE) == TRANSLOG_CHUNK_LSN) && ((type & TRANSLOG_REC_TYPE)) != TRANSLOG_CHUNK_0_CONT)); } /** @brief Initialize transaction log @param directory Directory where log files are put @param log_file_max_size max size of one log size (for new logs creation) @param server_version version of MySQL server (MYSQL_VERSION_ID) @param server_id server ID (replication & Co) @param pagecache Page cache for the log reads @param flags flags (TRANSLOG_PAGE_CRC, TRANSLOG_SECTOR_PROTECTION TRANSLOG_RECORD_CRC) @param read_only Put transaction log in read-only mode @param init_table_func function to initialize record descriptors table @param no_errors suppress messages about non-critical errors @todo Free used resources in case of error. 
@retval 0 OK @retval 1 Error */ my_bool translog_init_with_table(const char *directory, uint32 log_file_max_size, uint32 server_version, uint32 server_id, PAGECACHE *pagecache, uint flags, my_bool readonly, void (*init_table_func)(), my_bool no_errors) { int i; int old_log_was_recovered= 0, logs_found= 0; uint old_flags= flags; uint32 start_file_num= 1; TRANSLOG_ADDRESS UNINIT_VAR(sure_page), last_page, last_valid_page, checkpoint_lsn; my_bool version_changed= 0; DBUG_ENTER("translog_init_with_table"); translog_syncs= 0; flush_start= 0; id_to_share= NULL; log_purge_disabled= 0; log_descriptor.directory_fd= -1; log_descriptor.is_everything_flushed= 1; log_descriptor.flush_in_progress= 0; log_descriptor.flush_no= 0; log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE; /* Normally in Aria this this calls translog_table_init() */ (*init_table_func)(); compile_time_assert(sizeof(log_descriptor.dirty_buffer_mask) * 8 >= TRANSLOG_BUFFERS_NO); log_descriptor.dirty_buffer_mask= 0; if (readonly) log_descriptor.open_flags= O_BINARY | O_RDONLY; else log_descriptor.open_flags= O_BINARY | O_RDWR; if (mysql_mutex_init(key_TRANSLOG_BUFFER_mutex, &log_descriptor.sent_to_disk_lock, MY_MUTEX_INIT_FAST) || mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_file_header_lock, &log_descriptor.file_header_lock, MY_MUTEX_INIT_FAST) || mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_unfinished_files_lock, &log_descriptor.unfinished_files_lock, MY_MUTEX_INIT_FAST) || mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_purger_lock, &log_descriptor.purger_lock, MY_MUTEX_INIT_FAST) || mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_log_flush_lock, &log_descriptor.log_flush_lock, MY_MUTEX_INIT_FAST) || mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_dirty_buffer_mask_lock, &log_descriptor.dirty_buffer_mask_lock, MY_MUTEX_INIT_FAST) || mysql_cond_init(key_TRANSLOG_DESCRIPTOR_log_flush_cond, &log_descriptor.log_flush_cond, 0) || mysql_cond_init(key_TRANSLOG_DESCRIPTOR_new_goal_cond, &log_descriptor.new_goal_cond, 0) || 
mysql_rwlock_init(key_TRANSLOG_DESCRIPTOR_open_files_lock, &log_descriptor.open_files_lock) || my_init_dynamic_array(PSI_INSTRUMENT_ME, &log_descriptor.open_files, sizeof(TRANSLOG_FILE*), 10, 10, MYF(0)) || my_init_dynamic_array(PSI_INSTRUMENT_ME, &log_descriptor.unfinished_files, sizeof(struct st_file_counter), 10, 10, MYF(0))) goto err; log_descriptor.min_need_file= 0; log_descriptor.min_file_number= 0; log_descriptor.last_lsn_checked= LSN_IMPOSSIBLE; /* Directory to store files */ unpack_dirname(log_descriptor.directory, directory); #ifndef _WIN32 if ((log_descriptor.directory_fd= my_open(log_descriptor.directory, O_RDONLY, MYF(MY_WME))) < 0) { my_errno= errno; DBUG_PRINT("error", ("Error %d during opening directory '%s'", errno, log_descriptor.directory)); goto err; } #endif log_descriptor.in_buffers_only= LSN_IMPOSSIBLE; DBUG_ASSERT(log_file_max_size % TRANSLOG_PAGE_SIZE == 0 && log_file_max_size >= TRANSLOG_MIN_FILE_SIZE); /* max size of one log size (for new logs creation) */ log_file_size= log_descriptor.log_file_max_size= log_file_max_size; /* server version */ log_descriptor.server_version= server_version; /* server ID */ log_descriptor.server_id= server_id; /* Page cache for the log reads */ log_descriptor.pagecache= pagecache; /* Flags */ DBUG_ASSERT((flags & ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | TRANSLOG_RECORD_CRC)) == 0); log_descriptor.flags= flags; translog_fill_overhead_table(); log_descriptor.page_overhead= page_overhead[flags]; log_descriptor.page_capacity_chunk_2= TRANSLOG_PAGE_SIZE - log_descriptor.page_overhead - 1; compile_time_assert(TRANSLOG_WRITE_BUFFER % TRANSLOG_PAGE_SIZE == 0); log_descriptor.buffer_capacity_chunk_2= (TRANSLOG_WRITE_BUFFER / TRANSLOG_PAGE_SIZE) * log_descriptor.page_capacity_chunk_2; log_descriptor.half_buffer_capacity_chunk_2= log_descriptor.buffer_capacity_chunk_2 / 2; DBUG_PRINT("info", ("Overhead: %u pc2: %u bc2: %u, bc2/2: %u", log_descriptor.page_overhead, log_descriptor.page_capacity_chunk_2, 
log_descriptor.buffer_capacity_chunk_2, log_descriptor.half_buffer_capacity_chunk_2)); /* Just to init it somehow (hack for bootstrap)*/ { TRANSLOG_FILE *file= 0; log_descriptor.min_file = log_descriptor.max_file= 1; insert_dynamic(&log_descriptor.open_files, (uchar *)&file); translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); pop_dynamic(&log_descriptor.open_files); } /* Buffers for log writing */ for (i= 0; i < TRANSLOG_BUFFERS_NO; i++) { if (translog_buffer_init(log_descriptor.buffers + i, i)) goto err; DBUG_PRINT("info", ("translog_buffer buffer #%u:%p", i, log_descriptor.buffers + i)); } /* last_logno and last_checkpoint_lsn were set in ma_control_file_create_or_open() */ logs_found= (last_logno != FILENO_IMPOSSIBLE); translog_status= (readonly ? TRANSLOG_READONLY : TRANSLOG_OK); checkpoint_lsn= last_checkpoint_lsn; if (logs_found) { my_bool pageok; DBUG_PRINT("info", ("log found...")); /* TODO: scan directory for aria_log.XXXXXXXX files and find highest XXXXXXXX & set logs_found TODO: check that last checkpoint within present log addresses space find the log end */ if (LSN_FILE_NO(last_checkpoint_lsn) == FILENO_IMPOSSIBLE) { DBUG_ASSERT(LSN_OFFSET(last_checkpoint_lsn) == 0); /* only last log needs to be checked */ sure_page= MAKE_LSN(last_logno, TRANSLOG_PAGE_SIZE); } else { sure_page= last_checkpoint_lsn; DBUG_ASSERT(LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE != 0); sure_page-= LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE; } /* Set horizon to the beginning of the last file first */ log_descriptor.horizon= last_page= MAKE_LSN(last_logno, 0); if (translog_get_last_page_addr(&last_page, &pageok, no_errors)) { if (!translog_walk_filenames(log_descriptor.directory, &translog_callback_search_first)) { /* Files was deleted, just start from the next log number, so that existing tables are in the past. 
*/ start_file_num= last_logno + 1; checkpoint_lsn= LSN_IMPOSSIBLE; /* no log so no checkpoint */ logs_found= 0; } else goto err; } else if (LSN_OFFSET(last_page) == 0) { if (LSN_FILE_NO(last_page) == 1) { logs_found= 0; /* file #1 has no pages */ DBUG_PRINT("info", ("log found. But is is empty => no log assumed")); } else { last_page-= LSN_ONE_FILE; if (translog_get_last_page_addr(&last_page, &pageok, 0)) goto err; } } if (logs_found) { uint32 i; log_descriptor.min_file= translog_first_file(log_descriptor.horizon, 1); log_descriptor.max_file= last_logno; /* Open all files */ if (allocate_dynamic(&log_descriptor.open_files, log_descriptor.max_file - log_descriptor.min_file + 1)) goto err; for (i = log_descriptor.max_file; i >= log_descriptor.min_file; i--) { /* We can't allocate all file together because they will be freed one by one */ TRANSLOG_FILE *file= (TRANSLOG_FILE *)my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_FILE), MYF(0)); compile_time_assert(MY_FILEPOS_ERROR > 0xffffffffULL); if (file == NULL || (file->handler.file= open_logfile_by_number_no_cache(i)) < 0 || mysql_file_seek(file->handler.file, 0, SEEK_END, MYF(0)) >= 0xffffffffULL) { int j; for (j= i - log_descriptor.min_file - 1; j > 0; j--) { TRANSLOG_FILE *el= *dynamic_element(&log_descriptor.open_files, j, TRANSLOG_FILE **); mysql_file_close(el->handler.file, MYF(MY_WME)); my_free(el); } if (file) { free(file); goto err; } else goto err; } translog_file_init(file, i, 1); /* we allocated space so it can't fail */ insert_dynamic(&log_descriptor.open_files, (uchar *)&file); } DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == log_descriptor.open_files.elements); } } else if (readonly) { /* There is no logs and there is read-only mode => nothing to read */ DBUG_PRINT("error", ("No logs and read-only mode")); goto err; } if (logs_found) { TRANSLOG_ADDRESS current_page= sure_page; my_bool pageok; DBUG_PRINT("info", ("The log is really present")); if (sure_page > last_page) { 
my_printf_error(HA_ERR_GENERIC, "Aria engine: log data error\n" "last_log_page: " LSN_FMT " is less than\n" "checkpoint page: " LSN_FMT, MYF(0), LSN_IN_PARTS(last_page), LSN_IN_PARTS(sure_page)); goto err; } /* TODO: check page size */ last_valid_page= LSN_IMPOSSIBLE; /* Scans and validate pages. We need it to show "outside" only for sure valid part of the log. If the log was damaged then fixed we have to cut off damaged part before some other process start write something in the log. */ do { TRANSLOG_ADDRESS current_file_last_page; current_file_last_page= current_page; if (translog_get_last_page_addr(¤t_file_last_page, &pageok, 0)) goto err; if (!pageok) { DBUG_PRINT("error", ("File %lu have no complete last page", (ulong) LSN_FILE_NO(current_file_last_page))); old_log_was_recovered= 1; /* This file is not written till the end so it should be last */ last_page= current_file_last_page; /* TODO: issue warning */ } do { TRANSLOG_VALIDATOR_DATA data; TRANSLOG_PAGE_SIZE_BUFF psize_buff; uchar *page; data.addr= ¤t_page; if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL) goto err; if (data.was_recovered) { DBUG_PRINT("error", ("file no: %lu (%d) " "rec_offset: 0x%lx (%lu) (%d)", (ulong) LSN_FILE_NO(current_page), (uint3korr(page + 3) != LSN_FILE_NO(current_page)), (ulong) LSN_OFFSET(current_page), (ulong) (LSN_OFFSET(current_page) / TRANSLOG_PAGE_SIZE), (uint3korr(page) != LSN_OFFSET(current_page) / TRANSLOG_PAGE_SIZE))); old_log_was_recovered= 1; break; } old_flags= page[TRANSLOG_PAGE_FLAGS]; last_valid_page= current_page; current_page+= TRANSLOG_PAGE_SIZE; /* increase offset */ } while (current_page <= current_file_last_page); current_page+= LSN_ONE_FILE; current_page= LSN_REPLACE_OFFSET(current_page, TRANSLOG_PAGE_SIZE); } while (LSN_FILE_NO(current_page) <= LSN_FILE_NO(last_page) && !old_log_was_recovered); if (last_valid_page == LSN_IMPOSSIBLE) { /* Panic!!! 
Even page which should be valid is invalid */ /* TODO: issue error */ goto err; } DBUG_PRINT("info", ("Last valid page is in file: %lu " "offset: %lu (0x%lx) " "Logs found: %d was recovered: %d " "flags match: %d", (ulong) LSN_FILE_NO(last_valid_page), (ulong) LSN_OFFSET(last_valid_page), (ulong) LSN_OFFSET(last_valid_page), logs_found, old_log_was_recovered, (old_flags == flags))); /* TODO: check server ID */ if (logs_found && !old_log_was_recovered && old_flags == flags) { TRANSLOG_VALIDATOR_DATA data; TRANSLOG_PAGE_SIZE_BUFF psize_buff; uchar *page; uint16 chunk_offset; data.addr= &last_valid_page; /* continue old log */ DBUG_ASSERT(LSN_FILE_NO(last_valid_page)== LSN_FILE_NO(log_descriptor.horizon)); if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL || (chunk_offset= translog_get_first_chunk_offset(page)) == 0) goto err; /* Puts filled part of old page in the buffer */ log_descriptor.horizon= last_valid_page; translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); /* Free space if filled with TRANSLOG_FILLER and first uchar of real chunk can't be TRANSLOG_FILLER */ while (chunk_offset < TRANSLOG_PAGE_SIZE && page[chunk_offset] != TRANSLOG_FILLER) { uint16 chunk_length; if ((chunk_length= translog_get_total_chunk_length(page, chunk_offset)) == 0) goto err; DBUG_PRINT("info", ("chunk: offset: %u length: %u", (uint) chunk_offset, (uint) chunk_length)); chunk_offset+= chunk_length; /* chunk can't cross the page border */ DBUG_ASSERT(chunk_offset <= TRANSLOG_PAGE_SIZE); } memcpy(log_descriptor.buffers->buffer, page, chunk_offset); log_descriptor.bc.buffer->size+= chunk_offset; log_descriptor.bc.ptr+= chunk_offset; log_descriptor.bc.current_page_fill= chunk_offset; log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, (chunk_offset + LSN_OFFSET(last_valid_page))); DBUG_PRINT("info", ("Move Page #%u: %p chaser: %d Size: %lu (%lu)", (uint) log_descriptor.bc.buffer_no, log_descriptor.bc.buffer, log_descriptor.bc.chaser, 
(ulong) log_descriptor.bc.buffer->size, (ulong) (log_descriptor.bc.ptr - log_descriptor.bc. buffer->buffer))); translog_check_cursor(&log_descriptor.bc); } if (!old_log_was_recovered && old_flags == flags) { LOGHANDLER_FILE_INFO info; /* Accessing &log_descriptor.open_files without mutex is safe because it is initialization */ if (translog_read_file_header(&info, (*dynamic_element(&log_descriptor. open_files, 0, TRANSLOG_FILE **))-> handler.file)) goto err; version_changed= (info.maria_version != TRANSLOG_VERSION_ID); } } DBUG_PRINT("info", ("Logs found: %d was recovered: %d", logs_found, old_log_was_recovered)); if (!logs_found) { TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_FILE), MYF(MY_WME)); DBUG_PRINT("info", ("The log is not found => we will create new log")); if (file == NULL) goto err; /* Start new log system from scratch */ log_descriptor.horizon= MAKE_LSN(start_file_num, TRANSLOG_PAGE_SIZE); /* header page */ translog_file_init(file, start_file_num, 0); if (insert_dynamic(&log_descriptor.open_files, (uchar*)&file)) { my_free(file); goto err; } if ((file->handler.file= create_logfile_by_number_no_cache(start_file_num)) == -1) goto err; log_descriptor.min_file= log_descriptor.max_file= start_file_num; if (translog_write_file_header()) goto err; DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == log_descriptor.open_files.elements); if (ma_control_file_write_and_force(checkpoint_lsn, start_file_num, max_trid_in_control_file, recovery_failures)) goto err; /* assign buffer 0 */ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc); } else if ((old_log_was_recovered || old_flags != flags || version_changed) && !readonly) { /* leave the damaged file untouched */ log_descriptor.horizon+= LSN_ONE_FILE; /* header page */ log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, TRANSLOG_PAGE_SIZE); if 
(translog_create_new_file()) goto err; /* Buffer system left untouched after recovery => we should init it (starting from buffer 0) */ translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc); } /* all LSNs that are on disk are flushed */ log_descriptor.log_start= log_descriptor.sent_to_disk= log_descriptor.flushed= log_descriptor.horizon; log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset; log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */ /* Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially) address of the next LSN and we want indicate that all LSNs that are already on the disk are flushed so we need decrease horizon on 1 (we are sure that there is no LSN on the disk which is greater then 'flushed' and there will not be LSN created that is equal or less then the value of the 'flushed'). */ log_descriptor.flushed--; /* offset decreased */ log_descriptor.sent_to_disk--; /* offset decreased */ /* Log records will refer to a MARIA_SHARE by a unique 2-byte id; set up structures for generating 2-byte ids: */ id_to_share= (MARIA_SHARE **) my_malloc(PSI_INSTRUMENT_ME, SHARE_ID_MAX * sizeof(MARIA_SHARE*), MYF(MY_WME | MY_ZEROFILL)); if (unlikely(!id_to_share)) goto err; id_to_share--; /* min id is 1 */ /* Check the last LSN record integrity */ if (logs_found) { TRANSLOG_SCANNER_DATA scanner; TRANSLOG_ADDRESS page_addr; LSN last_lsn= LSN_IMPOSSIBLE; /* take very last page address and try to find LSN record on it if it fail take address of previous page and so on */ page_addr= (log_descriptor.horizon - ((log_descriptor.horizon - 1) % TRANSLOG_PAGE_SIZE + 1)); if (translog_scanner_init(page_addr, 1, &scanner, 1)) goto err; scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]]; for (;;) { uint chunk_1byte; chunk_1byte= scanner.page[scanner.page_offset]; while (!translog_is_LSN_chunk(chunk_1byte) && scanner.page != END_OF_LOG && 
scanner.page[scanner.page_offset] != TRANSLOG_FILLER && scanner.page_addr == page_addr) { if (translog_get_next_chunk(&scanner)) { translog_destroy_scanner(&scanner); goto err; } if (scanner.page != END_OF_LOG) chunk_1byte= scanner.page[scanner.page_offset]; } if (translog_is_LSN_chunk(chunk_1byte)) { last_lsn= scanner.page_addr + scanner.page_offset; if (translog_get_next_chunk(&scanner)) { translog_destroy_scanner(&scanner); goto err; } if (scanner.page == END_OF_LOG) break; /* it was the last record */ chunk_1byte= scanner.page[scanner.page_offset]; continue; /* try to find other record on this page */ } if (last_lsn != LSN_IMPOSSIBLE) break; /* there is no more records on the page */ /* We have to make step back */ if (unlikely(LSN_OFFSET(page_addr) == TRANSLOG_PAGE_SIZE)) { uint32 file_no= LSN_FILE_NO(page_addr); my_bool last_page_ok; /* it is beginning of the current file */ if (unlikely(file_no == 1)) { /* It is beginning of the log => there is no LSNs in the log => There is no harm in leaving it "as-is". */ log_descriptor.previous_flush_horizon= log_descriptor.horizon; DBUG_PRINT("info", ("previous_flush_horizon: " LSN_FMT, LSN_IN_PARTS(log_descriptor. 
previous_flush_horizon))); DBUG_RETURN(0); } file_no--; page_addr= MAKE_LSN(file_no, TRANSLOG_PAGE_SIZE); translog_get_last_page_addr(&page_addr, &last_page_ok, 0); /* page should be OK as it is not the last file */ DBUG_ASSERT(last_page_ok); } else { page_addr-= TRANSLOG_PAGE_SIZE; } translog_destroy_scanner(&scanner); if (translog_scanner_init(page_addr, 1, &scanner, 1)) goto err; scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]]; } translog_destroy_scanner(&scanner); /* Now scanner points to the last LSN chunk, lets check it */ { TRANSLOG_HEADER_BUFFER rec; translog_size_t rec_len; int len; uchar buffer[1]; DBUG_PRINT("info", ("going to check the last found record " LSN_FMT, LSN_IN_PARTS(last_lsn))); len= translog_read_record_header(last_lsn, &rec); if (unlikely (len == RECHEADER_READ_ERROR || len == RECHEADER_READ_EOF)) { DBUG_PRINT("error", ("unexpected end of log or record during " "reading record header: " LSN_FMT " len: %d", LSN_IN_PARTS(last_lsn), len)); if (readonly) log_descriptor.log_start= log_descriptor.horizon= last_lsn; else if (translog_truncate_log(last_lsn)) { translog_free_record_header(&rec); goto err; } } else { DBUG_ASSERT(last_lsn == rec.lsn); if (likely(rec.record_length != 0)) { /* Reading the last byte of record will trigger scanning all record chunks for now */ rec_len= translog_read_record(rec.lsn, rec.record_length - 1, 1, buffer, NULL); if (rec_len != 1) { DBUG_PRINT("error", ("unexpected end of log or record during " "reading record body: " LSN_FMT " len: %d", LSN_IN_PARTS(rec.lsn), len)); if (readonly) log_descriptor.log_start= log_descriptor.horizon= last_lsn; else if (translog_truncate_log(last_lsn)) { translog_free_record_header(&rec); goto err; } } } } translog_free_record_header(&rec); } } log_descriptor.previous_flush_horizon= log_descriptor.horizon; DBUG_PRINT("info", ("previous_flush_horizon: " LSN_FMT, LSN_IN_PARTS(log_descriptor.previous_flush_horizon))); DBUG_RETURN(0); err: ma_message_no_user(0, "log 
initialization failed"); DBUG_RETURN(1); } /* @brief Free transaction log file buffer. @param buffer_no The buffer to free */ static void translog_buffer_destroy(struct st_translog_buffer *buffer) { DBUG_ENTER("translog_buffer_destroy"); DBUG_PRINT("enter", ("Buffer #%u: %p file: %d offset: " LSN_FMT " size: %lu", (uint) buffer->buffer_no, buffer, (buffer->file ? buffer->file->handler.file : -1), LSN_IN_PARTS(buffer->offset), (ulong) buffer->size)); if (buffer->file != NULL) { /* We ignore errors here, because we can't do something about it (it is shutting down) We also have to take the locks even if there can't be any other threads running, because translog_buffer_flush() requires that we have the buffer locked. */ translog_buffer_lock(buffer); translog_buffer_flush(buffer); translog_buffer_unlock(buffer); } DBUG_PRINT("info", ("Destroy mutex: %p", &buffer->mutex)); mysql_mutex_destroy(&buffer->mutex); mysql_cond_destroy(&buffer->waiting_filling_buffer); DBUG_VOID_RETURN; } /* Free log handler resources SYNOPSIS translog_destroy() */ void translog_destroy() { TRANSLOG_FILE **file; uint i; uint8 current_buffer; DBUG_ENTER("translog_destroy"); DBUG_ASSERT(translog_status == TRANSLOG_OK || translog_status == TRANSLOG_READONLY); translog_lock(); current_buffer= log_descriptor.bc.buffer_no; translog_status= (translog_status == TRANSLOG_READONLY ? 
TRANSLOG_UNINITED : TRANSLOG_SHUTDOWN); if (log_descriptor.bc.buffer->file != NULL) translog_finish_page(&log_descriptor.horizon, &log_descriptor.bc); translog_unlock(); for (i= 0; i < TRANSLOG_BUFFERS_NO; i++) { struct st_translog_buffer *buffer= (log_descriptor.buffers + ((i + current_buffer + 1) % TRANSLOG_BUFFERS_NO)); translog_buffer_destroy(buffer); } translog_status= TRANSLOG_UNINITED; /* close files */ while ((file= (TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files))) translog_close_log_file(*file); mysql_mutex_destroy(&log_descriptor.sent_to_disk_lock); mysql_mutex_destroy(&log_descriptor.file_header_lock); mysql_mutex_destroy(&log_descriptor.unfinished_files_lock); mysql_mutex_destroy(&log_descriptor.purger_lock); mysql_mutex_destroy(&log_descriptor.log_flush_lock); mysql_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock); mysql_cond_destroy(&log_descriptor.log_flush_cond); mysql_cond_destroy(&log_descriptor.new_goal_cond); mysql_rwlock_destroy(&log_descriptor.open_files_lock); delete_dynamic(&log_descriptor.open_files); delete_dynamic(&log_descriptor.unfinished_files); if (log_descriptor.directory_fd >= 0) mysql_file_close(log_descriptor.directory_fd, MYF(MY_WME)); if (id_to_share != NULL) my_free(id_to_share + 1); DBUG_VOID_RETURN; } /* @brief Starts new page. @param horizon \ Position in file and buffer where we are @param cursor / @param prev_buffer Buffer which should be flushed will be assigned here. This is always set (to NULL if nothing to flush). @note We do not want to flush the buffer immediately because we want to let caller of this function first advance 'horizon' pointer and unlock the loghandler and only then flush the log which can take some time. 
@retval 0 OK @retval 1 Error */ static my_bool translog_page_next(TRANSLOG_ADDRESS *horizon, struct st_buffer_cursor *cursor, struct st_translog_buffer **prev_buffer) { struct st_translog_buffer *buffer= cursor->buffer; DBUG_ENTER("translog_page_next"); *prev_buffer= NULL; if ((cursor->ptr + TRANSLOG_PAGE_SIZE > cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER) || (LSN_OFFSET(*horizon) > log_descriptor.log_file_max_size - TRANSLOG_PAGE_SIZE)) { DBUG_PRINT("info", ("Switch to next buffer Buffer Size: %lu (%lu) => %d " "File size: %lu max: %lu => %d", (ulong) cursor->buffer->size, (ulong) (cursor->ptr - cursor->buffer->buffer), (cursor->ptr + TRANSLOG_PAGE_SIZE > cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER), (ulong) LSN_OFFSET(*horizon), (ulong) log_descriptor.log_file_max_size, (LSN_OFFSET(*horizon) > (log_descriptor.log_file_max_size - TRANSLOG_PAGE_SIZE)))); if (translog_buffer_next(horizon, cursor, LSN_OFFSET(*horizon) > (log_descriptor.log_file_max_size - TRANSLOG_PAGE_SIZE))) DBUG_RETURN(1); *prev_buffer= buffer; DBUG_PRINT("info", ("Buffer #%u (%p): have to be flushed", (uint) buffer->buffer_no, buffer)); } else { DBUG_PRINT("info", ("Use the same buffer #%u (%p): " "Buffer Size: %lu (%lu)", (uint) buffer->buffer_no, buffer, (ulong) cursor->buffer->size, (ulong) (cursor->ptr - cursor->buffer->buffer))); translog_finish_page(horizon, cursor); translog_new_page_header(horizon, cursor); } DBUG_RETURN(0); } /* Write data of given length to the current page SYNOPSIS translog_write_data_on_page() horizon \ Pointers on file and buffer cursor / length IN length of the chunk buffer buffer with data RETURN 0 OK 1 Error */ static my_bool translog_write_data_on_page(TRANSLOG_ADDRESS *horizon, struct st_buffer_cursor *cursor, translog_size_t length, uchar *buffer) { DBUG_ENTER("translog_write_data_on_page"); DBUG_PRINT("enter", ("Chunk length: %lu Page size %u", (ulong) length, (uint) cursor->current_page_fill)); DBUG_ASSERT(length > 0); DBUG_ASSERT(length + 
cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER); memcpy(cursor->ptr, buffer, length); cursor->ptr+= length; (*horizon)+= length; /* adds offset */ cursor->current_page_fill+= length; if (!cursor->chaser) cursor->buffer->size+= length; DBUG_PRINT("info", ("Write data buffer #%u: %p " "chaser: %d Size: %lu (%lu)", (uint) cursor->buffer->buffer_no, cursor->buffer, cursor->chaser, (ulong) cursor->buffer->size, (ulong) (cursor->ptr - cursor->buffer->buffer))); translog_check_cursor(cursor); DBUG_RETURN(0); } /* Write data from parts of given length to the current page SYNOPSIS translog_write_parts_on_page() horizon \ Pointers on file and buffer cursor / length IN length of the chunk parts IN/OUT chunk source RETURN 0 OK 1 Error */ static my_bool translog_write_parts_on_page(TRANSLOG_ADDRESS *horizon, struct st_buffer_cursor *cursor, translog_size_t length, struct st_translog_parts *parts) { translog_size_t left= length; uint cur= (uint) parts->current; DBUG_ENTER("translog_write_parts_on_page"); DBUG_PRINT("enter", ("Chunk length: %lu parts: %u of %u. 
Page size: %u " "Buffer size: %lu (%lu)", (ulong) length, (uint) (cur + 1), (uint) parts->elements, (uint) cursor->current_page_fill, (ulong) cursor->buffer->size, (ulong) (cursor->ptr - cursor->buffer->buffer))); DBUG_ASSERT(length > 0); DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER); do { translog_size_t len; LEX_CUSTRING *part; const uchar *buff; DBUG_ASSERT(cur < parts->elements); part= parts->parts + cur; buff= part->str; DBUG_PRINT("info", ("Part: %u Length: %lu left: %lu buff: %p", (uint) (cur + 1), (ulong) part->length, (ulong) left, buff)); if (part->length > left) { /* we should write less then the current part */ len= left; part->length-= len; part->str+= len; DBUG_PRINT("info", ("Set new part: %u Length: %lu", (uint) (cur + 1), (ulong) part->length)); } else { len= (translog_size_t) part->length; cur++; DBUG_PRINT("info", ("moved to next part (len: %lu)", (ulong) len)); } DBUG_PRINT("info", ("copy: %p <- %p %u", cursor->ptr, buff, len)); if (likely(len)) { memcpy(cursor->ptr, buff, len); left-= len; cursor->ptr+= len; } } while (left); DBUG_PRINT("info", ("Horizon: " LSN_FMT " Length %u(0x%x)", LSN_IN_PARTS(*horizon), length, length)); parts->current= cur; (*horizon)+= length; /* offset increasing */ cursor->current_page_fill+= length; if (!cursor->chaser) cursor->buffer->size+= length; /* We do not not updating parts->total_record_length here because it is need only before writing record to have total length */ DBUG_PRINT("info", ("Write parts buffer #%u: %p " "chaser: %d Size: %lu (%lu) " "Horizon: " LSN_FMT " buff offset: 0x%x", (uint) cursor->buffer->buffer_no, cursor->buffer, cursor->chaser, (ulong) cursor->buffer->size, (ulong) (cursor->ptr - cursor->buffer->buffer), LSN_IN_PARTS(*horizon), (uint) (LSN_OFFSET(cursor->buffer->offset) + cursor->buffer->size))); translog_check_cursor(cursor); DBUG_RETURN(0); } /* Put 1 group chunk type 0 header 
into parts array SYNOPSIS translog_write_variable_record_1group_header() parts Descriptor of record source parts type The log record type short_trid Short transaction ID or 0 if it has no sense header_length Calculated header length of chunk type 0 chunk0_header Buffer for the chunk header writing */ static void translog_write_variable_record_1group_header(struct st_translog_parts *parts, enum translog_record_type type, SHORT_TRANSACTION_ID short_trid, uint16 header_length, uchar *chunk0_header) { LEX_CUSTRING *part; DBUG_ASSERT(parts->current != 0); /* first part is left for header */ part= parts->parts + (--parts->current); parts->total_record_length+= (translog_size_t) (part->length= header_length); part->str= chunk0_header; /* puts chunk type */ *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN); int2store(chunk0_header + 1, short_trid); /* puts record length */ translog_write_variable_record_1group_code_len(chunk0_header + 3, parts->record_length, header_length); /* puts 0 as chunk length which indicate 1 group record */ int2store(chunk0_header + header_length - 2, 0); } /* Increase number of writers for this buffer SYNOPSIS translog_buffer_increase_writers() buffer target buffer */ static inline void translog_buffer_increase_writers(struct st_translog_buffer *buffer) { DBUG_ENTER("translog_buffer_increase_writers"); translog_buffer_lock_assert_owner(buffer); buffer->copy_to_buffer_in_progress++; DBUG_PRINT("info", ("copy_to_buffer_in_progress. Buffer #%u %p progress: %d", (uint) buffer->buffer_no, buffer, buffer->copy_to_buffer_in_progress)); DBUG_VOID_RETURN; } /* Decrease number of writers for this buffer SYNOPSIS translog_buffer_decrease_writers() buffer target buffer */ static void translog_buffer_decrease_writers(struct st_translog_buffer *buffer) { DBUG_ENTER("translog_buffer_decrease_writers"); translog_buffer_lock_assert_owner(buffer); buffer->copy_to_buffer_in_progress--; DBUG_PRINT("info", ("copy_to_buffer_in_progress. 
Buffer #%u %p progress: %d", (uint) buffer->buffer_no, buffer, buffer->copy_to_buffer_in_progress)); if (buffer->copy_to_buffer_in_progress == 0) mysql_cond_broadcast(&buffer->waiting_filling_buffer); DBUG_VOID_RETURN; } /** @brief Skip to the next page for chaser (thread which advanced horizon pointer and now feeling the buffer) @param horizon \ Pointers on file position and buffer @param cursor / @retval 1 OK @retval 0 Error */ static my_bool translog_chaser_page_next(TRANSLOG_ADDRESS *horizon, struct st_buffer_cursor *cursor) { struct st_translog_buffer *buffer_to_flush; my_bool rc; DBUG_ENTER("translog_chaser_page_next"); DBUG_ASSERT(cursor->chaser); rc= translog_page_next(horizon, cursor, &buffer_to_flush); if (buffer_to_flush != NULL) { translog_buffer_lock(buffer_to_flush); translog_buffer_decrease_writers(buffer_to_flush); used_buffs_register_unlock(&cursor->buffs, buffer_to_flush); if (!rc) rc= translog_buffer_flush(buffer_to_flush); translog_buffer_unlock(buffer_to_flush); } DBUG_RETURN(rc); } /* Put chunk 2 from new page beginning SYNOPSIS translog_write_variable_record_chunk2_page() parts Descriptor of record source parts horizon \ Pointers on file position and buffer cursor / RETURN 0 OK 1 Error */ static my_bool translog_write_variable_record_chunk2_page(struct st_translog_parts *parts, TRANSLOG_ADDRESS *horizon, struct st_buffer_cursor *cursor) { uchar chunk2_header[1]; DBUG_ENTER("translog_write_variable_record_chunk2_page"); chunk2_header[0]= TRANSLOG_CHUNK_NOHDR; if (translog_chaser_page_next(horizon, cursor)) DBUG_RETURN(1); /* Puts chunk type */ translog_write_data_on_page(horizon, cursor, 1, chunk2_header); /* Puts chunk body */ translog_write_parts_on_page(horizon, cursor, log_descriptor.page_capacity_chunk_2, parts); DBUG_RETURN(0); } /* Put chunk 3 of requested length in the buffer from new page beginning SYNOPSIS translog_write_variable_record_chunk3_page() parts Descriptor of record source parts length Length of this chunk horizon \ 
Pointers on file position and buffer cursor / RETURN 0 OK 1 Error */ static my_bool translog_write_variable_record_chunk3_page(struct st_translog_parts *parts, uint16 length, TRANSLOG_ADDRESS *horizon, struct st_buffer_cursor *cursor) { LEX_CUSTRING *part; uchar chunk3_header[1 + 2]; DBUG_ENTER("translog_write_variable_record_chunk3_page"); if (translog_chaser_page_next(horizon, cursor)) DBUG_RETURN(1); if (length == 0) { /* It was call to write page header only (no data for chunk 3) */ DBUG_PRINT("info", ("It is a call to make page header only")); DBUG_RETURN(0); } DBUG_ASSERT(parts->current != 0); /* first part is left for header */ part= parts->parts + (--parts->current); parts->total_record_length+= (translog_size_t) (part->length= 1 + 2); part->str= chunk3_header; /* Puts chunk type */ *chunk3_header= (uchar) (TRANSLOG_CHUNK_LNGTH); /* Puts chunk length */ int2store(chunk3_header + 1, length); translog_write_parts_on_page(horizon, cursor, length + 1 + 2, parts); DBUG_RETURN(0); } /* Move log pointer (horizon) on given number pages starting from next page, and given offset on the last page SYNOPSIS translog_advance_pointer() pages Number of full pages starting from the next one last_page_data Plus this data on the last page RETURN 0 OK 1 Error */ static my_bool translog_advance_pointer(int pages, uint16 last_page_data, TRUNSLOG_USED_BUFFERS *buffs) { translog_size_t last_page_offset= (log_descriptor.page_overhead + last_page_data); translog_size_t offset= (TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill + pages * TRANSLOG_PAGE_SIZE + last_page_offset); translog_size_t buffer_end_offset, file_end_offset, min_offset; DBUG_ENTER("translog_advance_pointer"); DBUG_PRINT("enter", ("Pointer: " LSN_FMT " + %u + %u pages + %u + %u", LSN_IN_PARTS(log_descriptor.horizon), (uint) (TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill), pages, (uint) log_descriptor.page_overhead, (uint) last_page_data)); translog_lock_assert_owner(); used_buffs_init(buffs); if 
(pages == -1) { /* It is special case when we advance the pointer on the same page. It can happened when we write last part of multi-group record. */ DBUG_ASSERT(last_page_data + log_descriptor.bc.current_page_fill <= TRANSLOG_PAGE_SIZE); offset= last_page_data; last_page_offset= log_descriptor.bc.current_page_fill + last_page_data; goto end; } DBUG_PRINT("info", ("last_page_offset %lu", (ulong) last_page_offset)); DBUG_ASSERT(last_page_offset <= TRANSLOG_PAGE_SIZE); /* The loop will be executed 1-3 times. Usually we advance the pointer to fill only the current buffer (if we have more then 1/2 of buffer free or 2 buffers (rest of current and all next). In case of really huge record end where we write last group with "table of content" of all groups and ignore buffer borders we can occupy 3 buffers. */ for (;;) { uint8 new_buffer_no; struct st_translog_buffer *new_buffer; struct st_translog_buffer *old_buffer; buffer_end_offset= TRANSLOG_WRITE_BUFFER - log_descriptor.bc.buffer->size; if (likely(log_descriptor.log_file_max_size >= LSN_OFFSET(log_descriptor.horizon))) file_end_offset= (log_descriptor.log_file_max_size - LSN_OFFSET(log_descriptor.horizon)); else { /* We already have written more then current file limit allow, So we will finish this page and start new file */ file_end_offset= (TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill); } DBUG_PRINT("info", ("offset: %u buffer_end_offs: %u, " "file_end_offs: %u", offset, buffer_end_offset, file_end_offset)); DBUG_PRINT("info", ("Buff #%u %u (%p) offset 0x%x + size 0x%x = " "0x%x (0x%x)", log_descriptor.bc.buffer->buffer_no, log_descriptor.bc.buffer_no, log_descriptor.bc.buffer, (uint) LSN_OFFSET(log_descriptor.bc.buffer->offset), log_descriptor.bc.buffer->size, (uint) (LSN_OFFSET(log_descriptor.bc.buffer->offset) + log_descriptor.bc.buffer->size), (uint) LSN_OFFSET(log_descriptor.horizon))); DBUG_ASSERT(LSN_OFFSET(log_descriptor.bc.buffer->offset) + log_descriptor.bc.buffer->size == 
LSN_OFFSET(log_descriptor.horizon)); if (offset <= buffer_end_offset && offset <= file_end_offset) break; old_buffer= log_descriptor.bc.buffer; new_buffer_no= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO; new_buffer= log_descriptor.buffers + new_buffer_no; translog_buffer_lock(new_buffer); #ifndef DBUG_OFF { TRANSLOG_ADDRESS offset= new_buffer->offset; TRANSLOG_FILE *file= new_buffer->file; uint8 ver= new_buffer->ver; translog_lock_assert_owner(); #endif translog_wait_for_buffer_free(new_buffer); #ifndef DBUG_OFF /* We keep the handler locked so nobody can start this new buffer */ DBUG_ASSERT((offset == new_buffer->offset && new_buffer->file == NULL && (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver) || translog_status == TRANSLOG_READONLY); } #endif min_offset= MY_MIN(buffer_end_offset, file_end_offset); /* TODO: check is it ptr or size enough */ log_descriptor.bc.buffer->size+= min_offset; log_descriptor.bc.ptr+= min_offset; DBUG_PRINT("info", ("NewP buffer #%u: %p chaser: %d Size: %lu (%lu)", (uint) log_descriptor.bc.buffer->buffer_no, log_descriptor.bc.buffer, log_descriptor.bc.chaser, (ulong) log_descriptor.bc.buffer->size, (ulong) (log_descriptor.bc.ptr -log_descriptor.bc. 
buffer->buffer))); DBUG_ASSERT((ulong) (log_descriptor.bc.ptr - log_descriptor.bc.buffer->buffer) == log_descriptor.bc.buffer->size); DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no == log_descriptor.bc.buffer_no); translog_buffer_increase_writers(log_descriptor.bc.buffer); // register for case of error used_buffs_add(buffs, log_descriptor.bc.buffer); if (file_end_offset <= buffer_end_offset) { log_descriptor.horizon+= LSN_ONE_FILE; log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, TRANSLOG_PAGE_SIZE); DBUG_PRINT("info", ("New file: %lu", (ulong) LSN_FILE_NO(log_descriptor.horizon))); if (translog_create_new_file()) { struct st_translog_buffer *ob= log_descriptor.bc.buffer; translog_buffer_unlock(ob); used_buffs_urgent_unlock(buffs); translog_buffer_lock(ob); DBUG_RETURN(1); } } else { DBUG_PRINT("info", ("The same file")); log_descriptor.horizon+= min_offset; /* offset increasing */ } translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no); old_buffer->next_buffer_offset= new_buffer->offset; new_buffer->prev_buffer_offset= old_buffer->offset; translog_buffer_unlock(old_buffer); offset-= min_offset; } DBUG_PRINT("info", ("drop write_counter")); log_descriptor.bc.write_counter= 0; log_descriptor.bc.previous_offset= 0; end: log_descriptor.bc.ptr+= offset; log_descriptor.bc.buffer->size+= offset; translog_buffer_increase_writers(log_descriptor.bc.buffer); used_buffs_add(buffs, log_descriptor.bc.buffer); log_descriptor.horizon+= offset; /* offset increasing */ log_descriptor.bc.current_page_fill= last_page_offset; DBUG_PRINT("info", ("NewP buffer #%u: %p chaser: %d Size: %lu (%lu) " "offset: %u last page: %u", (uint) log_descriptor.bc.buffer->buffer_no, log_descriptor.bc.buffer, log_descriptor.bc.chaser, (ulong) log_descriptor.bc.buffer->size, (ulong) (log_descriptor.bc.ptr - log_descriptor.bc.buffer-> buffer), (uint) offset, (uint) last_page_offset)); DBUG_PRINT("info", ("pointer moved to: " LSN_FMT, 
LSN_IN_PARTS(log_descriptor.horizon))); translog_check_cursor(&log_descriptor.bc); log_descriptor.bc.protected= 0; DBUG_RETURN(0); } static void used_buffs_add(TRUNSLOG_USED_BUFFERS *buffs, struct st_translog_buffer *buff) { DBUG_ENTER("used_buffs_add"); DBUG_PRINT("enter", ("ADD buffs: %p unlk %u (%p) wrt_ptr: %u (%p)" " buff %p (%u)", buffs, buffs->wrt_ptr, buffs->buff[buffs->wrt_ptr], buffs->unlck_ptr, buffs->buff[buffs->unlck_ptr], buff, buff->buffer_no)); DBUG_ASSERT(buffs->wrt_ptr < MAX_TRUNSLOG_USED_BUFFERS); buffs->buff[buffs->wrt_ptr++]= buff; DBUG_VOID_RETURN; } static void used_buffs_register_unlock(TRUNSLOG_USED_BUFFERS *buffs, struct st_translog_buffer *buff __attribute__((unused)) ) { DBUG_ENTER("used_buffs_register_unlock"); DBUG_PRINT("enter", ("SUB buffs: %p unlk %u (%p) wrt_ptr: %u (%p)" " buff %p (%u)", buffs, buffs->wrt_ptr, buffs->buff[buffs->wrt_ptr], buffs->unlck_ptr, buffs->buff[buffs->unlck_ptr], buff, buff->buffer_no)); DBUG_ASSERT(buffs->buff[buffs->unlck_ptr] == buff); buffs->unlck_ptr++; DBUG_VOID_RETURN; } static void used_buffs_urgent_unlock(TRUNSLOG_USED_BUFFERS *buffs) { uint i; DBUG_ENTER("used_buffs_urgent_unlock"); translog_lock(); translog_stop_writing(); translog_unlock(); for (i= buffs->unlck_ptr; i < buffs->wrt_ptr; i++) { struct st_translog_buffer *buf= buffs->buff[i]; translog_buffer_lock(buf); translog_buffer_decrease_writers(buf); translog_buffer_unlock(buf); buffs->buff[i]= NULL; } used_buffs_init(buffs); DBUG_VOID_RETURN; } /* Get page rest SYNOPSIS translog_get_current_page_rest() NOTE loghandler should be locked RETURN number of bytes left on the current page */ static uint translog_get_current_page_rest() { return (TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill); } /* Get buffer rest in full pages SYNOPSIS translog_get_current_buffer_rest() NOTE loghandler should be locked RETURN number of full pages left on the current buffer */ static uint translog_get_current_buffer_rest() { return 
(uint)((log_descriptor.bc.buffer->buffer + TRANSLOG_WRITE_BUFFER - log_descriptor.bc.ptr) / TRANSLOG_PAGE_SIZE); } /* Calculate possible group size without first (current) page SYNOPSIS translog_get_current_group_size() NOTE loghandler should be locked RETURN group size without first (current) page */ static translog_size_t translog_get_current_group_size() { /* buffer rest in full pages */ translog_size_t buffer_rest= translog_get_current_buffer_rest(); DBUG_ENTER("translog_get_current_group_size"); DBUG_PRINT("info", ("buffer_rest in pages: %u", buffer_rest)); buffer_rest*= log_descriptor.page_capacity_chunk_2; /* in case of only half of buffer free we can write this and next buffer */ if (buffer_rest < log_descriptor.half_buffer_capacity_chunk_2) { DBUG_PRINT("info", ("buffer_rest: %lu -> add %lu", (ulong) buffer_rest, (ulong) log_descriptor.buffer_capacity_chunk_2)); buffer_rest+= log_descriptor.buffer_capacity_chunk_2; } DBUG_PRINT("info", ("buffer_rest: %lu", (ulong) buffer_rest)); DBUG_RETURN(buffer_rest); } static inline void set_lsn(LSN *lsn, LSN value) { DBUG_ENTER("set_lsn"); translog_lock_assert_owner(); *lsn= value; /* we generate LSN so something is not flushed in log */ log_descriptor.is_everything_flushed= 0; DBUG_PRINT("info", ("new LSN appeared: " LSN_FMT, LSN_IN_PARTS(value))); DBUG_VOID_RETURN; } /** @brief Write variable record in 1 group. @param lsn LSN of the record will be written here @param type the log record type @param short_trid Short transaction ID or 0 if it has no sense @param parts Descriptor of record source parts @param buffer_to_flush Buffer which have to be flushed if it is not 0 @param header_length Calculated header length of chunk type 0 @param trn Transaction structure pointer for hooks by record log type, for short_id @param hook_arg Argument which will be passed to pre-write and in-write hooks of this record. 
@note We must have a translog_lock() when entering this function We must have buffer_to_flush locked (if not null) @return Operation status @retval 0 OK @retval 1 Error */ static my_bool translog_write_variable_record_1group(LSN *lsn, enum translog_record_type type, MARIA_HA *tbl_info, SHORT_TRANSACTION_ID short_trid, struct st_translog_parts *parts, struct st_translog_buffer *buffer_to_flush, uint16 header_length, TRN *trn, void *hook_arg) { TRANSLOG_ADDRESS horizon; struct st_buffer_cursor cursor; int rc= 0; uint i; translog_size_t record_rest, full_pages, first_page; uint additional_chunk3_page= 0; uchar chunk0_header[1 + 2 + 5 + 2]; DBUG_ENTER("translog_write_variable_record_1group"); translog_lock_assert_owner(); if (buffer_to_flush) translog_buffer_lock_assert_owner(buffer_to_flush); set_lsn(lsn, horizon= log_descriptor.horizon); if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn), *lsn, TRUE) || (log_record_type_descriptor[type].inwrite_hook && (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info, lsn, hook_arg))) { translog_unlock(); if (buffer_to_flush != NULL) { translog_buffer_flush(buffer_to_flush); translog_buffer_unlock(buffer_to_flush); } DBUG_RETURN(1); } cursor= log_descriptor.bc; cursor.chaser= 1; /* Advance pointer to be able unlock the loghandler */ first_page= translog_get_current_page_rest(); record_rest= parts->record_length - (first_page - header_length); full_pages= record_rest / log_descriptor.page_capacity_chunk_2; record_rest= (record_rest % log_descriptor.page_capacity_chunk_2); if (record_rest + 1 == log_descriptor.page_capacity_chunk_2) { DBUG_PRINT("info", ("2 chunks type 3 is needed")); /* We will write 2 chunks type 3 at the end of this group */ additional_chunk3_page= 1; record_rest= 1; } DBUG_PRINT("info", ("first_page: %u (%u) full_pages: %u (%lu) " "additional: %u (%u) rest %u = %u", first_page, first_page - header_length, full_pages, (ulong) full_pages * log_descriptor.page_capacity_chunk_2, 
additional_chunk3_page, additional_chunk3_page * (log_descriptor.page_capacity_chunk_2 - 1), record_rest, parts->record_length)); /* record_rest + 3 is chunk type 3 overhead + record_rest */ rc= translog_advance_pointer((int)(full_pages + additional_chunk3_page), (record_rest ? record_rest + 3 : 0), &cursor.buffs); log_descriptor.bc.buffer->last_lsn= *lsn; DBUG_PRINT("info", ("last_lsn set to " LSN_FMT " buffer: %p", LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn), log_descriptor.bc.buffer)); translog_unlock(); /* Check if we switched buffer and need process it (current buffer is unlocked already => we will not delay other threads */ if (buffer_to_flush != NULL) { if (!rc) rc= translog_buffer_flush(buffer_to_flush); translog_buffer_unlock(buffer_to_flush); } if (rc) { //translog_advance_pointer decreased writers so it is OK DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); DBUG_RETURN(1); } translog_write_variable_record_1group_header(parts, type, short_trid, header_length, chunk0_header); /* fill the pages */ translog_write_parts_on_page(&horizon, &cursor, first_page, parts); DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, LSN_IN_PARTS(log_descriptor.horizon), LSN_IN_PARTS(horizon))); for (i= 0; i < full_pages; i++) { if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) goto error; DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, LSN_IN_PARTS(log_descriptor.horizon), LSN_IN_PARTS(horizon))); } if (additional_chunk3_page) { if (translog_write_variable_record_chunk3_page(parts, log_descriptor. 
page_capacity_chunk_2 - 2, &horizon, &cursor)) goto error; DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, LSN_IN_PARTS(log_descriptor.horizon), LSN_IN_PARTS(horizon))); DBUG_ASSERT(cursor.current_page_fill == TRANSLOG_PAGE_SIZE); } if (translog_write_variable_record_chunk3_page(parts, record_rest, &horizon, &cursor)) goto error; DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, (uint) LSN_FILE_NO(log_descriptor.horizon), (uint) LSN_OFFSET(log_descriptor.horizon), (uint) LSN_FILE_NO(horizon), (uint) LSN_OFFSET(horizon))); translog_buffer_lock(cursor.buffer); translog_buffer_decrease_writers(cursor.buffer); used_buffs_register_unlock(&cursor.buffs, cursor.buffer); translog_buffer_unlock(cursor.buffer); DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); DBUG_RETURN(0); error: used_buffs_urgent_unlock(&cursor.buffs); DBUG_RETURN(1); } /** @brief Write variable record in 1 chunk. @param lsn LSN of the record will be written here @param type the log record type @param short_trid Short transaction ID or 0 if it has no sense @param parts Descriptor of record source parts @param buffer_to_flush Buffer which have to be flushed if it is not 0 @param header_length Calculated header length of chunk type 0 @param trn Transaction structure pointer for hooks by record log type, for short_id @param hook_arg Argument which will be passed to pre-write and in-write hooks of this record. 
@note
     We must have a translog_lock() when entering this function
     We must have buffer_to_flush locked (if not null)
     Both locks are released before the function returns.

   @note
     The tbl_info argument (not listed above) is only passed on to the
     in-write hook of the record type.

   @return Operation status
     @retval 0      OK
     @retval 1      Error
*/

static my_bool
translog_write_variable_record_1chunk(LSN *lsn,
                                      enum translog_record_type type,
                                      MARIA_HA *tbl_info,
                                      SHORT_TRANSACTION_ID short_trid,
                                      struct st_translog_parts *parts,
                                      struct st_translog_buffer
                                      *buffer_to_flush, uint16 header_length,
                                      TRN *trn, void *hook_arg)
{
  int rc;
  uchar chunk0_header[1 + 2 + 5 + 2];
  DBUG_ENTER("translog_write_variable_record_1chunk");
  translog_lock_assert_owner();
  if (buffer_to_flush)
    translog_buffer_lock_assert_owner(buffer_to_flush);

  translog_write_variable_record_1group_header(parts, type, short_trid,
                                               header_length, chunk0_header);
  /* The record gets the current horizon as its LSN */
  set_lsn(lsn, log_descriptor.horizon);
  if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
                                 *lsn, TRUE) ||
      (log_record_type_descriptor[type].inwrite_hook &&
       (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
                                                        lsn, hook_arg)))
  {
    /* Nothing was written; rc != 0 makes the code below skip the flush */
    translog_unlock();
    rc= 1;
    goto err;
  }

  /* The whole record goes to the shared cursor while the log is locked */
  rc= translog_write_parts_on_page(&log_descriptor.horizon,
                                   &log_descriptor.bc,
                                   parts->total_record_length, parts);
  log_descriptor.bc.buffer->last_lsn= *lsn;
  DBUG_PRINT("info", ("last_lsn set to " LSN_FMT " buffer: %p",
                      LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn),
                      log_descriptor.bc.buffer));
  translog_unlock();

  /*
     check if we switched buffer and need process it (current buffer is
     unlocked already => we will not delay other threads
  */
err:
  if (buffer_to_flush != NULL)
  {
    /* Flush only if no error so far; the buffer is unlocked in any case */
    if (!rc)
      rc= translog_buffer_flush(buffer_to_flush);
    translog_buffer_unlock(buffer_to_flush);
  }

  DBUG_RETURN(rc);
}


/*
  @brief Calculates and write LSN difference (compressed LSN).

  @param base_lsn        LSN from which we calculate difference
  @param lsn             LSN for codding
  @param dst             Result will be written to dst[-pack_length] ..
dst[-1] @note To store an LSN in a compact way we will use the following compression: If a log record has LSN1, and it contains the LSN2 as a back reference, Instead of LSN2 we write LSN1-LSN2, encoded as: two bits the number N (see below) 14 bits N bytes That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2 is stored in the first two bits. @note function made to write the result in backward direction with no special sense or tricks both directions are equal in complicity @retval # pointer on coded LSN */ static uchar *translog_put_LSN_diff(LSN base_lsn, LSN lsn, uchar *dst) { uint64 diff; DBUG_ENTER("translog_put_LSN_diff"); DBUG_PRINT("enter", ("Base: " LSN_FMT " val: " LSN_FMT " dst: %p", LSN_IN_PARTS(base_lsn), LSN_IN_PARTS(lsn), dst)); DBUG_ASSERT(base_lsn > lsn); diff= base_lsn - lsn; DBUG_PRINT("info", ("Diff: 0x%llx", (ulonglong) diff)); if (diff <= 0x3FFF) { dst-= 2; /* Note we store this high uchar first to ensure that first uchar has 0 in the 3 upper bits. */ dst[0]= (uchar)(diff >> 8); dst[1]= (uchar)(diff & 0xFF); } else if (diff <= 0x3FFFFFL) { dst-= 3; dst[0]= (uchar)(0x40 | (diff >> 16)); int2store(dst + 1, diff & 0xFFFF); } else if (diff <= 0x3FFFFFFFL) { dst-= 4; dst[0]= (uchar)(0x80 | (diff >> 24)); int3store(dst + 1, diff & 0xFFFFFFL); } else if (diff <= 0x3FFFFFFFFFLL) { dst-= 5; dst[0]= (uchar)(0xC0 | (diff >> 32)); int4store(dst + 1, diff & 0xFFFFFFFFL); } else { /* It is full LSN after special 1 diff (which is impossible in real life) */ dst-= 2 + LSN_STORE_SIZE; dst[0]= 0; dst[1]= 1; lsn_store(dst + 2, lsn); } DBUG_PRINT("info", ("new dst: %p", dst)); DBUG_RETURN(dst); } /* Get LSN from LSN-difference (compressed LSN) SYNOPSIS translog_get_LSN_from_diff() base_lsn LSN from which we calculate difference src pointer to coded lsn dst pointer to buffer where to write 7byte LSN NOTE: To store an LSN in a compact way we will use the following compression: If a log record has LSN1, and it contains the lSN2 as a back reference, 
Instead of LSN2 we write LSN1-LSN2, encoded as:

      two bits     the number N (see below)
      14 bits
      N bytes

    That is, the difference takes 2..5 bytes and the two most significant
    bits of the first byte hold the number of bytes minus 2.

    A first byte of 0 followed by a byte of 1 is a marker: the full LSN
    (LSN_STORE_SIZE bytes) follows and is copied to dst as it is.

  RETURN
    pointer into src just after the bytes of the coded LSN
*/

static uchar *translog_get_LSN_from_diff(LSN base_lsn, uchar *src, uchar *dst)
{
  LSN decoded;
  uint32 low_bytes;
  uint32 head;
  uint32 file_no, rec_offset;
  uint8 len_code;
  DBUG_ENTER("translog_get_LSN_from_diff");
  DBUG_PRINT("enter", ("Base: " LSN_FMT " src:%p dst %p",
                       LSN_IN_PARTS(base_lsn), src, dst));

  head= src[0];
  len_code= head >> 6;               /* two top bits: number of bytes - 2 */
  head&= 0x3F;                       /* highest bits of the difference */
  src++;                             /* first byte is consumed */
  file_no= LSN_FILE_NO(base_lsn);    /* same file unless 5 byte form */
  DBUG_PRINT("info", ("code: %u first byte: %lu",
                      (uint) len_code, (ulong) head));

  if (len_code == 0 && head == 0 && src[0] == 1)
  {
    /*
      It is full LSN after special 1 diff (which is impossible
      in real life)
    */
    memcpy(dst, src + 1, LSN_STORE_SIZE);
    DBUG_PRINT("info", ("Special case of full LSN, new src:%p",
                        src + 1 + LSN_STORE_SIZE));
    DBUG_RETURN(src + 1 + LSN_STORE_SIZE);
  }

  switch (len_code) {
  case 0:
    low_bytes= src[0];
    rec_offset= LSN_OFFSET(base_lsn) - ((head << 8) | low_bytes);
    break;
  case 1:
    low_bytes= uint2korr(src);
    rec_offset= LSN_OFFSET(base_lsn) - ((head << 16) | low_bytes);
    break;
  case 2:
    low_bytes= uint3korr(src);
    rec_offset= LSN_OFFSET(base_lsn) - ((head << 24) | low_bytes);
    break;
  case 3:
  {
    /* Here 'head' is a file number difference, the 4 bytes an offset one */
    ulonglong wide_offset= LSN_OFFSET(base_lsn);
    low_bytes= uint4korr(src);
    if (low_bytes > LSN_OFFSET(base_lsn))
    {
      /* take 1 from file offset */
      head++;
      wide_offset+= 0x100000000LL;
    }
    file_no= LSN_FILE_NO(base_lsn) - head;
    DBUG_ASSERT(wide_offset - low_bytes <= UINT_MAX);
    rec_offset= (uint32)(wide_offset - low_bytes);
    break;
  }
  default:
    DBUG_ASSERT(0);
    DBUG_RETURN(NULL);
  }
  decoded= MAKE_LSN(file_no, rec_offset);
  src+= len_code + 1;                /* bytes which followed the first one */
  lsn_store(dst, decoded);
  DBUG_PRINT("info", ("new src:%p", src));
  DBUG_RETURN(src);
}


/**
  @brief Encodes relative LSNs listed in the parameters.
@param parts Parts list with encoded LSN(s) @param base_lsn LSN which is base for encoding @param lsns number of LSN(s) to encode @param compressed_LSNs buffer which can be used for storing compressed LSN(s) */ static void translog_relative_LSN_encode(struct st_translog_parts *parts, LSN base_lsn, uint lsns, uchar *compressed_LSNs) { LEX_CUSTRING *part; uint lsns_len= lsns * LSN_STORE_SIZE; uchar buffer_src[MAX_NUMBER_OF_LSNS_PER_RECORD * LSN_STORE_SIZE]; uchar *buffer= buffer_src; const uchar *cbuffer; DBUG_ENTER("translog_relative_LSN_encode"); DBUG_ASSERT(parts->current != 0); part= parts->parts + parts->current; /* collect all LSN(s) in one chunk if it (they) is (are) divided */ if (part->length < lsns_len) { size_t copied= part->length; LEX_CUSTRING *next_part; DBUG_PRINT("info", ("Using buffer:%p", compressed_LSNs)); memcpy(buffer, part->str, part->length); next_part= parts->parts + parts->current + 1; do { DBUG_ASSERT(next_part < parts->parts + parts->elements); if ((next_part->length + copied) < lsns_len) { memcpy(buffer + copied, next_part->str, next_part->length); copied+= next_part->length; next_part->length= 0; next_part->str= 0; /* delete_dynamic_element(&parts->parts, parts->current + 1); */ next_part++; parts->current++; part= parts->parts + parts->current; } else { size_t len= lsns_len - copied; memcpy(buffer + copied, next_part->str, len); copied= lsns_len; next_part->str+= len; next_part->length-= len; } } while (copied < lsns_len); cbuffer= buffer; } else { cbuffer= part->str; part->str+= lsns_len; part->length-= lsns_len; parts->current--; part= parts->parts + parts->current; } { /* Compress */ LSN ref; int economy; const uchar *src_ptr; uchar *dst_ptr= compressed_LSNs + (MAX_NUMBER_OF_LSNS_PER_RECORD * COMPRESSED_LSN_MAX_STORE_SIZE); /* We write the result in backward direction with no special sense or tricks both directions are equal in complicity */ for (src_ptr= cbuffer + lsns_len - LSN_STORE_SIZE; src_ptr >= (const uchar*)cbuffer; src_ptr-= 
LSN_STORE_SIZE) { ref= lsn_korr(src_ptr); dst_ptr= translog_put_LSN_diff(base_lsn, ref, dst_ptr); } part->length= (size_t)((compressed_LSNs + (MAX_NUMBER_OF_LSNS_PER_RECORD * COMPRESSED_LSN_MAX_STORE_SIZE)) - dst_ptr); economy= lsns_len - (uint)part->length; parts->record_length-= economy; DBUG_PRINT("info", ("new length of LSNs: %lu economy: %d", (ulong)part->length, economy)); parts->total_record_length-= economy; part->str= dst_ptr; } DBUG_VOID_RETURN; } /** @brief Write multi-group variable-size record. @param lsn LSN of the record will be written here @param type the log record type @param short_trid Short transaction ID or 0 if it has no sense @param parts Descriptor of record source parts @param buffer_to_flush Buffer which have to be flushed if it is not 0 @param header_length Header length calculated for 1 group @param buffer_rest Beginning from which we plan to write in full pages @param trn Transaction structure pointer for hooks by record log type, for short_id @param hook_arg Argument which will be passed to pre-write and in-write hooks of this record. @note We must have a translog_lock() when entering this function We must have buffer_to_flush locked (if not null) buffer_to_flush should *NOT* be locked when calling this function. 
(This is note is here as this is different from most other translog_write...() functions which require the buffer to be locked) @return Operation status @retval 0 OK @retval 1 Error */ static my_bool translog_write_variable_record_mgroup(LSN *lsn, enum translog_record_type type, MARIA_HA *tbl_info, SHORT_TRANSACTION_ID short_trid, struct st_translog_parts *parts, struct st_translog_buffer *buffer_to_flush, uint16 header_length, translog_size_t buffer_rest, TRN *trn, void *hook_arg) { TRANSLOG_ADDRESS horizon; struct st_buffer_cursor cursor; int rc= 0; size_t i, curr_group= 0; uint chunk2_page, full_pages; translog_size_t record_rest, first_page, chunk3_pages, chunk0_pages= 1; translog_size_t done= 0; struct st_translog_group_descriptor group; DYNAMIC_ARRAY groups; uint16 chunk3_size; uint16 page_capacity= log_descriptor.page_capacity_chunk_2 + 1; uint16 last_page_capacity; my_bool new_page_before_chunk0= 1, first_chunk0= 1; uchar chunk0_header[1 + 2 + 5 + 2 + 2], group_desc[7 + 1]; uchar chunk2_header[1]; uint header_fixed_part= header_length + 2; uint groups_per_page= (page_capacity - header_fixed_part) / (7 + 1); uint file_of_the_first_group; int pages_to_skip; struct st_translog_buffer *buffer_of_last_lsn; my_bool external_buffer_to_flush= TRUE; DBUG_ENTER("translog_write_variable_record_mgroup"); translog_lock_assert_owner(); used_buffs_init(&cursor.buffs); chunk2_header[0]= TRANSLOG_CHUNK_NOHDR; if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &groups, sizeof(struct st_translog_group_descriptor), 10, 10, MYF(0))) { translog_unlock(); if (buffer_to_flush != NULL) { translog_buffer_flush(buffer_to_flush); translog_buffer_unlock(buffer_to_flush); } DBUG_PRINT("error", ("init array failed")); DBUG_RETURN(1); } first_page= translog_get_current_page_rest(); record_rest= parts->record_length - (first_page - 1); DBUG_PRINT("info", ("Record Rest: %lu", (ulong) record_rest)); if (record_rest < buffer_rest) { /* The record (group 1 type) is larger than the free space on the 
page - we need to split it in two. But when we split it in two, the first part is big enough to hold all the data of the record (because the header of the first part of the split is smaller than the header of the record as a whole when it takes only one chunk) */ DBUG_PRINT("info", ("too many free space because changing header")); buffer_rest-= log_descriptor.page_capacity_chunk_2; DBUG_ASSERT(record_rest >= buffer_rest); } file_of_the_first_group= LSN_FILE_NO(log_descriptor.horizon); translog_mark_file_unfinished(file_of_the_first_group); do { DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); group.addr= horizon= log_descriptor.horizon; cursor= log_descriptor.bc; cursor.chaser= 1; if ((full_pages= buffer_rest / log_descriptor.page_capacity_chunk_2) > 255) { /* sizeof(uint8) == 256 is max number of chunk in multi-chunks group */ full_pages= 255; buffer_rest= full_pages * log_descriptor.page_capacity_chunk_2; } /* group chunks = full pages + first page (which actually can be full, too). 
But here we assign number of chunks - 1 */ group.num= full_pages; if (insert_dynamic(&groups, (uchar*) &group)) { DBUG_PRINT("error", ("insert into array failed")); goto err_unlock; } DBUG_PRINT("info", ("chunk: #%u first_page: %u (%u) " "full_pages: %lu (%lu) " "Left %lu", groups.elements, first_page, first_page - 1, (ulong) full_pages, (ulong) (full_pages * log_descriptor.page_capacity_chunk_2), (ulong)(parts->record_length - (first_page - 1 + buffer_rest) - done))); rc= translog_advance_pointer((int)full_pages, 0, &cursor.buffs); translog_unlock(); if (buffer_to_flush != NULL) { if (!external_buffer_to_flush) translog_buffer_decrease_writers(buffer_to_flush); if (!rc) rc= translog_buffer_flush(buffer_to_flush); translog_buffer_unlock(buffer_to_flush); buffer_to_flush= NULL; } external_buffer_to_flush= FALSE; if (rc) { DBUG_PRINT("error", ("flush of unlock buffer failed")); //translog_advance_pointer decreased writers so it is OK DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); goto err; } translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header); translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts); DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT " " "Left %lu", LSN_IN_PARTS(log_descriptor.horizon), LSN_IN_PARTS(horizon), (ulong) (parts->record_length - (first_page - 1) - done))); for (i= 0; i < full_pages; i++) { if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) goto err; DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " " "local: " LSN_FMT " " "Left: %lu", LSN_IN_PARTS(log_descriptor.horizon), LSN_IN_PARTS(horizon), (ulong) (parts->record_length - (first_page - 1) - i * log_descriptor.page_capacity_chunk_2 - done))); } done+= (first_page - 1 + buffer_rest); if (translog_chaser_page_next(&horizon, &cursor)) { DBUG_PRINT("error", ("flush of unlock buffer failed")); goto err; } translog_buffer_lock(cursor.buffer); translog_buffer_decrease_writers(cursor.buffer); 
used_buffs_register_unlock(&cursor.buffs, cursor.buffer); translog_buffer_unlock(cursor.buffer); translog_lock(); /* Check that we have place for chunk type 2 */ first_page= translog_get_current_page_rest(); if (first_page <= 1) { if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, &buffer_to_flush)) goto err_unlock; first_page= translog_get_current_page_rest(); } buffer_rest= translog_get_current_group_size(); if (buffer_to_flush) used_buffs_register_unlock(&cursor.buffs, buffer_to_flush); // will be unlocked } while ((translog_size_t)(first_page + buffer_rest) < (translog_size_t)(parts->record_length - done)); group.addr= horizon= log_descriptor.horizon; cursor= log_descriptor.bc; cursor.chaser= 1; group.num= 0; /* 0 because it does not matter */ if (insert_dynamic(&groups, (uchar*) &group)) { DBUG_PRINT("error", ("insert into array failed")); goto err_unlock; } record_rest= parts->record_length - done; DBUG_PRINT("info", ("Record rest: %lu", (ulong) record_rest)); if (first_page > record_rest + 1) { /* We have not so much data to fill all first page (no speaking about full pages) so it will be: > or ...> or >...> */ chunk2_page= full_pages= 0; last_page_capacity= first_page; pages_to_skip= -1; } else { /* We will have: >...>> or >...>...> or >...> */ chunk2_page= 1; record_rest-= (first_page - 1); pages_to_skip= full_pages= record_rest / log_descriptor.page_capacity_chunk_2; record_rest= (record_rest % log_descriptor.page_capacity_chunk_2); last_page_capacity= page_capacity; } chunk3_size= 0; chunk3_pages= 0; if (last_page_capacity > record_rest + 1 && record_rest != 0) { if (last_page_capacity > record_rest + header_fixed_part + groups.elements * (7 + 1)) { /* 1 record of type 0 */ chunk3_pages= 0; } else { pages_to_skip++; chunk3_pages= 1; if (record_rest + 2 == last_page_capacity) { chunk3_size= record_rest - 1; record_rest= 1; } else { chunk3_size= record_rest; record_rest= 0; } } } /* A first non-full page will hold type 0 chunk only if it 
fit in it with all its headers */ while (page_capacity < record_rest + header_fixed_part + (groups.elements - groups_per_page * (chunk0_pages - 1)) * (7 + 1)) chunk0_pages++; DBUG_PRINT("info", ("chunk0_pages: %u groups %u groups per full page: %u " "Group on last page: %u", chunk0_pages, groups.elements, groups_per_page, (groups.elements - ((page_capacity - header_fixed_part) / (7 + 1)) * (chunk0_pages - 1)))); DBUG_PRINT("info", ("first_page: %u chunk2: %u full_pages: %u (%lu) " "chunk3: %u (%u) rest: %u", first_page, chunk2_page, full_pages, (ulong) full_pages * log_descriptor.page_capacity_chunk_2, chunk3_pages, (uint) chunk3_size, (uint) record_rest)); DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); rc= translog_advance_pointer(pages_to_skip + (int)(chunk0_pages - 1), (uint16)(record_rest + header_fixed_part + ((uint)groups.elements - ((page_capacity - header_fixed_part) / (7 + 1)) * (chunk0_pages - 1)) * (7 + 1)), &cursor.buffs); buffer_of_last_lsn= log_descriptor.bc.buffer; translog_unlock(); if (buffer_to_flush != NULL) { DBUG_ASSERT(!external_buffer_to_flush); translog_buffer_decrease_writers(buffer_to_flush); if (!rc) rc= translog_buffer_flush(buffer_to_flush); translog_buffer_unlock(buffer_to_flush); buffer_to_flush= NULL; } if (rc) { DBUG_PRINT("error", ("flush of unlock buffer failed")); goto err; } if (rc) goto err; if (chunk2_page) { DBUG_PRINT("info", ("chunk 2 to finish first page")); translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header); translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts); DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT " " "Left: %lu", LSN_IN_PARTS(log_descriptor.horizon), LSN_IN_PARTS(horizon), (ulong) (parts->record_length - (first_page - 1) - done))); } else if (chunk3_pages) { uchar chunk3_header[3]; DBUG_PRINT("info", ("chunk 3")); DBUG_ASSERT(full_pages == 0); chunk3_pages= 0; chunk3_header[0]= TRANSLOG_CHUNK_LNGTH; int2store(chunk3_header + 1, chunk3_size); 
translog_write_data_on_page(&horizon, &cursor, 3, chunk3_header); translog_write_parts_on_page(&horizon, &cursor, chunk3_size, parts); DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT " " "Left: %lu", LSN_IN_PARTS(log_descriptor.horizon), LSN_IN_PARTS(horizon), (ulong) (parts->record_length - chunk3_size - done))); } else { DBUG_PRINT("info", ("no new_page_before_chunk0")); new_page_before_chunk0= 0; } for (i= 0; i < full_pages; i++) { DBUG_ASSERT(chunk2_page != 0); if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) goto err; DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT " " "Left: %lu", LSN_IN_PARTS(log_descriptor.horizon), LSN_IN_PARTS(horizon), (ulong) (parts->record_length - (first_page - 1) - i * log_descriptor.page_capacity_chunk_2 - done))); } if (chunk3_pages && translog_write_variable_record_chunk3_page(parts, chunk3_size, &horizon, &cursor)) goto err; DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, LSN_IN_PARTS(log_descriptor.horizon), LSN_IN_PARTS(horizon))); *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN); int2store(chunk0_header + 1, short_trid); translog_write_variable_record_1group_code_len(chunk0_header + 3, parts->record_length, header_length); do { size_t limit; if (new_page_before_chunk0 && translog_chaser_page_next(&horizon, &cursor)) { DBUG_PRINT("error", ("flush of unlock buffer failed")); goto err; } new_page_before_chunk0= 1; if (first_chunk0) { first_chunk0= 0; /* We can drop "log_descriptor.is_everything_flushed" earlier when have lock on loghandler and assign initial value of "horizon" variable or before unlocking loghandler (because we will increase writers counter on the buffer and every thread which wanted flush the buffer will wait till we finish with it). But IMHO better here take short lock and do not bother other threads with waiting. 
*/ translog_lock(); set_lsn(lsn, horizon); buffer_of_last_lsn->last_lsn= *lsn; DBUG_PRINT("info", ("last_lsn set to " LSN_FMT " buffer: %p", LSN_IN_PARTS(buffer_of_last_lsn->last_lsn), buffer_of_last_lsn)); if (log_record_type_descriptor[type].inwrite_hook && (*log_record_type_descriptor[type].inwrite_hook) (type, trn, tbl_info, lsn, hook_arg)) goto err_unlock; translog_unlock(); } /* A first non-full page will hold type 0 chunk only if it fit in it with all its headers => the fist page is full or number of groups less then possible number of full page. */ limit= (groups_per_page < groups.elements - curr_group ? groups_per_page : groups.elements - curr_group); DBUG_PRINT("info", ("Groups: %zu curr: %zu limit: %zu", groups.elements, curr_group, limit)); if (chunk0_pages == 1) { DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) + %u = %u", (uint) limit, (uint) record_rest, (uint) (2 + limit * (7 + 1) + record_rest))); int2store(chunk0_header + header_length - 2, 2 + limit * (7 + 1) + record_rest); } else { DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) = %u", (uint) limit, (uint) (2 + limit * (7 + 1)))); int2store(chunk0_header + header_length - 2, 2 + limit * (7 + 1)); } int2store(chunk0_header + header_length, groups.elements - curr_group); translog_write_data_on_page(&horizon, &cursor, header_fixed_part, chunk0_header); for (i= curr_group; i < limit + curr_group; i++) { struct st_translog_group_descriptor *grp_ptr; grp_ptr= dynamic_element(&groups, i, struct st_translog_group_descriptor *); lsn_store(group_desc, grp_ptr->addr); group_desc[7]= grp_ptr->num; translog_write_data_on_page(&horizon, &cursor, (7 + 1), group_desc); } if (chunk0_pages == 1 && record_rest != 0) translog_write_parts_on_page(&horizon, &cursor, record_rest, parts); chunk0_pages--; curr_group+= limit; /* put special type to indicate that it is not LSN chunk */ *chunk0_header= (uchar) (TRANSLOG_CHUNK_LSN | TRANSLOG_CHUNK_0_CONT); } while (chunk0_pages != 0); translog_buffer_lock(cursor.buffer); 
translog_buffer_decrease_writers(cursor.buffer); used_buffs_register_unlock(&cursor.buffs, cursor.buffer); translog_buffer_unlock(cursor.buffer); rc= 0; DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); if (translog_set_lsn_for_files(file_of_the_first_group, LSN_FILE_NO(*lsn), *lsn, FALSE)) goto err; translog_mark_file_finished(file_of_the_first_group); delete_dynamic(&groups); DBUG_RETURN(0); err_unlock: translog_unlock(); err: if (cursor.buffs.unlck_ptr != cursor.buffs.wrt_ptr) used_buffs_urgent_unlock(&cursor.buffs); if (buffer_to_flush != NULL) { /* This is to prevent locking buffer forever in case of error */ if (!external_buffer_to_flush) translog_buffer_decrease_writers(buffer_to_flush); if (!rc) rc= translog_buffer_flush(buffer_to_flush); translog_buffer_unlock(buffer_to_flush); buffer_to_flush= NULL; } translog_mark_file_finished(file_of_the_first_group); delete_dynamic(&groups); DBUG_RETURN(1); } /** @brief Write the variable length log record. @param lsn LSN of the record will be written here @param type the log record type @param short_trid Short transaction ID or 0 if it has no sense @param parts Descriptor of record source parts @param trn Transaction structure pointer for hooks by record log type, for short_id @param hook_arg Argument which will be passed to pre-write and in-write hooks of this record. 
@return Operation status @retval 0 OK @retval 1 Error */ static my_bool translog_write_variable_record(LSN *lsn, enum translog_record_type type, MARIA_HA *tbl_info, SHORT_TRANSACTION_ID short_trid, struct st_translog_parts *parts, TRN *trn, void *hook_arg) { struct st_translog_buffer *buffer_to_flush= NULL; uint header_length1= 1 + 2 + 2 + translog_variable_record_length_bytes(parts->record_length); ulong buffer_rest; uint page_rest; /* Max number of such LSNs per record is 2 */ uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD * COMPRESSED_LSN_MAX_STORE_SIZE]; my_bool res; DBUG_ENTER("translog_write_variable_record"); translog_lock(); DBUG_PRINT("info", ("horizon: " LSN_FMT, LSN_IN_PARTS(log_descriptor.horizon))); page_rest= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill; DBUG_PRINT("info", ("header length: %u page_rest: %u", header_length1, page_rest)); /* header and part which we should read have to fit in one chunk TODO: allow to divide readable header */ if (page_rest < (header_length1 + log_record_type_descriptor[type].read_header_len)) { DBUG_PRINT("info", ("Next page, size: %u header: %u + %u", log_descriptor.bc.current_page_fill, header_length1, log_record_type_descriptor[type].read_header_len)); translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, &buffer_to_flush); /* Chunk 2 header is 1 byte, so full page capacity will be one uchar more */ page_rest= log_descriptor.page_capacity_chunk_2 + 1; DBUG_PRINT("info", ("page_rest: %u", page_rest)); } /* To minimize compressed size we will compress always relative to very first chunk address (log_descriptor.horizon for now) */ if (log_record_type_descriptor[type].compressed_LSN > 0) { translog_relative_LSN_encode(parts, log_descriptor.horizon, log_record_type_descriptor[type]. 
compressed_LSN, compressed_LSNs); /* recalculate header length after compression */ header_length1= 1 + 2 + 2 + translog_variable_record_length_bytes(parts->record_length); DBUG_PRINT("info", ("after compressing LSN(s) header length: %u " "record length: %lu", header_length1, (ulong)parts->record_length)); } /* TODO: check space on current page for header + few bytes */ if (page_rest >= parts->record_length + header_length1) { /* following function makes translog_unlock(); */ res= translog_write_variable_record_1chunk(lsn, type, tbl_info, short_trid, parts, buffer_to_flush, header_length1, trn, hook_arg); DBUG_RETURN(res); } buffer_rest= translog_get_current_group_size(); if (buffer_rest >= parts->record_length + header_length1 - page_rest) { /* following function makes translog_unlock(); */ res= translog_write_variable_record_1group(lsn, type, tbl_info, short_trid, parts, buffer_to_flush, header_length1, trn, hook_arg); DBUG_RETURN(res); } /* following function makes translog_unlock(); */ res= translog_write_variable_record_mgroup(lsn, type, tbl_info, short_trid, parts, buffer_to_flush, header_length1, buffer_rest, trn, hook_arg); DBUG_RETURN(res); } /** @brief Write the fixed and pseudo-fixed log record. @param lsn LSN of the record will be written here @param type the log record type @param short_trid Short transaction ID or 0 if it has no sense @param parts Descriptor of record source parts @param trn Transaction structure pointer for hooks by record log type, for short_id @param hook_arg Argument which will be passed to pre-write and in-write hooks of this record. 
@return Operation status @retval 0 OK @retval 1 Error */ static my_bool translog_write_fixed_record(LSN *lsn, enum translog_record_type type, MARIA_HA *tbl_info, SHORT_TRANSACTION_ID short_trid, struct st_translog_parts *parts, TRN *trn, void *hook_arg) { struct st_translog_buffer *buffer_to_flush= NULL; uchar chunk1_header[1 + 2]; /* Max number of such LSNs per record is 2 */ uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD * COMPRESSED_LSN_MAX_STORE_SIZE]; LEX_CUSTRING *part; int rc= 1; DBUG_ENTER("translog_write_fixed_record"); DBUG_ASSERT((log_record_type_descriptor[type].rclass == LOGRECTYPE_FIXEDLENGTH && parts->record_length == log_record_type_descriptor[type].fixed_length) || (log_record_type_descriptor[type].rclass == LOGRECTYPE_PSEUDOFIXEDLENGTH && parts->record_length == log_record_type_descriptor[type].fixed_length)); translog_lock(); DBUG_PRINT("info", ("horizon: " LSN_FMT, LSN_IN_PARTS(log_descriptor.horizon))); DBUG_ASSERT(log_descriptor.bc.current_page_fill <= TRANSLOG_PAGE_SIZE); DBUG_PRINT("info", ("Page size: %u record: %u next cond: %d", log_descriptor.bc.current_page_fill, (parts->record_length + log_record_type_descriptor[type].compressed_LSN * 2 + 3), ((((uint) log_descriptor.bc.current_page_fill) + (parts->record_length + log_record_type_descriptor[type].compressed_LSN * 2 + 3)) > TRANSLOG_PAGE_SIZE))); /* check that there is enough place on current page. 
NOTE: compressing may increase page LSN size on two bytes for every LSN */ if ((((uint) log_descriptor.bc.current_page_fill) + (parts->record_length + log_record_type_descriptor[type].compressed_LSN * 2 + 3)) > TRANSLOG_PAGE_SIZE) { DBUG_PRINT("info", ("Next page")); if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, &buffer_to_flush)) goto err; /* rc == 1 */ if (buffer_to_flush) translog_buffer_lock_assert_owner(buffer_to_flush); } set_lsn(lsn, log_descriptor.horizon); if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn), *lsn, TRUE) || (log_record_type_descriptor[type].inwrite_hook && (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info, lsn, hook_arg))) goto err; /* compress LSNs */ if (log_record_type_descriptor[type].rclass == LOGRECTYPE_PSEUDOFIXEDLENGTH) { DBUG_ASSERT(log_record_type_descriptor[type].compressed_LSN > 0); translog_relative_LSN_encode(parts, *lsn, log_record_type_descriptor[type]. compressed_LSN, compressed_LSNs); } /* Write the whole record at once (we know that there is enough place on the destination page) */ DBUG_ASSERT(parts->current != 0); /* first part is left for header */ part= parts->parts + (--parts->current); parts->total_record_length+= (translog_size_t) (part->length= 1 + 2); part->str= chunk1_header; *chunk1_header= (uchar) (type | TRANSLOG_CHUNK_FIXED); int2store(chunk1_header + 1, short_trid); rc= translog_write_parts_on_page(&log_descriptor.horizon, &log_descriptor.bc, parts->total_record_length, parts); log_descriptor.bc.buffer->last_lsn= *lsn; DBUG_PRINT("info", ("last_lsn set to " LSN_FMT " buffer: %p", LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn), log_descriptor.bc.buffer)); err: translog_unlock(); /* check if we switched buffer and need process it (current buffer is unlocked already => we will not delay other threads */ if (buffer_to_flush != NULL) { if (!rc) rc= translog_buffer_flush(buffer_to_flush); translog_buffer_unlock(buffer_to_flush); } DBUG_RETURN(rc); } /** 
@brief Writes the log record

   If share has no 2-byte-id yet, gives an id to the share and logs
   LOGREC_FILE_ID. If transaction has not logged LOGREC_LONG_TRANSACTION_ID
   yet, logs it.

   @param  lsn             LSN of the record will be written here
   @param  type            the log record type
   @param  trn             Transaction structure pointer for hooks by
                           record log type, for short_id
   @param  tbl_info        MARIA_HA of table or NULL
   @param  rec_len         record length or 0 (count it)
   @param  part_no         number of parts or 0 (count it)
   @param  parts_data      zero ended (in case of number of parts is 0)
                           array of LEX_STRINGs (parts), first
                           TRANSLOG_INTERNAL_PARTS positions in the log
                           should be unused (need for loghandler)
   @param  store_share_id  if tbl_info!=NULL then share's id will
                           automatically be stored in the two first bytes
                           pointed (so pointer is assumed to be !=NULL)
   @param  hook_arg        argument which will be passed to pre-write and
                           in-write hooks of this record.

   @return Operation status
     @retval 0      OK
     @retval 1      Error
*/

my_bool translog_write_record(LSN *lsn,
                              enum translog_record_type type,
                              TRN *trn, MARIA_HA *tbl_info,
                              translog_size_t rec_len,
                              uint part_no,
                              LEX_CUSTRING *parts_data,
                              uchar *store_share_id,
                              void *hook_arg)
{
  struct st_translog_parts parts;
  LEX_CUSTRING *part;
  int rc;
  uint short_trid= trn->short_id;
  DBUG_ENTER("translog_write_record");

  DBUG_PRINT("enter", ("type: %u (%s) ShortTrID: %u rec_len: %lu",
                       (uint) type, log_record_type_descriptor[type].name,
                       (uint) short_trid, (ulong) rec_len));
  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
              translog_status == TRANSLOG_READONLY);
  DBUG_ASSERT(type != 0);
  DBUG_SLOW_ASSERT((uint)type <= max_allowed_translog_type);
  /* Any status other than TRANSLOG_OK (e.g. read-only) refuses the write */
  if (unlikely(translog_status != TRANSLOG_OK))
  {
    DBUG_PRINT("error", ("Transaction log is write protected"));
    DBUG_RETURN(1);
  }

  if (tbl_info && type != LOGREC_FILE_ID)
  {
    MARIA_SHARE *share= tbl_info->s;
    DBUG_ASSERT(share->now_transactional);
    if (unlikely(share->id == 0))
    {
      /*
        First log write for this MARIA_SHARE; give it a short id.
        When the lock manager is enabled and needs a short id, it should be
        assigned in the lock manager (because row locks will be taken before
        log records are written; for example SELECT FOR UPDATE takes locks but
        writes no log record.
      */
      if (unlikely(translog_assign_id_to_share(tbl_info, trn)))
        DBUG_RETURN(1);
    }
    /* store_share_id is written unconditionally here: it must not be NULL */
    fileid_store(store_share_id, share->id);
  }
  if (unlikely(!(trn->first_undo_lsn & TRANSACTION_LOGGED_LONG_ID)))
  {
    LSN dummy_lsn;
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
    uchar log_data[6];
    DBUG_ASSERT(trn->undo_lsn == LSN_IMPOSSIBLE);
    int6store(log_data, trn->trid);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    /*
      The flag is set before the nested call so that the nested
      translog_write_record() does not enter this branch again.
    */
    trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; /* no recursion */
    if (unlikely(translog_write_record(&dummy_lsn, LOGREC_LONG_TRANSACTION_ID,
                                       trn, NULL, sizeof(log_data),
                                       sizeof(log_array)/sizeof(log_array[0]),
                                       log_array, NULL, NULL)))
      DBUG_RETURN(1);
  }

  parts.parts= parts_data;

  /* count parts if they are not counted by upper level */
  if (part_no == 0)
  {
    /* the array is terminated by an element with zero length */
    for (part_no= TRANSLOG_INTERNAL_PARTS;
         parts_data[part_no].length != 0;
         part_no++);
  }
  parts.elements= part_no;
  parts.current= TRANSLOG_INTERNAL_PARTS;

  /* clear TRANSLOG_INTERNAL_PARTS */
  compile_time_assert(TRANSLOG_INTERNAL_PARTS != 0);
  parts_data[0].str= 0;
  parts_data[0].length= 0;

  /* count length of the record */
  if (rec_len == 0)
  {
    for(part= parts_data + TRANSLOG_INTERNAL_PARTS;\
        part < parts_data + part_no;
        part++)
    {
      rec_len+= (translog_size_t) part->length;
    }
  }
  parts.record_length= rec_len;

#ifndef DBUG_OFF
  /* Debug builds: verify that rec_len equals the sum of the part lengths */
  {
    uint i;
    size_t len= 0;
#ifdef HAVE_valgrind
    ha_checksum checksum= 0;
#endif
    for (i= TRANSLOG_INTERNAL_PARTS; i < part_no; i++)
    {
#ifdef HAVE_valgrind
      /* Find uninitialized bytes early */
      checksum+= my_checksum(checksum, parts_data[i].str,
                             parts_data[i].length);
#endif
      len+= parts_data[i].length;
    }
    DBUG_ASSERT(len == rec_len);
  }
#endif
  /*
    Start total_record_length from record_length then overhead will
    be add
  */
  parts.total_record_length= parts.record_length;
  DBUG_PRINT("info", ("record length: %lu", (ulong) parts.record_length));

  /*
    process this parts: the record is written only if there is no pre-write
    hook or the hook returns 0; a non-zero hook result becomes the return
    value.
  */
  if (!(rc= (log_record_type_descriptor[type].prewrite_hook &&
             (*log_record_type_descriptor[type].prewrite_hook)(type, trn,
                                                               tbl_info,
                                                               hook_arg))))
  {
    switch (log_record_type_descriptor[type].rclass) {
    case LOGRECTYPE_VARIABLE_LENGTH:
      rc= translog_write_variable_record(lsn, type, tbl_info,
                                         short_trid, &parts, trn, hook_arg);
      break;
    case LOGRECTYPE_PSEUDOFIXEDLENGTH:
    case LOGRECTYPE_FIXEDLENGTH:
      rc= translog_write_fixed_record(lsn, type, tbl_info,
                                      short_trid, &parts, trn, hook_arg);
      break;
    case LOGRECTYPE_NOT_ALLOWED:
    default:
      DBUG_ASSERT(0);
      rc= 1;
    }
  }

  DBUG_PRINT("info", ("LSN: " LSN_FMT, LSN_IN_PARTS(*lsn)));
  DBUG_RETURN(rc);
}


/*
  Decode compressed (relative) LSN(s)

  SYNOPSIS
   translog_relative_lsn_decode()
   base_lsn             LSN for encoding
   src                  Decode LSN(s) from here
   dst                  Put decoded LSNs here (LSN_STORE_SIZE bytes are
                        reserved for each decoded LSN)
   lsns                 number of LSN(s)

  RETURN
     position in sources after decoded LSN(s)
*/

static uchar *translog_relative_LSN_decode(LSN base_lsn,
                                           uchar *src, uchar *dst, uint lsns)
{
  uint i;
  /* src advances by whatever each encoded LSN occupied; dst by a fixed step */
  for (i= 0; i < lsns; i++, dst+= LSN_STORE_SIZE)
  {
    src= translog_get_LSN_from_diff(base_lsn, src, dst);
  }
  return src;
}

/**
   @brief Get header of fixed/pseudo length record

   Decodes the compressed LSNs (pseudo-fixed records only) and copies the
   rest of the fixed-length record into TRANSLOG_HEADER_BUFFER::header.

   @param page            Pointer to the buffer with page where LSN chunk is
                          placed
   @param page_offset     Offset of the first chunk in the page
   @param buff            Buffer to be filled with header data

   @return Length of header or operation status
     @retval #  number of bytes in TRANSLOG_HEADER_BUFFER::header where
                stored decoded part of the header (the record's fixed length)
*/

static int translog_fixed_length_header(uchar *page,
                                        translog_size_t page_offset,
                                        TRANSLOG_HEADER_BUFFER *buff)
{
  struct st_log_record_type_descriptor *desc=
    log_record_type_descriptor + buff->type;
  /* skip the 3 bytes that precede the record data in the chunk */
  uchar *src= page + page_offset + 3;
  uchar *dst= buff->header;
  uchar *start= src;
  int lsns= desc->compressed_LSN;
  uint length= desc->fixed_length;
  DBUG_ENTER("translog_fixed_length_header");

  buff->record_length= length;

  if (desc->rclass == LOGRECTYPE_PSEUDOFIXEDLENGTH)
  {
    DBUG_ASSERT(lsns > 0);
    src= translog_relative_LSN_decode(buff->lsn, src, dst, lsns);
    /* from here on lsns is the byte size of the decoded LSNs */
    lsns*= LSN_STORE_SIZE;
    dst+= lsns;
    length-= lsns;
    /* bytes saved by storing the LSNs compressed */
    buff->compressed_LSN_economy= (lsns - (int) (src - start));
  }
  else
    buff->compressed_LSN_economy= 0;

  memcpy(dst, src, length);
  buff->non_header_data_start_offset= (uint16) (page_offset +
                                                ((src + length) -
                                                 (page + page_offset)));
  buff->non_header_data_len= 0;
  DBUG_RETURN(buff->record_length);
}


/*
  Free resources used by TRANSLOG_HEADER_BUFFER

  SYNOPSIS
    translog_free_record_header();

  NOTE
    The groups array is freed only when groups_no is non-zero; groups_no is
    reset so that a second call does nothing.
*/

void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff)
{
  DBUG_ENTER("translog_free_record_header");
  if (buff->groups_no != 0)
  {
    my_free(buff->groups);
    buff->groups_no= 0;
  }
  DBUG_VOID_RETURN;
}


/**
   @brief Returns the current horizon at the end of the current log

   Takes and releases the log lock around the read.

   @return Horizon (value of log_descriptor.horizon)
*/

TRANSLOG_ADDRESS translog_get_horizon()
{
  TRANSLOG_ADDRESS res;
  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
              translog_status == TRANSLOG_READONLY);
  translog_lock();
  res= log_descriptor.horizon;
  translog_unlock();
  return res;
}


/**
   @brief Returns the current horizon at the end of the current log, caller is
   assumed to already hold the lock

   @return Horizon (value of log_descriptor.horizon)
*/

TRANSLOG_ADDRESS translog_get_horizon_no_lock()
{
  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
              translog_status == TRANSLOG_READONLY);
  translog_lock_assert_owner();
  return log_descriptor.horizon;
}


/*
  Set last page in the scanner data structure

  SYNOPSIS
    translog_scanner_set_last_page()
    scanner              Information about current chunk during scanning

  RETURN
    0  OK
    1  Error
*/

static my_bool translog_scanner_set_last_page(TRANSLOG_SCANNER_DATA *scanner)
{
  my_bool page_ok;
  if (LSN_FILE_NO(scanner->page_addr) == LSN_FILE_NO(scanner->horizon))
  {
    /* It is last file => we can easy find last page address by horizon */
    uint pagegrest= LSN_OFFSET(scanner->horizon) % TRANSLOG_PAGE_SIZE;
    /*
      A horizon exactly on a page boundary means the last page is the
      previous (complete) one, hence a full page is subtracted in that case.
    */
    scanner->last_file_page= (scanner->horizon -
                              (pagegrest ? pagegrest : TRANSLOG_PAGE_SIZE));
    return (0);
  }
  /* Not the file of the horizon: ask for the last page address of the file */
  scanner->last_file_page= scanner->page_addr;
  return (translog_get_last_page_addr(&scanner->last_file_page, &page_ok, 0));
}


/**
  @brief Get page from page cache according to requested method

  Stores the page pointer in scanner->page; a direct link is requested only
  when scanner->use_direct_link is set.

  @param scanner         The scanner data

  @return operation status
  @retval 0 OK
  @retval 1 Error (translog_get_page() returned NULL)
*/

static my_bool
translog_scanner_get_page(TRANSLOG_SCANNER_DATA *scanner)
{
  TRANSLOG_VALIDATOR_DATA data;
  DBUG_ENTER("translog_scanner_get_page");
  data.addr= &scanner->page_addr;
  data.was_recovered= 0;
  DBUG_RETURN((scanner->page=
               translog_get_page(&data, scanner->buffer,
                                 (scanner->use_direct_link ?
                                  &scanner->direct_link :
                                  NULL))) ==
               NULL);
}


/**
  @brief Initialize reader scanner.

  @param lsn             LSN with which it have to be inited
  @param fixed_horizon   true if it is OK do not read records which was written
                         after scanning beginning
  @param scanner         scanner which have to be inited
  @param use_direct      prefer using direct links from page handler
                         where it is possible.
@note If direct link was used translog_destroy_scanner should be called after it using @return status of the operation @retval 0 OK @retval 1 Error */ my_bool translog_scanner_init(LSN lsn, my_bool fixed_horizon, TRANSLOG_SCANNER_DATA *scanner, my_bool use_direct) { DBUG_ENTER("translog_scanner_init"); DBUG_PRINT("enter", ("Scanner: %p LSN: " LSN_FMT, scanner, LSN_IN_PARTS(lsn))); DBUG_ASSERT(translog_status == TRANSLOG_OK || translog_status == TRANSLOG_READONLY); scanner->page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE; scanner->fixed_horizon= fixed_horizon; scanner->use_direct_link= use_direct; scanner->direct_link= NULL; scanner->horizon= translog_get_horizon(); DBUG_PRINT("info", ("horizon: " LSN_FMT, LSN_IN_PARTS(scanner->horizon))); /* lsn < horizon */ DBUG_ASSERT(lsn <= scanner->horizon); scanner->page_addr= lsn; scanner->page_addr-= scanner->page_offset; /*decrease offset */ if (translog_scanner_set_last_page(scanner)) DBUG_RETURN(1); if (translog_scanner_get_page(scanner)) DBUG_RETURN(1); DBUG_RETURN(0); } /** @brief Destroy scanner object; @param scanner The scanner object to destroy */ void translog_destroy_scanner(TRANSLOG_SCANNER_DATA *scanner) { DBUG_ENTER("translog_destroy_scanner"); DBUG_PRINT("enter", ("Scanner: %p", scanner)); translog_free_link(scanner->direct_link); DBUG_VOID_RETURN; } /* Checks End of the Log SYNOPSIS translog_scanner_eol() scanner Information about current chunk during scanning RETURN 1 End of the Log 0 OK */ static my_bool translog_scanner_eol(TRANSLOG_SCANNER_DATA *scanner) { DBUG_ENTER("translog_scanner_eol"); DBUG_PRINT("enter", ("Horizon: " LSN_FMT " Current: (%u, 0x%x+0x%x=0x%x)", LSN_IN_PARTS(scanner->horizon), LSN_IN_PARTS(scanner->page_addr), (uint) scanner->page_offset, (uint) (LSN_OFFSET(scanner->page_addr) + scanner->page_offset))); if (scanner->horizon > (scanner->page_addr + scanner->page_offset)) { DBUG_PRINT("info", ("Horizon is not reached")); DBUG_RETURN(0); } if (scanner->fixed_horizon) { 
DBUG_PRINT("info", ("Horizon is fixed and reached")); DBUG_RETURN(1); } scanner->horizon= translog_get_horizon(); DBUG_PRINT("info", ("Horizon is re-read, EOL: %d", scanner->horizon <= (scanner->page_addr + scanner->page_offset))); DBUG_RETURN(scanner->horizon <= (scanner->page_addr + scanner->page_offset)); } /** @brief Cheks End of the Page @param scanner Information about current chunk during scanning @retval 1 End of the Page @retval 0 OK */ static my_bool translog_scanner_eop(TRANSLOG_SCANNER_DATA *scanner) { DBUG_ENTER("translog_scanner_eop"); DBUG_RETURN(scanner->page_offset >= TRANSLOG_PAGE_SIZE || scanner->page[scanner->page_offset] == TRANSLOG_FILLER); } /** @brief Checks End of the File (i.e. we are scanning last page, which do not mean end of this page) @param scanner Information about current chunk during scanning @retval 1 End of the File @retval 0 OK */ static my_bool translog_scanner_eof(TRANSLOG_SCANNER_DATA *scanner) { DBUG_ENTER("translog_scanner_eof"); DBUG_ASSERT(LSN_FILE_NO(scanner->page_addr) == LSN_FILE_NO(scanner->last_file_page)); DBUG_PRINT("enter", ("curr Page: 0x%lx last page: 0x%lx " "normal EOF: %d", (ulong) LSN_OFFSET(scanner->page_addr), (ulong) LSN_OFFSET(scanner->last_file_page), LSN_OFFSET(scanner->page_addr) == LSN_OFFSET(scanner->last_file_page))); /* TODO: detect damaged file EOF, TODO: issue warning if damaged file EOF detected */ DBUG_RETURN(scanner->page_addr == scanner->last_file_page); } /* Move scanner to the next chunk SYNOPSIS translog_get_next_chunk() scanner Information about current chunk during scanning RETURN 0 OK 1 Error */ static my_bool translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner) { uint16 len; DBUG_ENTER("translog_get_next_chunk"); if (translog_scanner_eop(scanner)) len= TRANSLOG_PAGE_SIZE - scanner->page_offset; else if ((len= translog_get_total_chunk_length(scanner->page, scanner->page_offset)) == 0) DBUG_RETURN(1); scanner->page_offset+= len; if (translog_scanner_eol(scanner)) { scanner->page= 
END_OF_LOG; scanner->page_offset= 0; DBUG_RETURN(0); } if (translog_scanner_eop(scanner)) { /* before reading next page we should unpin current one if it was pinned */ translog_free_link(scanner->direct_link); if (translog_scanner_eof(scanner)) { DBUG_PRINT("info", ("horizon: " LSN_FMT " pageaddr: " LSN_FMT, LSN_IN_PARTS(scanner->horizon), LSN_IN_PARTS(scanner->page_addr))); /* if it is log end it have to be caught before */ DBUG_ASSERT(LSN_FILE_NO(scanner->horizon) > LSN_FILE_NO(scanner->page_addr)); scanner->page_addr+= LSN_ONE_FILE; scanner->page_addr= LSN_REPLACE_OFFSET(scanner->page_addr, TRANSLOG_PAGE_SIZE); if (translog_scanner_set_last_page(scanner)) DBUG_RETURN(1); } else { scanner->page_addr+= TRANSLOG_PAGE_SIZE; /* offset increased */ } if (translog_scanner_get_page(scanner)) DBUG_RETURN(1); scanner->page_offset= translog_get_first_chunk_offset(scanner->page); if (translog_scanner_eol(scanner)) { scanner->page= END_OF_LOG; scanner->page_offset= 0; DBUG_RETURN(0); } DBUG_ASSERT(scanner->page[scanner->page_offset] != TRANSLOG_FILLER); } DBUG_RETURN(0); } /** @brief Get header of variable length record and call hook for it processing @param page Pointer to the buffer with page where LSN chunk is placed @param page_offset Offset of the first chunk in the page @param buff Buffer to be filled with header data @param scanner If present should be moved to the header page if it differ from LSN page @return Length of header or operation status @retval RECHEADER_READ_ERROR error @retval RECHEADER_READ_EOF End of the log reached during the read @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where stored decoded part of the header */ static int translog_variable_length_header(uchar *page, translog_size_t page_offset, TRANSLOG_HEADER_BUFFER *buff, TRANSLOG_SCANNER_DATA *scanner) { struct st_log_record_type_descriptor *desc= (log_record_type_descriptor + buff->type); uchar *src= page + page_offset + 1 + 2; uchar *dst= buff->header; LSN base_lsn; uint lsns= 
desc->compressed_LSN; uint16 chunk_len; uint16 length= desc->read_header_len; uint16 buffer_length= length; uint16 body_len; int rc; TRANSLOG_SCANNER_DATA internal_scanner; DBUG_ENTER("translog_variable_length_header"); buff->record_length= translog_variable_record_1group_decode_len(&src); chunk_len= uint2korr(src); DBUG_PRINT("info", ("rec len: %lu chunk len: %u length: %u bufflen: %u", (ulong) buff->record_length, (uint) chunk_len, (uint) length, (uint) buffer_length)); if (chunk_len == 0) { uint16 page_rest; DBUG_PRINT("info", ("1 group")); src+= 2; page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); base_lsn= buff->lsn; body_len= MY_MIN(page_rest, buff->record_length); } else { uint grp_no, curr; uint header_to_skip; uint16 page_rest; DBUG_PRINT("info", ("multi-group")); grp_no= buff->groups_no= uint2korr(src + 2); if (!(buff->groups= (TRANSLOG_GROUP*) my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_GROUP) * grp_no, MYF(0)))) DBUG_RETURN(RECHEADER_READ_ERROR); DBUG_PRINT("info", ("Groups: %u", (uint) grp_no)); src+= (2 + 2); page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); curr= 0; header_to_skip= (uint) (src - (page + page_offset)); buff->chunk0_pages= 0; for (;;) { uint i, read_length= grp_no; buff->chunk0_pages++; if (page_rest < grp_no * (7 + 1)) read_length= page_rest / (7 + 1); DBUG_PRINT("info", ("Read chunk0 page#%u read: %u left: %u " "start from: %u", buff->chunk0_pages, read_length, grp_no, curr)); for (i= 0; i < read_length; i++, curr++) { DBUG_ASSERT(curr < buff->groups_no); buff->groups[curr].addr= lsn_korr(src + i * (7 + 1)); buff->groups[curr].num= src[i * (7 + 1) + 7]; DBUG_PRINT("info", ("group #%u " LSN_FMT " chunks: %u", curr, LSN_IN_PARTS(buff->groups[curr].addr), (uint) buff->groups[curr].num)); } grp_no-= read_length; if (grp_no == 0) { if (scanner) { buff->chunk0_data_addr= scanner->page_addr; /* offset increased */ buff->chunk0_data_addr+= (page_offset + header_to_skip + read_length * (7 + 1)); } else { buff->chunk0_data_addr= 
buff->lsn; /* offset increased */ buff->chunk0_data_addr+= (header_to_skip + read_length * (7 + 1)); } buff->chunk0_data_len= chunk_len - 2 - read_length * (7 + 1); DBUG_PRINT("info", ("Data address: " LSN_FMT " len: %u", LSN_IN_PARTS(buff->chunk0_data_addr), buff->chunk0_data_len)); break; } if (scanner == NULL) { DBUG_PRINT("info", ("use internal scanner for header reading")); scanner= &internal_scanner; if (translog_scanner_init(buff->lsn, 1, scanner, 0)) { rc= RECHEADER_READ_ERROR; goto exit_and_free; } } if (translog_get_next_chunk(scanner)) { if (scanner == &internal_scanner) translog_destroy_scanner(scanner); rc= RECHEADER_READ_ERROR; goto exit_and_free; } if (scanner->page == END_OF_LOG) { if (scanner == &internal_scanner) translog_destroy_scanner(scanner); rc= RECHEADER_READ_EOF; goto exit_and_free; } page= scanner->page; page_offset= scanner->page_offset; src= page + page_offset + header_to_skip; chunk_len= uint2korr(src - 2 - 2); DBUG_PRINT("info", ("Chunk len: %u", (uint) chunk_len)); page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); } if (scanner == NULL) { DBUG_PRINT("info", ("use internal scanner")); scanner= &internal_scanner; } else { translog_destroy_scanner(scanner); } base_lsn= buff->groups[0].addr; translog_scanner_init(base_lsn, 1, scanner, scanner == &internal_scanner); /* first group chunk is always chunk type 2 */ page= scanner->page; page_offset= scanner->page_offset; src= page + page_offset + 1; page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); body_len= page_rest; if (scanner == &internal_scanner) translog_destroy_scanner(scanner); } if (lsns) { uchar *start= src; src= translog_relative_LSN_decode(base_lsn, src, dst, lsns); lsns*= LSN_STORE_SIZE; dst+= lsns; length-= lsns; buff->record_length+= (buff->compressed_LSN_economy= (int) (lsns - (src - start))); DBUG_PRINT("info", ("lsns: %u length: %u economy: %d new length: %lu", lsns / LSN_STORE_SIZE, (uint) length, (int) buff->compressed_LSN_economy, (ulong) 
buff->record_length)); body_len-= (uint16) (src - start); } else buff->compressed_LSN_economy= 0; DBUG_ASSERT(body_len >= length); body_len-= length; memcpy(dst, src, length); buff->non_header_data_start_offset= (uint16) (src + length - page); buff->non_header_data_len= body_len; DBUG_PRINT("info", ("non_header_data_start_offset: %u len: %u buffer: %u", buff->non_header_data_start_offset, buff->non_header_data_len, buffer_length)); DBUG_RETURN(buffer_length); exit_and_free: my_free(buff->groups); buff->groups_no= 0; /* prevent try to use of buff->groups */ DBUG_RETURN(rc); } /** @brief Read record header from the given buffer @param page page content buffer @param page_offset offset of the chunk in the page @param buff destination buffer @param scanner If this is set the scanner will be moved to the record header page (differ from LSN page in case of multi-group records) @return Length of header or operation status @retval RECHEADER_READ_ERROR error @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where stored decoded part of the header */ int translog_read_record_header_from_buffer(uchar *page, uint16 page_offset, TRANSLOG_HEADER_BUFFER *buff, TRANSLOG_SCANNER_DATA *scanner) { translog_size_t res; DBUG_ENTER("translog_read_record_header_from_buffer"); DBUG_PRINT("info", ("page byte: 0x%x offset: %u", (uint) page[page_offset], (uint) page_offset)); DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset])); DBUG_ASSERT(translog_status == TRANSLOG_OK || translog_status == TRANSLOG_READONLY); buff->type= (page[page_offset] & TRANSLOG_REC_TYPE); buff->short_trid= uint2korr(page + page_offset + 1); DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN " LSN_FMT, (uint) buff->type, (uint)buff->short_trid, LSN_IN_PARTS(buff->lsn))); /* Read required bytes from the header and call hook */ switch (log_record_type_descriptor[buff->type].rclass) { case LOGRECTYPE_VARIABLE_LENGTH: res= translog_variable_length_header(page, page_offset, buff, scanner); break; case 
LOGRECTYPE_PSEUDOFIXEDLENGTH: case LOGRECTYPE_FIXEDLENGTH: res= translog_fixed_length_header(page, page_offset, buff); break; default: DBUG_ASSERT(0); /* we read some junk (got no LSN) */ res= RECHEADER_READ_ERROR; } DBUG_RETURN(res); } /** @brief Read record header and some fixed part of a record (the part depend on record type). @param lsn log record serial number (address of the record) @param buff log record header buffer @note Some type of record can be read completely by this call @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative LSN can be translated to absolute one), some fields can be added (like actual header length in the record if the header has variable length) @return Length of header or operation status @retval RECHEADER_READ_ERROR error @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where stored decoded part of the header */ int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff) { TRANSLOG_PAGE_SIZE_BUFF psize_buff; uchar *page; translog_size_t res, page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE; PAGECACHE_BLOCK_LINK *direct_link; TRANSLOG_ADDRESS addr; TRANSLOG_VALIDATOR_DATA data; DBUG_ENTER("translog_read_record_header"); DBUG_PRINT("enter", ("LSN: " LSN_FMT, LSN_IN_PARTS(lsn))); DBUG_ASSERT(LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE != 0); DBUG_ASSERT(translog_status == TRANSLOG_OK || translog_status == TRANSLOG_READONLY); buff->lsn= lsn; buff->groups_no= 0; data.addr= &addr; data.was_recovered= 0; addr= lsn; addr-= page_offset; /* offset decreasing */ res= (!(page= translog_get_page(&data, psize_buff.buffer, &direct_link))) ? RECHEADER_READ_ERROR : translog_read_record_header_from_buffer(page, page_offset, buff, 0); translog_free_link(direct_link); DBUG_RETURN(res); } /** @brief Read record header and some fixed part of a record (the part depend on record type). 
@param scan scanner position to read @param buff log record header buffer @param move_scanner request to move scanner to the header position @note Some type of record can be read completely by this call @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative LSN can be translated to absolute one), some fields can be added (like actual header length in the record if the header has variable length) @return Length of header or operation status @retval RECHEADER_READ_ERROR error @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where stored decoded part of the header */ int translog_read_record_header_scan(TRANSLOG_SCANNER_DATA *scanner, TRANSLOG_HEADER_BUFFER *buff, my_bool move_scanner) { translog_size_t res; DBUG_ENTER("translog_read_record_header_scan"); DBUG_PRINT("enter", ("Scanner: Cur: " LSN_FMT " Hrz: " LSN_FMT " " "Lst: " LSN_FMT " Offset: %u(%x) fixed %d", LSN_IN_PARTS(scanner->page_addr), LSN_IN_PARTS(scanner->horizon), LSN_IN_PARTS(scanner->last_file_page), (uint) scanner->page_offset, (uint) scanner->page_offset, scanner->fixed_horizon)); DBUG_ASSERT(translog_status == TRANSLOG_OK || translog_status == TRANSLOG_READONLY); buff->groups_no= 0; buff->lsn= scanner->page_addr; buff->lsn+= scanner->page_offset; /* offset increasing */ res= translog_read_record_header_from_buffer(scanner->page, scanner->page_offset, buff, (move_scanner ? scanner : 0)); DBUG_RETURN(res); } /** @brief Read record header and some fixed part of the next record (the part depend on record type). @param scanner data for scanning if lsn is NULL scanner data will be used for continue scanning. The scanner can be NULL. 
@param buff log record header buffer @return Length of header or operation status @retval RECHEADER_READ_ERROR error @retval RECHEADER_READ_EOF EOF @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where stored decoded part of the header */ int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner, TRANSLOG_HEADER_BUFFER *buff) { translog_size_t res; DBUG_ENTER("translog_read_next_record_header"); buff->groups_no= 0; /* to be sure that we will free it right */ DBUG_PRINT("enter", ("scanner: %p", scanner)); DBUG_PRINT("info", ("Scanner: Cur: " LSN_FMT " Hrz: " LSN_FMT " " "Lst: " LSN_FMT " Offset: %u(%x) fixed: %d", LSN_IN_PARTS(scanner->page_addr), LSN_IN_PARTS(scanner->horizon), LSN_IN_PARTS(scanner->last_file_page), (uint) scanner->page_offset, (uint) scanner->page_offset, scanner->fixed_horizon)); DBUG_ASSERT(translog_status == TRANSLOG_OK || translog_status == TRANSLOG_READONLY); do { if (translog_get_next_chunk(scanner)) DBUG_RETURN(RECHEADER_READ_ERROR); if (scanner->page == END_OF_LOG) { DBUG_PRINT("info", ("End of file from the scanner")); /* Last record was read */ buff->lsn= LSN_IMPOSSIBLE; DBUG_RETURN(RECHEADER_READ_EOF); } DBUG_PRINT("info", ("Page: " LSN_FMT " offset: %lu byte: %x", LSN_IN_PARTS(scanner->page_addr), (ulong) scanner->page_offset, (uint) scanner->page[scanner->page_offset])); } while (!translog_is_LSN_chunk(scanner->page[scanner->page_offset]) && scanner->page[scanner->page_offset] != TRANSLOG_FILLER); if (scanner->page[scanner->page_offset] == TRANSLOG_FILLER) { DBUG_PRINT("info", ("End of file")); /* Last record was read */ buff->lsn= LSN_IMPOSSIBLE; /* Return 'end of log' marker */ res= RECHEADER_READ_EOF; } else res= translog_read_record_header_scan(scanner, buff, 0); DBUG_RETURN(res); } /* Moves record data reader to the next chunk and fill the data reader information about that chunk. 
SYNOPSIS translog_record_read_next_chunk() data data cursor RETURN 0 OK 1 Error */ static my_bool translog_record_read_next_chunk(TRANSLOG_READER_DATA *data) { translog_size_t new_current_offset= data->current_offset + data->chunk_size; uint16 chunk_header_len, chunk_len; uint8 type; DBUG_ENTER("translog_record_read_next_chunk"); if (data->eor) { DBUG_PRINT("info", ("end of the record flag set")); DBUG_RETURN(1); } if (data->header.groups_no && data->header.groups_no - 1 != data->current_group && data->header.groups[data->current_group].num == data->current_chunk) { /* Goto next group */ data->current_group++; data->current_chunk= 0; DBUG_PRINT("info", ("skip to group: #%u", data->current_group)); translog_destroy_scanner(&data->scanner); translog_scanner_init(data->header.groups[data->current_group].addr, 1, &data->scanner, 1); } else { data->current_chunk++; if (translog_get_next_chunk(&data->scanner)) DBUG_RETURN(1); if (data->scanner.page == END_OF_LOG) { /* Actually it should not happened, but we want to quit nicely in case of a truncated log */ DBUG_RETURN(1); } } type= data->scanner.page[data->scanner.page_offset] & TRANSLOG_CHUNK_TYPE; if (type == TRANSLOG_CHUNK_LSN && data->header.groups_no) { DBUG_PRINT("info", ("Last chunk: data len: %u offset: %u group: %u of %u", data->header.chunk0_data_len, data->scanner.page_offset, data->current_group, data->header.groups_no - 1)); DBUG_ASSERT(data->header.groups_no - 1 == data->current_group); DBUG_ASSERT(data->header.lsn == data->scanner.page_addr + data->scanner.page_offset); translog_destroy_scanner(&data->scanner); translog_scanner_init(data->header.chunk0_data_addr, 1, &data->scanner, 1); data->chunk_size= data->header.chunk0_data_len; data->body_offset= data->scanner.page_offset; data->current_offset= new_current_offset; data->eor= 1; DBUG_RETURN(0); } if (type == TRANSLOG_CHUNK_LSN || type == TRANSLOG_CHUNK_FIXED) { data->eor= 1; DBUG_RETURN(1); /* End of record */ } chunk_header_len= 
translog_get_chunk_header_length(data->scanner.page + data->scanner.page_offset); chunk_len= translog_get_total_chunk_length(data->scanner.page, data->scanner.page_offset); data->chunk_size= chunk_len - chunk_header_len; data->body_offset= data->scanner.page_offset + chunk_header_len; data->current_offset= new_current_offset; DBUG_PRINT("info", ("grp: %u chunk: %u body_offset: %u chunk_size: %u " "current_offset: %lu", (uint) data->current_group, (uint) data->current_chunk, (uint) data->body_offset, (uint) data->chunk_size, (ulong) data->current_offset)); DBUG_RETURN(0); } /* Initialize record reader data from LSN SYNOPSIS translog_init_reader_data() lsn reference to LSN we should start from data reader data to initialize RETURN 0 OK 1 Error */ static my_bool translog_init_reader_data(LSN lsn, TRANSLOG_READER_DATA *data) { int read_header; DBUG_ENTER("translog_init_reader_data"); if (translog_scanner_init(lsn, 1, &data->scanner, 1) || ((read_header= translog_read_record_header_scan(&data->scanner, &data->header, 1)) == RECHEADER_READ_ERROR)) DBUG_RETURN(1); data->read_header= read_header; data->body_offset= data->header.non_header_data_start_offset; data->chunk_size= data->header.non_header_data_len; data->current_offset= data->read_header; data->current_group= 0; data->current_chunk= 0; data->eor= 0; DBUG_PRINT("info", ("read_header: %u " "body_offset: %u chunk_size: %u current_offset: %lu", (uint) data->read_header, (uint) data->body_offset, (uint) data->chunk_size, (ulong) data->current_offset)); DBUG_RETURN(0); } /** @brief Destroy reader data object */ static void translog_destroy_reader_data(TRANSLOG_READER_DATA *data) { translog_destroy_scanner(&data->scanner); translog_free_record_header(&data->header); } /* Read a part of the record. SYNOPSIS translog_read_record_header() lsn log record serial number (address of the record) offset From the beginning of the record beginning (read by translog_read_record_header). 
length               Length of the record part which has to be read.
    buffer               Buffer where to read the record part (has to be at
                         least 'length' bytes long)
    data                 Reader state; if NULL an internal (stack) reader
                         is used, and then 'lsn' must not be
                         LSN_IMPOSSIBLE.

  NOTE
    The reader is (re)initialized from 'lsn' when 'lsn' is non-zero, or
    when 'offset' lies before the reader's current position and the request
    is not entirely inside the already decoded header.
    The reader data is destroyed on every return path taken after
    initialization, also when 'data' was supplied by the caller.

  RETURN
    length of data actually read (0 if the reader could not be initialized)
*/

translog_size_t translog_read_record(LSN lsn,
                                     translog_size_t offset,
                                     translog_size_t length,
                                     uchar *buffer,
                                     TRANSLOG_READER_DATA *data)
{
  translog_size_t requested_length= length;
  /* Record offset just past the last requested byte */
  translog_size_t end= offset + length;
  TRANSLOG_READER_DATA internal_data;
  DBUG_ENTER("translog_read_record");
  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
              translog_status == TRANSLOG_READONLY);

  if (data == NULL)
  {
    DBUG_ASSERT(lsn != LSN_IMPOSSIBLE);
    data= &internal_data;
  }
  /*
    Start from the record beginning if a record address was given, or if
    the caller steps back behind the current chunk to data which is not
    fully covered by the cached header.
  */
  if (lsn ||
      (offset < data->current_offset &&
       !(offset < data->read_header && offset + length < data->read_header)))
  {
    if (translog_init_reader_data(lsn, data))
      DBUG_RETURN(0);
  }
  DBUG_PRINT("info", ("Offset: %lu length: %lu "
                      "Scanner: Cur: " LSN_FMT " Hrz: " LSN_FMT " "
                      "Lst: " LSN_FMT " Offset: %u(%x) fixed: %d",
                      (ulong) offset, (ulong) length,
                      LSN_IN_PARTS(data->scanner.page_addr),
                      LSN_IN_PARTS(data->scanner.horizon),
                      LSN_IN_PARTS(data->scanner.last_file_page),
                      (uint) data->scanner.page_offset,
                      (uint) data->scanner.page_offset,
                      data->scanner.fixed_horizon));
  /* First serve the part of the request covered by the decoded header */
  if (offset < data->read_header)
  {
    uint16 len= MY_MIN(data->read_header, end) - offset;
    DBUG_PRINT("info",
               ("enter header offset: %lu length: %lu",
                (ulong) offset, (ulong) length));
    memcpy(buffer, data->header.header + offset, len);
    length-= len;
    if (length == 0)
    {
      translog_destroy_reader_data(data);
      DBUG_RETURN(requested_length);
    }
    offset+= len;
    buffer+= len;
    DBUG_PRINT("info",
               ("len: %u offset: %lu curr: %lu length: %lu",
                len, (ulong) offset, (ulong) data->current_offset,
                (ulong) length));
  }
  /* TODO: find first page which we should read by offset */

  /* read the record chunk by chunk */
  for(;;)
  {
    /* Record offset just past the body of the current chunk */
    uint page_end= data->current_offset + data->chunk_size;
    DBUG_PRINT("info",
               ("enter body offset: %lu curr: %lu "
                "length: %lu page_end: %lu",
                (ulong) offset, (ulong) data->current_offset,
                (ulong) length, (ulong) page_end));
    if (offset < page_end)
    {
      uint len= page_end - offset;
      set_if_smaller(len, length); /* in case we read beyond record's end */
      DBUG_ASSERT(offset >= data->current_offset);
      memcpy(buffer,
             data->scanner.page + data->body_offset +
             (offset - data->current_offset), len);
      length-= len;
      if (length == 0)
      {
        translog_destroy_reader_data(data);
        DBUG_RETURN(requested_length);
      }
      offset+= len;
      buffer+= len;
      DBUG_PRINT("info",
                 ("len: %u offset: %lu curr: %lu length: %lu",
                  len, (ulong) offset, (ulong) data->current_offset,
                  (ulong) length));
    }
    /* No more chunks (or an error): return what was copied so far */
    if (translog_record_read_next_chunk(data))
    {
      translog_destroy_reader_data(data);
      DBUG_RETURN(requested_length - length);
    }
  }
}


/*
  @brief Force skipping to the next buffer

  The current buffer is closed and the log cursor (log_descriptor.bc) is
  moved to the next buffer (buffer_no + 1 modulo TRANSLOG_BUFFERS_NO).
  If the current page is only partly filled, its unused rest is filled
  with TRANSLOG_FILLER in the old buffer and the new buffer is started at
  the beginning of that same page, so that the page can be continued there.

  @note The caller must own the translog lock (asserted below).

  @todo Do not copy old page content if all page protections are switched
  off (because we do not need calculate something or change old parts of
  the page)
*/

static void translog_force_current_buffer_to_finish()
{
  TRANSLOG_ADDRESS new_buff_beginning;
  uint16 old_buffer_no= log_descriptor.bc.buffer_no;
  uint16 new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
  struct st_translog_buffer *new_buffer= (log_descriptor.buffers +
                                          new_buffer_no);
  struct st_translog_buffer *old_buffer= log_descriptor.bc.buffer;
  /* Start of the page which is currently being filled */
  uchar *data= log_descriptor.bc.ptr - log_descriptor.bc.current_page_fill;
  /* Number of still unused bytes on that page */
  uint16 left= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill;
  uint16 UNINIT_VAR(current_page_fill), write_counter, previous_offset;
  DBUG_ENTER("translog_force_current_buffer_to_finish");

  DBUG_PRINT("enter", ("Buffer #%u %p "
                       "Buffer addr: " LSN_FMT " "
                       "Page addr: " LSN_FMT " "
                       "size: %lu (%lu) Pg: %u left: %u in progress %u",
                       (uint) old_buffer_no,
                       old_buffer,
                       LSN_IN_PARTS(old_buffer->offset),
                       LSN_FILE_NO(log_descriptor.horizon),
                       (uint)(LSN_OFFSET(log_descriptor.horizon) -
                              log_descriptor.bc.current_page_fill),
                       (ulong) old_buffer->size,
                       (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
                                buffer->buffer),
                       (uint) log_descriptor.bc.current_page_fill,
                       (uint) left,
                       (uint) old_buffer->
                       copy_to_buffer_in_progress));
  translog_lock_assert_owner();
  /* By default the new buffer starts right after the old buffer's data */
  new_buff_beginning= old_buffer->offset;
  new_buff_beginning+= old_buffer->size; /* increase offset */

  DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
  DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
              LSN_FILE_NO(old_buffer->offset) ||
              translog_status == TRANSLOG_READONLY );
  translog_check_cursor(&log_descriptor.bc);
  DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
  if (left)
  {
    /*
       TODO: if 'left' is so small that can't hold any other record
       then do not move the page
    */
    DBUG_PRINT("info", ("left: %u", (uint) left));

    /* Remember where the real data ended before the filler is added */
    old_buffer->pre_force_close_horizon=
      old_buffer->offset + old_buffer->size;
    /* decrease offset */
    new_buff_beginning-= log_descriptor.bc.current_page_fill;
    current_page_fill= log_descriptor.bc.current_page_fill;

    /* Pad the rest of the page in the old buffer */
    memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
    old_buffer->size+= left;
    DBUG_PRINT("info", ("Finish Page buffer #%u: %p "
                        "Size: %lu",
                        (uint) old_buffer->buffer_no,
                        old_buffer,
                        (ulong) old_buffer->size));
    DBUG_ASSERT(old_buffer->buffer_no ==
                log_descriptor.bc.buffer_no);
  }
  else
  {
    log_descriptor.bc.current_page_fill= 0;
  }

  translog_buffer_lock(new_buffer);
#ifndef DBUG_OFF
  {
    TRANSLOG_ADDRESS offset= new_buffer->offset;
    TRANSLOG_FILE *file= new_buffer->file;
    uint8 ver= new_buffer->ver;
    translog_lock_assert_owner();
#endif
    translog_wait_for_buffer_free(new_buffer);
#ifndef DBUG_OFF
    /* We keep the handler locked so nobody can start this new buffer */
    DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
                (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
  }
#endif

  /*
    Save the values which are restored into the cursor right after
    translog_start_buffer() below.
  */
  write_counter= log_descriptor.bc.write_counter;
  previous_offset= log_descriptor.bc.previous_offset;
  translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
  /* Fix buffer offset (which was incorrectly set to horizon) */
  log_descriptor.bc.buffer->offset= new_buff_beginning;
  log_descriptor.bc.write_counter= write_counter;
  log_descriptor.bc.previous_offset= previous_offset;
  new_buffer->prev_last_lsn= BUFFER_MAX_LSN(old_buffer);
  DBUG_PRINT("info", ("prev_last_lsn set to " LSN_FMT " buffer: %p",
                      LSN_IN_PARTS(new_buffer->prev_last_lsn),
                      new_buffer));

  /*
    Advances this log pointer, increases writers and let other threads to
    write to the log while we process old page content
  */
  if (left)
  {
    /* Skip the part of the page which is already used in the old buffer */
    log_descriptor.bc.ptr+= current_page_fill;
    log_descriptor.bc.buffer->size= log_descriptor.bc.current_page_fill=
      current_page_fill;
    new_buffer->overlay= 1;
  }
  else
    translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
  translog_buffer_increase_writers(new_buffer);
  translog_buffer_unlock(new_buffer);

  /*
    We have to wait until all writers finish before start changing the
    pages by applying protection and copying the page content in the
    new buffer.
  */
#ifndef DBUG_OFF
  {
    TRANSLOG_ADDRESS offset= old_buffer->offset;
    TRANSLOG_FILE *file= old_buffer->file;
    uint8 ver= old_buffer->ver;
#endif
    /*
      Now only one thread can flush log (buffer can flush many threads but
      log flush log flush where this function is used can do only one thread)
      so no other thread can set is_closing_buffer.
    */
    DBUG_ASSERT(!old_buffer->is_closing_buffer);
    old_buffer->is_closing_buffer= 1; /* Other flushes will wait */
    DBUG_PRINT("enter", ("Buffer #%u %p is_closing_buffer set",
                         (uint) old_buffer->buffer_no, old_buffer));
    translog_wait_for_writers(old_buffer);
#ifndef DBUG_OFF
    /* We blocked flushing this buffer so the buffer should not changed */
    DBUG_ASSERT(offset == old_buffer->offset && file == old_buffer->file &&
                ver == old_buffer->ver);
  }
#endif

  /* Finalize the last page of the old buffer (protection and/or CRC) */
  if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
  {
    translog_put_sector_protection(data, &log_descriptor.bc);
    if (left)
    {
      log_descriptor.bc.write_counter++;
      log_descriptor.bc.previous_offset= current_page_fill;
    }
    else
    {
      DBUG_PRINT("info", ("drop write_counter"));
      log_descriptor.bc.write_counter= 0;
      log_descriptor.bc.previous_offset= 0;
    }
  }

  if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
  {
    uint32 crc= translog_crc(data + log_descriptor.page_overhead,
                             TRANSLOG_PAGE_SIZE -
                             log_descriptor.page_overhead);
    DBUG_PRINT("info", ("CRC: 0x%x", crc));
    int4store(data + 3 + 3 + 1, crc);
  }
  old_buffer->is_closing_buffer= 0;
  DBUG_PRINT("enter", ("Buffer #%u %p is_closing_buffer cleared",
                       (uint) old_buffer->buffer_no, old_buffer));
  /* Wake up threads waiting on the old buffer's condition */
  mysql_cond_broadcast(&old_buffer->waiting_filling_buffer);

  if (left)
  {
    /*
      Carry the used part of the unfinished page over to the start of the
      new buffer; it is copied only if a page protection is enabled.
    */
    if (log_descriptor.flags &
        (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
      memcpy(new_buffer->buffer, data, current_page_fill);
    else
    {
      /*
        This page header does not change if we add more data to the page so
        we can not copy it and will not overwrite later
      */
      new_buffer->skipped_data= current_page_fill;
      TRASH_ALLOC(new_buffer->buffer, current_page_fill);
      DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
    }
  }
  /* Link the two buffers and release the writer reference taken above */
  old_buffer->next_buffer_offset= new_buffer->offset;
  translog_buffer_lock(new_buffer);
  new_buffer->prev_buffer_offset= old_buffer->offset;
  translog_buffer_decrease_writers(new_buffer);
  translog_buffer_unlock(new_buffer);

  DBUG_VOID_RETURN;
}


/**
  @brief Waits while given lsn will be flushed

  @param  lsn            log record serial number up to
which (inclusive) the log has to be flushed */ void translog_flush_wait_for_end(LSN lsn) { DBUG_ENTER("translog_flush_wait_for_end"); DBUG_PRINT("enter", ("LSN: " LSN_FMT, LSN_IN_PARTS(lsn))); mysql_mutex_assert_owner(&log_descriptor.log_flush_lock); while (cmp_translog_addr(log_descriptor.flushed, lsn) < 0) mysql_cond_wait(&log_descriptor.log_flush_cond, &log_descriptor.log_flush_lock); DBUG_VOID_RETURN; } /** @brief Sets goal for the next flush pass and waits for this pass end. @param lsn log record serial number up to which (inclusive) the log has to be flushed */ void translog_flush_set_new_goal_and_wait(TRANSLOG_ADDRESS lsn) { int flush_no= log_descriptor.flush_no; DBUG_ENTER("translog_flush_set_new_goal_and_wait"); DBUG_PRINT("enter", ("LSN: " LSN_FMT, LSN_IN_PARTS(lsn))); mysql_mutex_assert_owner(&log_descriptor.log_flush_lock); if (cmp_translog_addr(lsn, log_descriptor.next_pass_max_lsn) > 0) { log_descriptor.next_pass_max_lsn= lsn; log_descriptor.max_lsn_requester= pthread_self(); mysql_cond_broadcast(&log_descriptor.new_goal_cond); } while (flush_no == log_descriptor.flush_no) { mysql_cond_wait(&log_descriptor.log_flush_cond, &log_descriptor.log_flush_lock); } DBUG_VOID_RETURN; } /** @brief sync() range of files (inclusive) and directory (by request) @param min min internal file number to flush @param max max internal file number to flush @param sync_dir need sync directory return Operation status @retval 0 OK @retval 1 Error */ static my_bool translog_sync_files(uint32 min, uint32 max, my_bool sync_dir) { uint fn; my_bool rc= 0; ulonglong flush_interval; DBUG_ENTER("translog_sync_files"); DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d", (ulong) min, (ulong) max, (int) sync_dir)); DBUG_ASSERT(min <= max); flush_interval= group_commit_wait; if (flush_interval) flush_start= microsecond_interval_timer(); for (fn= min; fn <= max; fn++) { TRANSLOG_FILE *file= get_logfile_by_number(fn); DBUG_ASSERT(file != NULL); if (!file->is_sync) { if 
(mysql_file_sync(file->handler.file, MYF(MY_WME))) { rc= 1; translog_stop_writing(); DBUG_RETURN(rc); } translog_syncs++; file->is_sync= 1; } } if (sync_dir) { if (!(rc= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD)))) translog_syncs++; } DBUG_RETURN(rc); } /** check_skipped_lsn Check if lsn skipped in redo is ok */ void check_skipped_lsn(MARIA_HA *info, LSN lsn, my_bool index_file, pgcache_page_no_t page) { if (lsn <= log_descriptor.horizon) { DBUG_PRINT("info", ("Page is up to date, skipping redo")); } else { /* Give error, but don't flood the log */ if (skipped_lsn_err_count++ < MAX_LSN_ERRORS && ! info->s->redo_error_given++) { eprint(tracef, "Table %s has wrong LSN: " LSN_FMT " on page: %llu", (index_file ? info->s->data_file_name.str : info->s->index_file_name.str), LSN_IN_PARTS(lsn), (ulonglong) page); recovery_found_crashed_tables++; } } } /* @brief Flushes buffers with LSNs in them less or equal address @param lsn address up to which all LSNs should be flushed, can be reset to real last LSN address @parem sent_to_disk returns 'sent to disk' position @param flush_horizon returns horizon of the flush @note About terminology see comment to translog_flush(). 
*/

void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
                            TRANSLOG_ADDRESS *sent_to_disk,
                            TRANSLOG_ADDRESS *flush_horizon)
{
  dirty_buffer_mask_t dirty_buffer_mask;
  uint i;
  uint8 UNINIT_VAR(last_buffer_no), start_buffer_no;
  DBUG_ENTER("translog_flush_buffers");

  /*
    translog_flush() calls this after translog_lock() and relies on this
    function to release that lock: every path below either calls
    translog_unlock() or unlocks the (former) current buffer.
  */

  /*
    We will recheck information when will lock buffers one by
    one so we can use unprotected read here (this is just for
    speed up buffers processing)
  */
  dirty_buffer_mask= log_descriptor.dirty_buffer_mask;
  DBUG_PRINT("info", ("Dirty buffer mask: %lx current buffer: %u",
                      (ulong) dirty_buffer_mask,
                      (uint) log_descriptor.bc.buffer_no));
  /*
    Find the first dirty buffer after the current one (cyclically); stop
    at the current buffer if no other buffer is marked dirty.
  */
  for (i= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO;
       i != log_descriptor.bc.buffer_no && !(dirty_buffer_mask & (1 << i));
       i= (i + 1) % TRANSLOG_BUFFERS_NO) {}
  start_buffer_no= i;

  DBUG_PRINT("info",
             ("start from: %u current: %u prev last lsn: " LSN_FMT,
              (uint) start_buffer_no, (uint) log_descriptor.bc.buffer_no,
              LSN_IN_PARTS(log_descriptor.bc.buffer->prev_last_lsn)));

  /*
    if LSN up to which we have to flush bigger then maximum LSN of previous
    buffer and at least one LSN was saved in the current buffer (last_lsn !=
    LSN_IMPOSSIBLE) then we have to close the current buffer.
  */
  if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
      log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
  {
    /* Remember the buffer: the call below switches the current buffer */
    struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
    *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
    DBUG_PRINT("info", ("LSN to flush fixed to last lsn: " LSN_FMT,
                        LSN_IN_PARTS(*lsn)));
    last_buffer_no= log_descriptor.bc.buffer_no;
    log_descriptor.is_everything_flushed= 1;
    translog_force_current_buffer_to_finish();
    translog_buffer_unlock(buffer);
  }
  else
  {
    if (log_descriptor.bc.buffer->last_lsn == LSN_IMPOSSIBLE &&
        log_descriptor.bc.buffer->prev_last_lsn == LSN_IMPOSSIBLE)
    {
      DBUG_PRINT("info", ("There is no LSNs yet generated => do nothing"));
      translog_unlock();
      DBUG_VOID_RETURN;
    }

    /* fix lsn if it was horizon */
    *lsn= log_descriptor.bc.buffer->prev_last_lsn;
    DBUG_PRINT("info", ("LSN to flush fixed to prev last lsn: " LSN_FMT,
                        LSN_IN_PARTS(*lsn)));
    /* The buffer before the current one is the last to be flushed */
    last_buffer_no= ((log_descriptor.bc.buffer_no + TRANSLOG_BUFFERS_NO -1) %
                     TRANSLOG_BUFFERS_NO);
    translog_unlock();
  }
  /* flush buffers */
  *sent_to_disk= translog_get_sent_to_disk();
  if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
  {

    DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
                        (uint) start_buffer_no, (uint) last_buffer_no));
    /* Turn the inclusive last buffer number into an exclusive end marker */
    last_buffer_no= (last_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
    i= start_buffer_no;
    do
    {
      struct st_translog_buffer *buffer= log_descriptor.buffers + i;
      translog_buffer_lock(buffer);
      DBUG_PRINT("info", ("Check buffer: %p #: %u "
                          "prev last LSN: " LSN_FMT " "
                          "last LSN: " LSN_FMT " status: %s",
                          buffer,
                          (uint) i,
                          LSN_IN_PARTS(buffer->prev_last_lsn),
                          LSN_IN_PARTS(buffer->last_lsn),
                          (buffer->file ? "dirty" : "closed")));
      /* Re-check under the buffer lock: flush only buffers still in use */
      if (buffer->prev_last_lsn <= *lsn &&
          buffer->file != NULL)
      {
        DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
        /*
          For a buffer which was force-closed, use the horizon recorded
          before the filler was appended.
        */
        *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
                         buffer->pre_force_close_horizon :
                         buffer->offset + buffer->size);
        /* pre_force_close_horizon is reset during new buffer start */
        DBUG_PRINT("info", ("flush_horizon: " LSN_FMT,
                            LSN_IN_PARTS(*flush_horizon)));
        DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);

        translog_buffer_flush(buffer);
      }
      translog_buffer_unlock(buffer);
      i= (i + 1) % TRANSLOG_BUFFERS_NO;
    } while (i != last_buffer_no);
    *sent_to_disk= translog_get_sent_to_disk();
  }

  DBUG_VOID_RETURN;
}


/**
  @brief Flush the log up to given LSN (included)

  @param  lsn            log record serial number up to which (inclusive)
                         the log has to be flushed

  @return Operation status
    @retval 0      OK
    @retval 1      Error

  @note

  - Non group commit logic: Commits made in passes. Thread which started
  flush first is performing actual flush, other threads sets new goal (LSN)
  of the next pass (if it is maximum) and waits for the pass end or just
  wait for the pass end.

  - If hard group commit enabled and rate set to zero:
  The first thread sends all changed buffers to disk. This is repeated
  as long as there are new LSNs added. The process can not loop
  forever because we have limited number of threads and they will wait
  for the data to be synced.
  Pseudo code:

   do
     send changed buffers to disk
   while new_goal
   sync

  - If hard group commit switched ON and less than rate microseconds has
  passed from last sync, then after buffers have been sent to disk
  wait until rate microseconds has passed since last sync, do sync and return.
  This ensures that if we call sync infrequently we don't do any waits.

  - If soft group commit enabled everything works as with 'non group commit'
  but the thread doesn't do any real sync(). If rate is not zero the
  sync() will be performed by a service thread with the given rate
  when needed (new LSN appears).

  @note Terminology:
  'sent to disk' means written to disk but not sync()ed,
  'flushed' mean sent to disk and synced().
*/ my_bool translog_flush(TRANSLOG_ADDRESS lsn) { struct timespec abstime; ulonglong UNINIT_VAR(flush_interval); ulonglong time_spent; LSN sent_to_disk= LSN_IMPOSSIBLE; TRANSLOG_ADDRESS flush_horizon; my_bool rc= 0; my_bool hgroup_commit_at_start; DBUG_ENTER("translog_flush"); DBUG_PRINT("enter", ("Flush up to LSN: " LSN_FMT, LSN_IN_PARTS(lsn))); DBUG_ASSERT(translog_status == TRANSLOG_OK || translog_status == TRANSLOG_READONLY); mysql_mutex_lock(&log_descriptor.log_flush_lock); DBUG_PRINT("info", ("Everything is flushed up to " LSN_FMT, LSN_IN_PARTS(log_descriptor.flushed))); if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0) { mysql_mutex_unlock(&log_descriptor.log_flush_lock); DBUG_RETURN(0); } if (log_descriptor.flush_in_progress) { translog_lock(); /* fix lsn if it was horizon */ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0) lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer); translog_unlock(); translog_flush_set_new_goal_and_wait(lsn); if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self())) { /* translog_flush_wait_for_end() release log_flush_lock while is waiting then acquire it again */ translog_flush_wait_for_end(lsn); mysql_mutex_unlock(&log_descriptor.log_flush_lock); DBUG_RETURN(0); } log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE; } log_descriptor.flush_in_progress= 1; flush_horizon= log_descriptor.previous_flush_horizon; DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: " LSN_FMT, LSN_IN_PARTS(flush_horizon))); mysql_mutex_unlock(&log_descriptor.log_flush_lock); hgroup_commit_at_start= hard_group_commit; if (hgroup_commit_at_start) flush_interval= group_commit_wait; translog_lock(); if (log_descriptor.is_everything_flushed) { DBUG_PRINT("info", ("everything is flushed")); translog_unlock(); mysql_mutex_lock(&log_descriptor.log_flush_lock); goto out; } for (;;) { /* Following function flushes buffers and makes translog_unlock() */ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon); if 
(!hgroup_commit_at_start) break; /* flush pass is ended */ retest: /* We do not check time here because mysql_mutex_lock rarely takes a lot of time so we can sacrifice a bit precision to performance (taking into account that microsecond_interval_timer() might be expensive call). */ if (flush_interval == 0) break; /* flush pass is ended */ mysql_mutex_lock(&log_descriptor.log_flush_lock); if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE) { if (flush_interval == 0 || (time_spent= (microsecond_interval_timer() - flush_start)) >= flush_interval) { mysql_mutex_unlock(&log_descriptor.log_flush_lock); break; } DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu", flush_interval - time_spent, flush_interval, time_spent)); /* wait time or next goal */ set_timespec_nsec(abstime, flush_interval - time_spent); mysql_cond_timedwait(&log_descriptor.new_goal_cond, &log_descriptor.log_flush_lock, &abstime); mysql_mutex_unlock(&log_descriptor.log_flush_lock); DBUG_PRINT("info", ("retest conditions")); goto retest; } /* take next goal */ lsn= log_descriptor.next_pass_max_lsn; log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE; /* prevent other thread from continue */ log_descriptor.max_lsn_requester= pthread_self(); DBUG_PRINT("info", ("flush took next goal: " LSN_FMT, LSN_IN_PARTS(lsn))); mysql_mutex_unlock(&log_descriptor.log_flush_lock); /* next flush pass */ DBUG_PRINT("info", ("next flush pass")); translog_lock(); } /* sync() files from previous flush till current one */ if (!soft_sync || hgroup_commit_at_start) { if ((rc= translog_sync_files(LSN_FILE_NO(log_descriptor.flushed), LSN_FILE_NO(lsn), sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS && (LSN_FILE_NO(log_descriptor. previous_flush_horizon) != LSN_FILE_NO(flush_horizon) || (LSN_OFFSET(log_descriptor. 
previous_flush_horizon) / TRANSLOG_PAGE_SIZE) != (LSN_OFFSET(flush_horizon) / TRANSLOG_PAGE_SIZE))))) { sent_to_disk= LSN_IMPOSSIBLE; mysql_mutex_lock(&log_descriptor.log_flush_lock); goto out; } /* keep values for soft sync() and forced sync() actual */ { uint32 fileno= LSN_FILE_NO(lsn); soft_sync_min= fileno; soft_sync_max= fileno; } } else { soft_sync_max= LSN_FILE_NO(lsn); soft_need_sync= 1; } DBUG_ASSERT(flush_horizon <= log_descriptor.horizon); mysql_mutex_lock(&log_descriptor.log_flush_lock); log_descriptor.previous_flush_horizon= flush_horizon; out: if (sent_to_disk != LSN_IMPOSSIBLE) log_descriptor.flushed= sent_to_disk; log_descriptor.flush_in_progress= 0; log_descriptor.flush_no++; DBUG_PRINT("info", ("flush_in_progress is dropped")); mysql_mutex_unlock(&log_descriptor.log_flush_lock); mysql_cond_broadcast(&log_descriptor.log_flush_cond); DBUG_RETURN(rc); } /** @brief Gives a 2-byte-id to MARIA_SHARE and logs this fact If a MARIA_SHARE does not yet have a 2-byte-id (unique over all currently open MARIA_SHAREs), give it one and record this assignment in the log (LOGREC_FILE_ID log record). @param tbl_info table @param trn calling transaction @return Operation status @retval 0 OK @retval 1 Error @note Can be called even if share already has an id (then will do nothing) */ int translog_assign_id_to_share(MARIA_HA *tbl_info, TRN *trn) { uint16 id; MARIA_SHARE *share= tbl_info->s; /* If you give an id to a non-BLOCK_RECORD table, you also need to release this id somewhere. Then you can change the assertion. 
*/ DBUG_ASSERT(share->data_file_type == BLOCK_RECORD); /* re-check under mutex to avoid having 2 ids for the same share */ mysql_mutex_lock(&share->intern_lock); if (unlikely(share->id == 0)) { LSN lsn; LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; uchar log_data[FILEID_STORE_SIZE]; /* Inspired by set_short_trid() of trnman.c */ uint i= share->kfile.file % SHARE_ID_MAX + 1; id= 0; do { for ( ; i <= SHARE_ID_MAX ; i++) /* the range is [1..SHARE_ID_MAX] */ { void *tmp= NULL; if (id_to_share[i] == NULL && my_atomic_casptr((void **)&id_to_share[i], &tmp, share)) { id= (uint16) i; break; } } i= 1; /* scan the whole array */ } while (id == 0); DBUG_PRINT("info", ("id_to_share: %p -> %u", share, id)); fileid_store(log_data, id); log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); /* open_file_name is an unresolved name (symlinks are not resolved, datadir is not realpath-ed, etc) which is good: the log can be moved to another directory and continue working. */ log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (uchar *)share->open_file_name.str; log_array[TRANSLOG_INTERNAL_PARTS + 1].length= share->open_file_name.length + 1; /* We can't unlock share->intern_lock before the log entry is written to ensure no one uses the id before it's logged. */ if (unlikely(translog_write_record(&lsn, LOGREC_FILE_ID, trn, tbl_info, (translog_size_t) (sizeof(log_data) + log_array[TRANSLOG_INTERNAL_PARTS + 1].length), sizeof(log_array)/sizeof(log_array[0]), log_array, NULL, NULL))) { mysql_mutex_unlock(&share->intern_lock); return 1; } /* Now when translog record is done, we can set share->id. If we set it before, then translog_write_record may pick up the id before it's written to the log. */ share->id= id; share->state.logrec_file_id= lsn; } mysql_mutex_unlock(&share->intern_lock); return 0; } /** @brief Recycles a MARIA_SHARE's short id. @param share table @note Must be called only if share has an id (i.e. 
id != 0) */
void translog_deassign_id_from_share(MARIA_SHARE *share)
{
  DBUG_PRINT("info", ("id_to_share: %p id %u -> 0", share, share->id));
  /*
    We don't need any mutex as we are called only when closing the last
    instance of the table or at the end of REPAIR: no writes can be
    happening. But a Checkpoint may be reading share->id, so we require this
    mutex:
  */
  mysql_mutex_assert_owner(&share->intern_lock);
  /* Free the slot first, then forget the id in the share itself */
  my_atomic_storeptr((void **)&id_to_share[share->id], 0);
  share->id= 0;
  /* useless but safety: */
  share->lsn_of_file_id= LSN_IMPOSSIBLE;
}


/**
  @brief Binds a share to a short id dictated by the log during recovery

  @param  share              table which gets the id
  @param  id                 short id to use; its slot in id_to_share must
                             be free (asserted)

  @note Asserted preconditions: recovery is running single-threaded, the
  share uses BLOCK_RECORD format and has no id yet. No locking is done here.
*/

void translog_assign_id_to_share_from_recovery(MARIA_SHARE *share, uint16 id)
{
  DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded);
  DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
  DBUG_ASSERT(share->id == 0);
  DBUG_ASSERT(id_to_share[id] == NULL);
  /* Sets share->id and registers the share in the slot in one statement */
  id_to_share[share->id= id]= share;
}


/**
  @brief check if such log file exists

  @param file_no             number of the file to test

  @retval 0 no such file (mysql_file_stat() failed on its name)
  @retval 1 there is file with such number
*/

my_bool translog_is_file(uint file_no)
{
  MY_STAT stat_buff;
  char path[FN_REFLEN];
  return (MY_TEST(mysql_file_stat(key_file_translog,
                                  translog_filename_by_fileno(file_no, path),
                                  &stat_buff, MYF(0))));
}


/**
  @brief returns minimum log file number

  @param horizon             the end of the log
  @param is_protected        true if it is under purge_log protection
                             (the caller already holds
                             log_descriptor.purger_lock, so it is neither
                             taken nor released here)

  @note The result is cached in log_descriptor.min_file_number and the cache
  is trusted as long as that file still exists. If the file the horizon
  points into does not exist, the horizon's file number is returned as is
  and the cache is left untouched.

  @retval minimum file number
  @retval 0 no files found
*/

static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected)
{
  uint min_file= 1, max_file;
  DBUG_ENTER("translog_first_file");
  if (!is_protected)
    mysql_mutex_lock(&log_descriptor.purger_lock);
  if (log_descriptor.min_file_number)
  {
    /* Even a stale cached value is a valid lower bound for the search */
    min_file= log_descriptor.min_file_number;
    if (translog_is_file(log_descriptor.min_file_number))
    {
      DBUG_PRINT("info", ("cached %lu",
                          (ulong) log_descriptor.min_file_number));
      if (!is_protected)
        mysql_mutex_unlock(&log_descriptor.purger_lock);
      DBUG_RETURN(log_descriptor.min_file_number);
    }
  }

  max_file= LSN_FILE_NO(horizon);
  if (!translog_is_file(max_file))
  {
    if (!is_protected)
      mysql_mutex_unlock(&log_descriptor.purger_lock);
    DBUG_RETURN(max_file); /* For compatibility */
  }

  /*
    Binary search for the first existing file: an existing file lowers the
    upper bound, a missing one raises the lower bound past it.
  */
  while (min_file < max_file)
  {
    uint test= (min_file + max_file) / 2;
    DBUG_PRINT("info", ("min_file: %u test: %u max_file: %u",
                        min_file, test, max_file));

    if (translog_is_file(test))
      max_file= test;
    else
      min_file= test+1;
  }
  log_descriptor.min_file_number= max_file;
  if (!is_protected)
    mysql_mutex_unlock(&log_descriptor.purger_lock);
  DBUG_PRINT("info", ("first file :%lu", (ulong) max_file));
  DBUG_ASSERT(max_file >= 1);
  DBUG_RETURN(max_file);
}


/**
  @brief returns the most close LSN higher the given chunk address

  @param addr                the chunk address to start from
  @param horizon             the horizon if it is known or LSN_IMPOSSIBLE
                             (then translog_get_horizon() is used)

  @retval LSN_ERROR          Error
  @retval LSN_IMPOSSIBLE     no LSNs after the address (addr is the horizon,
                             the end of the log or a page filler was reached)
  @retval #                  LSN of the most close LSN higher the given chunk
                             address
*/

LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon)
{
  TRANSLOG_SCANNER_DATA scanner;
  LSN result;
  DBUG_ENTER("translog_next_LSN");

  if (horizon == LSN_IMPOSSIBLE)
    horizon= translog_get_horizon();

  if (addr == horizon)
    DBUG_RETURN(LSN_IMPOSSIBLE);

  /* NOTE(review): the result of translog_scanner_init() is not checked */
  translog_scanner_init(addr, 0, &scanner, 1);
  /*
    addr can point not to a chunk beginning but page end so next
    page beginning.
  */
  if (addr % TRANSLOG_PAGE_SIZE == 0)
  {
    /*
      We are emulating the page end which caused such horizon value to
      trigger translog_scanner_eop().

      We can't just increase addr on page header overhead because it
      can be file end so we allow translog_get_next_chunk() to skip
      to the next page in correct way
    */
    scanner.page_addr-= TRANSLOG_PAGE_SIZE;
    scanner.page_offset= TRANSLOG_PAGE_SIZE;
#ifndef DBUG_OFF
    scanner.page= NULL; /* prevent using incorrect page content */
#endif
  }
  /* addr can point not to a chunk beginning but to a page end */
  if (translog_scanner_eop(&scanner))
  {
    if (translog_get_next_chunk(&scanner))
    {
      result= LSN_ERROR;
      goto out;
    }
    if (scanner.page == END_OF_LOG)
    {
      result= LSN_IMPOSSIBLE;
      goto out;
    }
  }

  /* Skip chunks until one starts a record or the page filler begins */
  while (!translog_is_LSN_chunk(scanner.page[scanner.page_offset]) &&
         scanner.page[scanner.page_offset] != TRANSLOG_FILLER)
  {
    if (translog_get_next_chunk(&scanner))
    {
      result= LSN_ERROR;
      goto out;
    }
    if (scanner.page == END_OF_LOG)
    {
      result= LSN_IMPOSSIBLE;
      goto out;
    }
  }

  if (scanner.page[scanner.page_offset] == TRANSLOG_FILLER)
    result= LSN_IMPOSSIBLE; /* reached page filler */
  else
    result= scanner.page_addr + scanner.page_offset;
out:
  /* Single exit point so the scanner is always destroyed */
  translog_destroy_scanner(&scanner);
  DBUG_RETURN(result);
}


/**
  @brief returns the LSN of the first record starting in this log

  @retval LSN_ERROR          Error
  @retval LSN_IMPOSSIBLE     no log or the log is empty
  @retval #                  LSN of the first record
*/

LSN translog_first_lsn_in_log()
{
  TRANSLOG_ADDRESS addr, horizon= translog_get_horizon();
  TRANSLOG_VALIDATOR_DATA data;
  uint file;
  uint16 chunk_offset;
  uchar *page;
  DBUG_ENTER("translog_first_lsn_in_log");
  DBUG_PRINT("info", ("Horizon: " LSN_FMT, LSN_IN_PARTS(horizon)));
  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
              translog_status == TRANSLOG_READONLY);

  if (!(file= translog_first_file(horizon, 0)))
  {
    /* log has no records yet */
    DBUG_RETURN(LSN_IMPOSSIBLE);
  }

  addr= MAKE_LSN(file, TRANSLOG_PAGE_SIZE); /* the first page of the file */
  data.addr= &addr;
  {
    /* The page buffer lives in its own scope; it is not needed afterwards */
    TRANSLOG_PAGE_SIZE_BUFF psize_buff;
    if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL ||
        (chunk_offset= translog_get_first_chunk_offset(page)) == 0)
      DBUG_RETURN(LSN_ERROR);
  }
  addr+= chunk_offset;

  /* The first chunk need not start a record: look for the next LSN chunk */
  DBUG_RETURN(translog_next_LSN(addr, horizon));
}


/**
  @brief Returns theoretical first LSN if first log is present

  @note Only file number 1 is considered. For a log without records the
  address is computed from log_descriptor.page_overhead; otherwise the first
  page is read and the overhead matching its flags byte is used.

  @retval LSN_ERROR          Error
  @retval LSN_IMPOSSIBLE     no log
  @retval #                  LSN of the first record
*/

LSN translog_first_theoretical_lsn()
{
  TRANSLOG_ADDRESS addr= translog_get_horizon();
  TRANSLOG_PAGE_SIZE_BUFF psize_buff;
  uchar *page;
  TRANSLOG_VALIDATOR_DATA data;
  DBUG_ENTER("translog_first_theoretical_lsn");
  DBUG_PRINT("info", ("Horizon: " LSN_FMT, LSN_IN_PARTS(addr)));
  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
              translog_status == TRANSLOG_READONLY);

  if (!translog_is_file(1))
    DBUG_RETURN(LSN_IMPOSSIBLE);
  if (addr == MAKE_LSN(1, TRANSLOG_PAGE_SIZE))
  {
    /* log has no records yet */
    DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
                         log_descriptor.page_overhead));
  }

  addr= MAKE_LSN(1, TRANSLOG_PAGE_SIZE); /* the first page of the file */
  data.addr= &addr;
  if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL)
    DBUG_RETURN(LSN_ERROR);

  DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
                       page_overhead[page[TRANSLOG_PAGE_FLAGS]]));
}


/**
  @brief Checks given low water mark and purge files if it is need

  @param low                 the last (minimum) address which is need

  @note Files are walked from the first existing one upwards; the walk stops
  at the first file which is still being written, cannot be read, or stores
  an LSN which is not below the low water mark. Files are deleted here only
  for the TRANSLOG_PURGE_IMMIDIATE purge type and while purge is not
  disabled; the number of the first file which is still needed is remembered
  in log_descriptor.min_need_file (0 after an error).

  @retval 0 OK
  @retval 1 Error
*/

my_bool translog_purge(TRANSLOG_ADDRESS low)
{
  uint32 last_need_file= LSN_FILE_NO(low);
  uint32 min_unsync;
  int soft;
  TRANSLOG_ADDRESS horizon= translog_get_horizon();
  int rc= 0;
  DBUG_ENTER("translog_purge");
  DBUG_PRINT("enter", ("low: " LSN_FMT, LSN_IN_PARTS(low)));
  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
              translog_status == TRANSLOG_READONLY);

  /* In soft sync mode files which may still be unsynced are kept too */
  soft= soft_sync;
  min_unsync= soft_sync_min;
  DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
  if (soft && min_unsync < last_need_file)
  {
    last_need_file= min_unsync;
    DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
  }

  mysql_mutex_lock(&log_descriptor.purger_lock);
  DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
                      (ulong) log_descriptor.last_lsn_checked));
  if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
  {
    uint32 i;
    /* purger_lock is already held, hence is_protected == 1 */
    uint32 min_file= translog_first_file(horizon, 1);
    DBUG_ASSERT(min_file != 0); /* log is already started */
    DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
    for(i= min_file; i < last_need_file && rc == 0; i++)
    {
      LSN lsn= translog_get_file_max_lsn_stored(i);
      if (lsn == LSN_IMPOSSIBLE)
        break; /* files are still in writing */
      if (lsn == LSN_ERROR)
      {
        rc= 1;
        break;
      }
      /* The file holds something at or above the low water mark: keep it */
      if (cmp_translog_addr(lsn, low) >= 0)
        break;

      DBUG_PRINT("info", ("purge file %lu", (ulong) i));

      /* remove file descriptor from the cache */
      /*
        log_descriptor.min_file can be changed only here during execution
        and the function is serialized, so we can access it without problems
      */
      if (i >= log_descriptor.min_file)
      {
        TRANSLOG_FILE *file;
        mysql_rwlock_wrlock(&log_descriptor.open_files_lock);
        DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
                    log_descriptor.open_files.elements);
        DBUG_ASSERT(log_descriptor.min_file == i);
        file= *((TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files));
        DBUG_PRINT("info", ("Files : %d", log_descriptor.open_files.elements));
        DBUG_ASSERT(i == file->number);
        log_descriptor.min_file++;
        DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
                    log_descriptor.open_files.elements);
        mysql_rwlock_unlock(&log_descriptor.open_files_lock);
        /* The descriptor is closed only after the rwlock is released */
        translog_close_log_file(file);
      }
      if (log_purge_type == TRANSLOG_PURGE_IMMIDIATE && ! log_purge_disabled)
      {
        char path[FN_REFLEN], *file_name;
        file_name= translog_filename_by_fileno(i, path);
        rc= MY_TEST(mysql_file_delete(key_file_translog,
                                      file_name, MYF(MY_WME)));
      }
    }
    if (unlikely(rc == 1))
      log_descriptor.min_need_file= 0; /* impossible value */
    else
      log_descriptor.min_need_file= i;
  }

  mysql_mutex_unlock(&log_descriptor.purger_lock);
  DBUG_RETURN(rc);
}


/**
  @brief Purges files by stored min need file in case of
    "on demand" purge type

  @note This function do real work only if it is "on demand" purge type
  and translog_purge() was called at least once and last time without
  errors. It also does nothing for a read only log or while purge is
  disabled.

  @retval 0 OK
  @retval 1 Error
*/

my_bool translog_purge_at_flush()
{
  uint32 i, min_file;
  int rc= 0;
  DBUG_ENTER("translog_purge_at_flush");
  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
              translog_status == TRANSLOG_READONLY);

  if (unlikely(translog_status == TRANSLOG_READONLY))
  {
    DBUG_PRINT("info", ("The log is read only => exit"));
    DBUG_RETURN(0);
  }

  if (log_purge_type != TRANSLOG_PURGE_ONDEMAND)
  {
    DBUG_PRINT("info", ("It is not \"at_flush\" => exit"));
    DBUG_RETURN(0);
  }

  mysql_mutex_lock(&log_descriptor.purger_lock);

  if (unlikely(log_descriptor.min_need_file == 0 || log_purge_disabled))
  {
    DBUG_PRINT("info", ("No info about min need file => exit"));
    mysql_mutex_unlock(&log_descriptor.purger_lock);
    DBUG_RETURN(0);
  }

  min_file= translog_first_file(translog_get_horizon(), 1);
  DBUG_ASSERT(min_file != 0); /* log is already started */
  /* Delete every file below the first one which is still needed */
  for(i= min_file; i < log_descriptor.min_need_file ; i++)
  {
    char path[FN_REFLEN], *file_name;
    DBUG_PRINT("info", ("purge file %lu\n", (ulong) i));
    file_name= translog_filename_by_fileno(i, path);
    /* Errors are accumulated; the remaining files are still tried */
    rc|= MY_TEST(mysql_file_delete(key_file_translog,
                                   file_name, MYF(MY_WME)));
    DBUG_ASSERT(rc == 0);
  }

  mysql_mutex_unlock(&log_descriptor.purger_lock);
  DBUG_RETURN(rc);
}


/**
  @brief Gets min file number

  @param horizon             the end of the log

  @note Public wrapper around translog_first_file() for callers which do
  not hold the purger lock.

  @retval minimum file number
  @retval 0 no files found
*/

uint32 translog_get_first_file(TRANSLOG_ADDRESS horizon)
{
  return translog_first_file(horizon, 0);
}


/**
  @brief Gets min file number which is needed

  @note Returns log_descriptor.min_need_file read under the purger lock.

  @retval minimum file number
  @retval 0 unknown
*/

uint32 translog_get_first_needed_file()
{
  uint32 file_no;
  mysql_mutex_lock(&log_descriptor.purger_lock);
  file_no= log_descriptor.min_need_file;
  mysql_mutex_unlock(&log_descriptor.purger_lock);
  return file_no;
}


/**
  @brief Gets transaction log file size

  @return transaction log file size (log_descriptor.log_file_max_size, read
  under translog_lock())
*/

uint32 translog_get_file_size()
{
  uint32 res;
  translog_lock();
  res= log_descriptor.log_file_max_size;
  translog_unlock();
  return (res);
}


/**
  @brief Sets transaction log file size

  @param size                new maximum size of a log file; asserted to be
                             a multiple of TRANSLOG_PAGE_SIZE and not less
                             than TRANSLOG_MIN_FILE_SIZE

  @note The function returns nothing. If the current file has already
  reached the new limit, the log is switched to the next buffer (and a new
  file) and the old buffer is flushed after the log lock is released.
*/

void translog_set_file_size(uint32 size)
{
  struct st_translog_buffer *old_buffer= NULL;
  DBUG_ENTER("translog_set_file_size");
  translog_lock();
  DBUG_PRINT("enter", ("Size: %lu", (ulong) size));
  DBUG_ASSERT(size % TRANSLOG_PAGE_SIZE == 0);
  DBUG_ASSERT(size >= TRANSLOG_MIN_FILE_SIZE);
  log_descriptor.log_file_max_size= size;
  /* if current file longer then finish it*/
  if (LSN_OFFSET(log_descriptor.horizon) >=  log_descriptor.log_file_max_size)
  {
    old_buffer= log_descriptor.bc.buffer;
    translog_buffer_next(&log_descriptor.horizon, &log_descriptor.bc, 1);
    translog_buffer_unlock(old_buffer);
  }
  translog_unlock();
  if (old_buffer)
  {
    /* The old buffer is re-locked on its own for the flush */
    translog_buffer_lock(old_buffer);
    translog_buffer_flush(old_buffer);
    translog_buffer_unlock(old_buffer);
  }
  DBUG_VOID_RETURN;
}


/**
   Write debug information to log if we EXTRA_DEBUG is enabled

   @param trn                transaction to log under; if NULL,
                             dummy_transaction_object is used
   @param type               kind of debug information (stored as one byte)
   @param info               payload
   @param length             length of the payload in bytes

   @return result of translog_write_record(); always 0 when compiled
           without EXTRA_DEBUG
*/

my_bool translog_log_debug_info(TRN *trn __attribute__((unused)),
                                enum translog_debug_info_type type
                                __attribute__((unused)),
                                uchar *info __attribute__((unused)),
                                size_t length __attribute__((unused)))
{
#ifdef EXTRA_DEBUG
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
  uchar debug_type;
  LSN lsn;

  if (!trn)
  {
    /*
      We can't log the current transaction because we don't have
      an active transaction. Use a temporary transaction object instead
    */
    trn= &dummy_transaction_object;
  }
  /* The record is the one byte type followed by the payload */
  debug_type= (uchar) type;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].str= &debug_type;
  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= 1;
  log_array[TRANSLOG_INTERNAL_PARTS + 1].str= info;
  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
  return translog_write_record(&lsn, LOGREC_DEBUG_INFO,
                               trn, NULL,
                               (translog_size_t) (1+ length),
                               sizeof(log_array)/sizeof(log_array[0]),
                               log_array, NULL, NULL);
#else
  return 0;
#endif
}


/**
  Sets soft sync mode

  @param mode                TRUE if we need switch soft sync on else off
*/

void translog_soft_sync(my_bool mode)
{
  soft_sync= mode;
}


/**
  Sets hard group commit

  @param mode                TRUE if we need switch hard group commit on
                             else off
*/

void translog_hard_group_commit(my_bool mode)
{
  hard_group_commit= mode;
}


/**
  @brief forced log sync (used when we are switching modes)

  @note Syncs the files from soft_sync_min (or the current file if that is
  0) up to the current log file. Nothing is done while no log file is open.
*/

void translog_sync()
{
  DBUG_ENTER("ma_translog_sync");

  /* The following is only true if initialization of translog succeeded */
  if (log_descriptor.open_files.elements != 0)
  {
    uint32 max= get_current_logfile()->number;
    uint32 min;

    min= soft_sync_min;
    if (!min)
      min= max;

    translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
  }
  DBUG_VOID_RETURN;
}


/**
  @brief set rate for group commit

  @param interval            interval to set.
@note We use this function with additional variable because have to restart service thread with new value which we can't make inside changing variable routine (update_maria_group_commit_interval) */ void translog_set_group_commit_interval(uint32 interval) { DBUG_ENTER("translog_set_group_commit_interval"); group_commit_wait= interval; DBUG_PRINT("info", ("wait: %llu", (ulonglong)group_commit_wait)); DBUG_VOID_RETURN; } /** @brief syncing service thread */ static pthread_handler_t ma_soft_sync_background( void *arg __attribute__((unused))) { my_thread_init(); { DBUG_ENTER("ma_soft_sync_background"); for(;;) { ulonglong prev_loop= microsecond_interval_timer(); ulonglong time, sleep; uint32 min, max, sync_request; min= soft_sync_min; max= soft_sync_max; sync_request= soft_need_sync; soft_sync_min= max; soft_need_sync= 0; sleep= group_commit_wait; if (sync_request) translog_sync_files(min, max, FALSE); time= microsecond_interval_timer() - prev_loop; if (time > sleep) sleep= 0; else sleep-= time; if (my_service_thread_sleep(&soft_sync_control, sleep)) break; } my_thread_end(); DBUG_RETURN(0); } } /** @brief Starts syncing thread */ int translog_soft_sync_start(void) { int res= 0; uint32 min, max; DBUG_ENTER("translog_soft_sync_start"); /* check and init variables */ min= soft_sync_min; max= soft_sync_max; if (!max) soft_sync_max= max= get_current_logfile()->number; if (!min) soft_sync_min= max; soft_need_sync= 1; if (!(res= ma_service_thread_control_init(&soft_sync_control))) if ((res= mysql_thread_create(key_thread_soft_sync, &soft_sync_control.thread, NULL, ma_soft_sync_background, NULL))) soft_sync_control.killed= TRUE; DBUG_RETURN(res); } /** @brief Stops syncing thread */ void translog_soft_sync_end(void) { DBUG_ENTER("translog_soft_sync_end"); if (soft_sync_control.inited) { ma_service_thread_control_end(&soft_sync_control); } DBUG_VOID_RETURN; } /** @brief Dump information about file header page. 
*/ static void dump_header_page(uchar *buff) { LOGHANDLER_FILE_INFO desc; char strbuff[21]; struct tm tmp_tm; time_t header_time; translog_interpret_file_header(&desc, buff); header_time= desc.timestamp/1000000ULL; localtime_r(&header_time, &tmp_tm); printf(" This can be header page:\n" " Timestamp: %04d.%02d.%02d %02d.%02d.%02d (%s)\n" " Aria log version: %lu\n" " Server version: %lu\n" " Server id %lu\n" " Page size %lu\n", tmp_tm.tm_year+1900, tmp_tm.tm_mon+1, tmp_tm.tm_mday, tmp_tm.tm_hour, tmp_tm.tm_min, tmp_tm.tm_sec, llstr(desc.timestamp, strbuff), desc.maria_version, desc.mysql_version, desc.server_id, desc.page_size); if (desc.page_size != TRANSLOG_PAGE_SIZE) printf(" WARNING: page size is not equal compiled in one %lu!!!\n", (ulong) TRANSLOG_PAGE_SIZE); printf(" File number %lu\n" " Max lsn: " LSN_FMT "\n", desc.file_number, LSN_IN_PARTS(desc.max_lsn)); } static const char *record_class_string[]= { "LOGRECTYPE_NOT_ALLOWED", "LOGRECTYPE_VARIABLE_LENGTH", "LOGRECTYPE_PSEUDOFIXEDLENGTH", "LOGRECTYPE_FIXEDLENGTH" }; /** @brief dump information about transaction log chunk @param buffer reference to the whole page @param ptr pointer to the chunk @reval # reference to the next chunk @retval NULL can't interpret data */ static uchar *dump_chunk(uchar *buffer, uchar *ptr) { uint length; if (*ptr == TRANSLOG_FILLER) { printf(" Filler till the page end\n"); for (; ptr < buffer + TRANSLOG_PAGE_SIZE; ptr++) { if (*ptr != TRANSLOG_FILLER) { printf(" WARNING: non filler character met before page end " "(page + 0x%04x: 0x%02x) (stop interpretation)!!!", (uint) (ptr - buffer), (uint) ptr[0]); return NULL; } } return ptr; } if (*ptr == 0 || *ptr == 0xFF) { printf(" WARNING: chunk can't start from 0x0 " "(stop interpretation)!!!\n"); return NULL; } switch (ptr[0] & TRANSLOG_CHUNK_TYPE) { case TRANSLOG_CHUNK_LSN: printf(" LSN chunk type 0 (variable length)\n"); if (likely((ptr[0] & TRANSLOG_REC_TYPE) != TRANSLOG_CHUNK_0_CONT)) { printf(" Record type %u: %s record class %s 
compressed LSNs: %u\n", ptr[0] & TRANSLOG_REC_TYPE, (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ? log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name : "NULL"), record_class_string[log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE]. rclass], log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE]. compressed_LSN); if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass != LOGRECTYPE_VARIABLE_LENGTH) { printf(" WARNING: this record class here can't be used " "(stop interpretation)!!!\n"); break; } } else printf(" Continuation of previous chunk 0 header \n"); printf(" Short transaction id: %u\n", (uint) uint2korr(ptr + 1)); { uchar *hdr_ptr= ptr + 1 + 2; /* chunk type and short trid */ uint16 chunk_len; printf (" Record length: %lu\n", (ulong) translog_variable_record_1group_decode_len(&hdr_ptr)); chunk_len= uint2korr(hdr_ptr); if (chunk_len == 0) printf (" It is 1 group record (chunk length == 0)\n"); else { uint16 groups, i; printf (" Chunk length %u\n", (uint) chunk_len); groups= uint2korr(hdr_ptr + 2); hdr_ptr+= 4; printf (" Number of groups left to the end %u:\n", (uint) groups); for(i= 0; i < groups && hdr_ptr < buffer + TRANSLOG_PAGE_SIZE; i++, hdr_ptr+= LSN_STORE_SIZE + 1) { TRANSLOG_ADDRESS gpr_addr= lsn_korr(hdr_ptr); uint pages= hdr_ptr[LSN_STORE_SIZE]; printf (" Group +#%u: " LSN_FMT " pages: %u\n", (uint) i, LSN_IN_PARTS(gpr_addr), pages); } } } break; case TRANSLOG_CHUNK_FIXED: printf(" LSN chunk type 1 (fixed size)\n"); printf(" Record type %u: %s record class %s compressed LSNs: %u\n", ptr[0] & TRANSLOG_REC_TYPE, (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ? log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name : "NULL"), record_class_string[log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE]. rclass], log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE]. 
compressed_LSN); if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass != LOGRECTYPE_PSEUDOFIXEDLENGTH && log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass != LOGRECTYPE_FIXEDLENGTH) { printf(" WARNING: this record class here can't be used " "(stop interpretation)!!!\n"); } printf(" Short transaction id: %u\n", (uint) uint2korr(ptr + 1)); break; case TRANSLOG_CHUNK_NOHDR: printf(" No header chunk type 2(till the end of the page)\n"); if (ptr[0] & TRANSLOG_REC_TYPE) { printf(" WARNING: chunk header content record type: 0x%02x " "(dtop interpretation)!!!", (uint) ptr[0]); return NULL; } break; case TRANSLOG_CHUNK_LNGTH: printf(" Chunk with length type 3\n"); if (ptr[0] & TRANSLOG_REC_TYPE) { printf(" WARNING: chunk header content record type: 0x%02x " "(dtop interpretation)!!!", (uint) ptr[0]); return NULL; } break; } { intptr offset= ptr - buffer; DBUG_ASSERT(offset <= UINT_MAX16); length= translog_get_total_chunk_length(buffer, (uint16)offset); } printf(" Length %u\n", length); ptr+= length; return ptr; } /** @brief Dump information about page with data. 
*/ static void dump_datapage(uchar *buffer, File handler) { uchar *ptr; ulong offset; uint32 page, file; uint header_len; printf(" Page: %ld File number: %ld\n", (ulong) (page= uint3korr(buffer)), (ulong) (file= uint3korr(buffer + 3))); if (page == 0) printf(" WARNING: page == 0!!!\n"); if (file == 0) printf(" WARNING: file == 0!!!\n"); offset= page * TRANSLOG_PAGE_SIZE; printf(" Flags (0x%x):\n", (uint) buffer[TRANSLOG_PAGE_FLAGS]); if (buffer[TRANSLOG_PAGE_FLAGS]) { if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC) printf(" Page CRC\n"); if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) printf(" Sector protection\n"); if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_RECORD_CRC) printf(" Record CRC (WARNING: not yet implemented!!!)\n"); if (buffer[TRANSLOG_PAGE_FLAGS] & ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | TRANSLOG_RECORD_CRC)) { printf(" WARNING: unknown flags (stop interpretation)!!!\n"); return; } } else printf(" No flags\n"); printf(" Page header length: %u\n", (header_len= page_overhead[buffer[TRANSLOG_PAGE_FLAGS]])); if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_RECORD_CRC) { uint32 crc= uint4korr(buffer + TRANSLOG_PAGE_FLAGS + 1); uint32 ccrc; printf (" Page CRC 0x%04lx\n", (ulong) crc); ccrc= translog_crc(buffer + header_len, TRANSLOG_PAGE_SIZE - header_len); if (crc != ccrc) printf(" WARNING: calculated CRC: 0x%04lx!!!\n", (ulong) ccrc); } if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) { TRANSLOG_FILE tfile; { uchar *table= buffer + header_len - TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; uint i; printf(" Sector protection current value: 0x%02x\n", (uint) table[0]); for (i= 1; i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; i++) { printf(" Sector protection in sector: 0x%02x saved value 0x%02x\n", (uint)buffer[i * DISK_DRIVE_SECTOR_SIZE], (uint)table[i]); } } tfile.number= file; bzero(&tfile.handler, sizeof(tfile.handler)); tfile.handler.file= handler; tfile.was_recovered= 0; tfile.is_sync= 1; if 
(translog_check_sector_protection(buffer, &tfile)) printf(" WARNING: sector protection found problems!!!\n"); } ptr= buffer + header_len; while (ptr && ptr < buffer + TRANSLOG_PAGE_SIZE) { printf(" Chunk %d %lld:\n", file,((longlong) (ptr - buffer)+ offset)); ptr= dump_chunk(buffer, ptr); } } /** @brief Dump information about page. */ void dump_page(uchar *buffer, File handler) { if (strncmp((char*)maria_trans_file_magic, (char*)buffer, sizeof(maria_trans_file_magic)) == 0) { dump_header_page(buffer); return; } dump_datapage(buffer, handler); } /* Handle backup calls */ void translog_disable_purge() { mysql_mutex_lock(&log_descriptor.purger_lock); log_purge_disabled++; mysql_mutex_unlock(&log_descriptor.purger_lock); } void translog_enable_purge() { mysql_mutex_lock(&log_descriptor.purger_lock); log_purge_disabled--; mysql_mutex_unlock(&log_descriptor.purger_lock); }