/*****************************************************************************

Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2020, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0dump.cc
Implements a buffer pool dump/load.

Created April 08, 2011 Vasil Dimov
*******************************************************/

#include "my_global.h"
#include "mysqld.h"
#include "my_sys.h"

#include "mysql/psi/mysql_stage.h"
#include "mysql/psi/psi.h"

#include "buf0buf.h"
#include "buf0dump.h"
#include "dict0dict.h"
#include "os0file.h"
#include "os0thread.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "sync0rw.h"
#include "ut0byte.h"

#include <algorithm>

#include "mysql/service_wsrep.h" /* wsrep_recovery */

#include <my_service_manager.h>

static void buf_do_load_dump();

/** Severity used when publishing a dump/load status message. */
enum status_severity {
	STATUS_INFO,
	STATUS_ERR
};

#define SHUTTING_DOWN()	(srv_shutdown_state != SRV_SHUTDOWN_NONE)

/* Flags that tell the buffer pool dump/load task which action it should
take after being woken up. */
static volatile bool	buf_dump_should_start;
static volatile bool	buf_load_should_start;

/* Set by buf_load_abort() and polled by buf_load() after every page read
request. */
static bool	buf_load_abort_flag;

/** Start the buffer pool dump/load task and instructs it to start a dump. */
void buf_dump_start()
{
	buf_dump_should_start= true;
	buf_do_load_dump();
}

/** Start the buffer pool dump/load task and instructs it to start a load. */
void buf_load_start()
{
	buf_load_should_start= true;
	buf_do_load_dump();
}

/*****************************************************************//**
Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
to the specified string. The format and the following parameters are the
same as the ones used for printf(3). The value of this variable can be
retrieved by:
SELECT variable_value FROM information_schema.global_status WHERE
variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
or by:
SHOW STATUS LIKE 'innodb_buffer_pool_dump_status';
The formatted message is additionally written to the server log through
ib::info() or ib::error(), depending on the severity. */
static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
void
buf_dump_status(
/*============*/
	enum status_severity	severity,/*!< in: status severity */
	const char*		fmt,	/*!< in: format */
	...)				/*!< in: extra parameters according
					to fmt */
{
	va_list	ap;

	va_start(ap, fmt);

	/* vsnprintf() bounds the write to the status buffer, so an overlong
	message is truncated rather than overflowing it. */
	vsnprintf(
		export_vars.innodb_buffer_pool_dump_status,
		sizeof(export_vars.innodb_buffer_pool_dump_status),
		fmt, ap);

	switch (severity) {
	case STATUS_INFO:
		ib::info() << export_vars.innodb_buffer_pool_dump_status;
		break;

	case STATUS_ERR:
		ib::error() << export_vars.innodb_buffer_pool_dump_status;
		break;
	}

	va_end(ap);
}

/*****************************************************************//**
Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status
to the specified string. The format and the following parameters are the
same as the ones used for printf(3). The value of this variable can be
retrieved by:
SELECT variable_value FROM information_schema.global_status WHERE
variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
or by:
SHOW STATUS LIKE 'innodb_buffer_pool_load_status';
The formatted message is additionally written to the server log through
ib::info() or ib::error(), depending on the severity. */
static MY_ATTRIBUTE((nonnull, format(printf, 2, 3)))
void
buf_load_status(
/*============*/
	enum status_severity	severity,/*!< in: status severity */
	const char*		fmt,	/*!< in: format */
	...)				/*!< in: extra parameters according
					to fmt */
{
	va_list	ap;

	va_start(ap, fmt);

	/* vsnprintf() bounds the write to the status buffer, so an overlong
	message is truncated rather than overflowing it. */
	vsnprintf(
		export_vars.innodb_buffer_pool_load_status,
		sizeof(export_vars.innodb_buffer_pool_load_status),
		fmt, ap);

	switch (severity) {
	case STATUS_INFO:
		ib::info() << export_vars.innodb_buffer_pool_load_status;
		break;

	case STATUS_ERR:
		ib::error() << export_vars.innodb_buffer_pool_load_status;
		break;
	}

	va_end(ap);
}

/** Returns the directory path where the buffer pool dump file will be created.
@return directory path */
static
const char*
get_buf_dump_dir()
{
	const char*	dump_dir;

	/* The dump file should be created in the default data directory if
	innodb_data_home_dir is set as an empty string. */
	if (!*srv_data_home) {
		dump_dir = fil_path_to_mysql_datadir;
	} else {
		dump_dir = srv_data_home;
	}

	return(dump_dir);
}

/** Generate the path to the buffer pool dump/load file.
@param[out]	path		generated path
@param[in]	path_size	size of 'path', used as in snprintf(3);
				asserted to be at least FN_REFLEN when the
				file already exists */
static void buf_dump_generate_path(char *path, size_t path_size)
{
	char	buf[FN_REFLEN];

	/* The unresolved candidate path is composed while holding
	LOCK_global_system_variables. */
	mysql_mutex_lock(&LOCK_global_system_variables);
	snprintf(buf, sizeof(buf), "%s%c%s", get_buf_dump_dir(),
		 OS_PATH_SEPARATOR, srv_buf_dump_filename);
	mysql_mutex_unlock(&LOCK_global_system_variables);

	os_file_type_t	type;
	bool		exists = false;
	bool		ret;

	ret = os_file_status(buf, &exists, &type);

	/* For realpath() to succeed the file must exist. */

	if (ret && exists) {
		/* my_realpath() assumes the destination buffer is big enough
		to hold FN_REFLEN bytes. */
		ut_a(path_size >= FN_REFLEN);

		my_realpath(path, buf, 0);
	} else {
		/* If it does not exist, then resolve only srv_data_home
		and append srv_buf_dump_filename to it. */
		char	srv_data_home_full[FN_REFLEN];

		my_realpath(srv_data_home_full, get_buf_dump_dir(), 0);

		/* Check the length first: indexing [strlen() - 1] on an
		empty string would read before the start of the buffer. */
		const size_t	dir_len = strlen(srv_data_home_full);

		if (dir_len > 0
		    && srv_data_home_full[dir_len - 1] == OS_PATH_SEPARATOR) {

			snprintf(path, path_size, "%s%s",
				 srv_data_home_full,
				 srv_buf_dump_filename);
		} else {
			snprintf(path, path_size, "%s%c%s",
				 srv_data_home_full,
				 OS_PATH_SEPARATOR,
				 srv_buf_dump_filename);
		}
	}
}

/*****************************************************************//**
Perform a buffer pool dump into the file specified by
innodb_buffer_pool_filename. If any errors occur then the value of
innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
The dump filename can be specified by (relative to srv_data_home):
SET GLOBAL innodb_buffer_pool_filename='filename';
The page ids are first written to "<filename>.incomplete", which replaces
the real file (unlink + rename) only after it has been closed
successfully. On a write, delete or rename error the temporary file is left
in place. */
static
void
buf_dump(
/*=====*/
	ibool	obey_shutdown)	/*!< in: quit if we are in a shutting down
				state */
{
#define SHOULD_QUIT()	(SHUTTING_DOWN() && obey_shutdown)

	char	full_filename[OS_FILE_MAX_PATH];
	char	tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"];
	char	now[32];
	FILE*	f;
	int	ret;

	buf_dump_generate_path(full_filename, sizeof(full_filename));

	snprintf(tmp_filename, sizeof(tmp_filename),
		 "%s.incomplete", full_filename);

	buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s",
			full_filename);

#if defined(__GLIBC__) || defined(__WIN__) || O_CLOEXEC == 0
	f = fopen(tmp_filename, "w" STR_O_CLOEXEC);
#else
	{
		int	fd;
		fd = open(tmp_filename,
			  O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640);
		if (fd >= 0) {
			f = fdopen(fd, "w");
			if (f == NULL) {
				/* The descriptor is still ours when
				fdopen() fails; close it so it does not
				leak, keeping errno for the message
				below. */
				const int	saved_errno = errno;
				close(fd);
				errno = saved_errno;
			}
		} else {
			f = NULL;
		}
	}
#endif
	if (f == NULL) {
		buf_dump_status(STATUS_ERR,
				"Cannot open '%s' for writing: %s",
				tmp_filename, strerror(errno));
		return;
	}

	/* NOTE: no initialized declarations may be added between here and
	the "done" label in this scope, because of the goto below. */
	const buf_page_t*	bpage;
	page_id_t*		dump;
	ulint			n_pages;
	ulint			j;

	mysql_mutex_lock(&buf_pool.mutex);

	n_pages = UT_LIST_GET_LEN(buf_pool.LRU);

	/* skip empty buffer pools */
	if (n_pages == 0) {
		mysql_mutex_unlock(&buf_pool.mutex);
		goto done;
	}

	if (srv_buf_pool_dump_pct != 100) {
		ulint	t_pages;

		/* limit the number of total pages dumped to X% of the
		total number of pages */
		t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100;
		if (n_pages > t_pages) {
			buf_dump_status(STATUS_INFO,
					"Restricted to " ULINTPF
					" pages due to "
					"innodb_buf_pool_dump_pct=%lu",
					t_pages, srv_buf_pool_dump_pct);
			n_pages = t_pages;
		}

		/* Always dump at least one page. */
		if (n_pages == 0) {
			n_pages = 1;
		}
	}

	dump = static_cast<page_id_t*>(ut_malloc_nokey(
			n_pages * sizeof(*dump)));

	if (dump == NULL) {
		mysql_mutex_unlock(&buf_pool.mutex);
		fclose(f);
		buf_dump_status(STATUS_ERR,
				"Cannot allocate " ULINTPF " bytes: %s",
				(ulint) (n_pages * sizeof(*dump)),
				strerror(errno));
		/* leave tmp_filename to exist */
		return;
	}

	/* Collect the page ids in LRU list order while holding
	buf_pool.mutex; the file is written after the mutex is released.
	j counts only the entries actually stored. */
	for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0;
	     bpage != NULL && j < n_pages;
	     bpage = UT_LIST_GET_NEXT(LRU, bpage)) {

		ut_a(bpage->in_file());
		const page_id_t id(bpage->id());

		if (id.space() == SRV_TMP_SPACE_ID) {
			/* Ignore the innodb_temporary tablespace. */
			continue;
		}

		if (bpage->status == buf_page_t::FREED) {
			continue;
		}

		dump[j++] = id;
	}

	mysql_mutex_unlock(&buf_pool.mutex);

	ut_a(j <= n_pages);
	n_pages = j;

	/* One "space_id,page_no" pair per line. */
	for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
		ret = fprintf(f, "%u,%u\n",
			      dump[j].space(), dump[j].page_no());
		if (ret < 0) {
			ut_free(dump);
			fclose(f);
			buf_dump_status(STATUS_ERR,
					"Cannot write to '%s': %s",
					tmp_filename, strerror(errno));
			/* leave tmp_filename to exist */
			return;
		}
		/* During shutdown, report progress every 1024 pages. */
		if (SHUTTING_DOWN() && !(j & 1023)) {
			service_manager_extend_timeout(
				INNODB_EXTEND_TIMEOUT_INTERVAL,
				"Dumping buffer pool page "
				ULINTPF "/" ULINTPF, j + 1, n_pages);
		}
	}

	ut_free(dump);

done:
	ret = fclose(f);
	if (ret != 0) {
		buf_dump_status(STATUS_ERR,
				"Cannot close '%s': %s",
				tmp_filename, strerror(errno));
		return;
	}
	/* else */

	/* A missing old dump file (ENOENT) is not an error. */
	ret = unlink(full_filename);
	if (ret != 0 && errno != ENOENT) {
		buf_dump_status(STATUS_ERR,
				"Cannot delete '%s': %s",
				full_filename, strerror(errno));
		/* leave tmp_filename to exist */
		return;
	}
	/* else */

	ret = rename(tmp_filename, full_filename);
	if (ret != 0) {
		buf_dump_status(STATUS_ERR,
				"Cannot rename '%s' to '%s': %s",
				tmp_filename, full_filename,
				strerror(errno));
		/* leave tmp_filename to exist */
		return;
	}
	/* else */

	/* success */

	ut_sprintf_timestamp(now);

	buf_dump_status(STATUS_INFO,
			"Buffer pool(s) dump completed at %s", now);

	/* Though dumping isn't related to an incomplete load,
	we reset this to 0 here to indicate that a shutdown can also perform
	a dump */
	export_vars.innodb_buffer_pool_load_incomplete = 0;
}

/*****************************************************************//**
Artificially delay the buffer pool loading if necessary. The idea of
this function is to prevent hogging the server with IO and slowing down
too much normal client queries. */
UNIV_INLINE
void
buf_load_throttle_if_needed(
/*========================*/
	ulint*	last_check_time,	/*!< in/out: milliseconds since epoch
					of the last time we did check if
					throttling is needed, we do the check
					every srv_io_capacity IO ops. */
	ulint*	last_activity_count,	/*!< in/out: value of
					srv_get_activity_count() sampled at
					the last check */
	ulint	n_io)			/*!< in: number of IO ops done since
					buffer pool load has started */
{
	/* Only act on every srv_io_capacity-th call. */
	if (n_io % srv_io_capacity < srv_io_capacity - 1) {
		return;
	}

	/* First check: just take the initial samples. */
	if (*last_check_time == 0 || *last_activity_count == 0) {
		*last_check_time = ut_time_ms();
		*last_activity_count = srv_get_activity_count();
		return;
	}

	/* srv_io_capacity IO operations have been performed by buffer pool
	load since the last time we were here. */

	/* If no other activity, then keep going without any delay. */
	if (srv_get_activity_count() == *last_activity_count) {
		return;
	}

	/* There has been other activity, throttle. */

	ulint	now = ut_time_ms();
	ulint	elapsed_time = now - *last_check_time;

	/* Notice that elapsed_time is not the time for the last
	srv_io_capacity IO operations performed by BP load. It is the
	time elapsed since the last time we detected that there has been
	other activity. This has a small and acceptable deficiency, e.g.:
	1. BP load runs and there is no other activity.
	2. Other activity occurs, we run N IO operations after that and
	   enter here (where 0 <= N < srv_io_capacity).
	3. last_check_time is very old and we do not sleep at this time, but
	   only update last_check_time and last_activity_count.
	4. We run srv_io_capacity more IO operations and call this function
	   again.
	5. There has been more other activity and thus we enter here.
	6. Now last_check_time is recent and we sleep if necessary to prevent
	   more than srv_io_capacity IO operations per second.
	The deficiency is that we could have slept at 3., but for this we
	would have to update last_check_time before the
	"cur_activity_count == *last_activity_count" check and calling
	ut_time_ms() that often may turn out to be too expensive. */

	if (elapsed_time < 1000 /* 1 sec (1000 milli secs) */) {
		os_thread_sleep((1000 - elapsed_time) * 1000 /* micro secs */);
	}

	*last_check_time = ut_time_ms();
	*last_activity_count = srv_get_activity_count();
}

/*****************************************************************//**
Perform a buffer pool load from the file specified by
innodb_buffer_pool_filename. If any errors occur then the value of
innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
The dump filename can be specified by (relative to srv_data_home):
SET GLOBAL innodb_buffer_pool_filename='filename';
The file is scanned twice: once to count the entries and once to read
them. The entries are then sorted and a background read is requested for
each page, throttled by buf_load_throttle_if_needed(). */
static
void
buf_load()
/*======*/
{
	char		full_filename[OS_FILE_MAX_PATH];
	char		now[32];
	FILE*		f;
	page_id_t*	dump;
	ulint		dump_n;
	ulint		i;
	uint32_t	space_id;
	uint32_t	page_no;
	int		fscanf_ret;

	/* Ignore any leftovers from before */
	buf_load_abort_flag = false;

	buf_dump_generate_path(full_filename, sizeof(full_filename));

	buf_load_status(STATUS_INFO,
			"Loading buffer pool(s) from %s", full_filename);

	f = fopen(full_filename, "r" STR_O_CLOEXEC);
	if (f == NULL) {
		/* Reported with INFO severity, not as an error. */
		buf_load_status(STATUS_INFO,
				"Cannot open '%s' for reading: %s",
				full_filename, strerror(errno));
		return;
	}
	/* else */

	/* First scan the file to estimate how many entries are in it.
	This file is tiny (approx 500KB per 1GB buffer pool), reading it
	two times is fine. */
	dump_n = 0;
	while (fscanf(f, "%u,%u", &space_id, &page_no) == 2
	       && !SHUTTING_DOWN()) {
		dump_n++;
	}

	if (!SHUTTING_DOWN() && !feof(f)) {
		/* fscanf() returned != 2 */
		const char*	what;
		if (ferror(f)) {
			what = "reading";
		} else {
			what = "parsing";
		}
		fclose(f);
		buf_load_status(STATUS_ERR, "Error %s '%s',"
				" unable to load buffer pool (stage 1)",
				what, full_filename);
		return;
	}

	/* If dump is larger than the buffer pool(s), then we ignore the
	extra trailing. This could happen if a dump is made, then buffer
	pool is shrunk and then load is attempted. */
	dump_n = std::min(dump_n, buf_pool.get_n_pages());

	if (dump_n != 0) {
		dump = static_cast<page_id_t*>(ut_malloc_nokey(
				dump_n * sizeof(*dump)));
	} else {
		fclose(f);
		ut_sprintf_timestamp(now);
		buf_load_status(STATUS_INFO,
				"Buffer pool(s) load completed at %s"
				" (%s was empty)", now, full_filename);
		return;
	}

	if (dump == NULL) {
		fclose(f);
		buf_load_status(STATUS_ERR,
				"Cannot allocate " ULINTPF " bytes: %s",
				(ulint) (dump_n * sizeof(*dump)),
				strerror(errno));
		return;
	}

	rewind(f);

	/* Marked incomplete from here on; cleared again only when the load
	runs to completion (or by a successful buf_dump()). */
	export_vars.innodb_buffer_pool_load_incomplete = 1;

	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
		fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no);

		if (fscanf_ret != 2) {
			if (feof(f)) {
				break;
			}
			/* else */

			ut_free(dump);
			fclose(f);
			buf_load_status(STATUS_ERR,
					"Error parsing '%s', unable"
					" to load buffer pool (stage 2)",
					full_filename);
			return;
		}

		if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
			ut_free(dump);
			fclose(f);
			buf_load_status(STATUS_ERR,
					"Error parsing '%s': bogus"
					" space,page %u,%u at line " ULINTPF
					", unable to load buffer pool",
					full_filename,
					space_id, page_no,
					i);
			return;
		}

		dump[i] = page_id_t(space_id, page_no);
	}

	/* Set dump_n to the actual number of initialized elements,
	i could be smaller than dump_n here if the file got truncated after
	we read it the first time. */
	dump_n = i;

	fclose(f);

	if (dump_n == 0) {
		ut_free(dump);
		ut_sprintf_timestamp(now);
		buf_load_status(STATUS_INFO,
				"Buffer pool(s) load completed at %s"
				" (%s was empty or had errors)",
				now, full_filename);
		return;
	}

	if (!SHUTTING_DOWN()) {
		std::sort(dump, dump + dump_n);
	}

	ulint		last_check_time = 0;
	ulint		last_activity_cnt = 0;

	/* Avoid calling the expensive fil_space_t::get() for each
	page within the same tablespace. dump[] is sorted by (space, page),
	so all pages from a given tablespace are consecutive. */
	ulint		cur_space_id = dump[0].space();
	fil_space_t*	space = fil_space_t::get(cur_space_id);
	ulint		zip_size = space ? space->zip_size() : 0;

	PSI_stage_progress*	pfs_stage_progress __attribute__((unused))
		= mysql_set_stage(srv_stage_buffer_pool_load.m_key);
	mysql_stage_set_work_estimated(pfs_stage_progress, dump_n);
	mysql_stage_set_work_completed(pfs_stage_progress, 0);

	for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {

		/* space_id for this iteration of the loop */
		const ulint	this_space_id = dump[i].space();

		if (this_space_id == SRV_TMP_SPACE_ID) {
			/* Ignore the innodb_temporary tablespace. */
			continue;
		}

		if (this_space_id != cur_space_id) {
			/* Moved on to another tablespace: drop the
			reference held on the previous one and look up the
			new one. */
			if (space) {
				space->release();
			}

			cur_space_id = this_space_id;
			space = fil_space_t::get(cur_space_id);

			if (!space) {
				continue;
			}

			zip_size = space->zip_size();
		}

		/* JAN: TODO: As we use background page read below,
		if tablespace is encrypted we can't use it. */
		if (!space || dump[i].page_no() >= space->get_size() ||
		    (space->crypt_data &&
		     space->crypt_data->encryption != FIL_ENCRYPTION_OFF &&
		     space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) {
			continue;
		}

		if (space->is_stopping()) {
			space->release();
			space = nullptr;
			continue;
		}

		/* Take an additional reference before issuing the read;
		this loop keeps its own reference for the following pages.
		NOTE(review): presumably buf_read_page_background() consumes
		the extra reference - confirm against its definition. */
		space->reacquire();
		buf_read_page_background(space, dump[i], zip_size, true);

		if (buf_load_abort_flag) {
			if (space) {
				space->release();
			}
			buf_load_abort_flag = false;
			ut_free(dump);
			buf_load_status(
				STATUS_INFO,
				"Buffer pool(s) load aborted on request");
			/* Premature end, set estimated = completed = i and
			end the current stage event. */

			mysql_stage_set_work_estimated(pfs_stage_progress, i);
			mysql_stage_set_work_completed(pfs_stage_progress, i);

			mysql_end_stage();
			return;
		}

		buf_load_throttle_if_needed(
			&last_check_time, &last_activity_cnt, i);

#ifdef UNIV_DEBUG
		if ((i+1) >= srv_buf_pool_load_pages_abort) {
			buf_load_abort_flag = true;
		}
#endif
	}

	if (space) {
		space->release();
	}

	ut_free(dump);

	ut_sprintf_timestamp(now);

	if (i == dump_n) {
		buf_load_status(STATUS_INFO,
			"Buffer pool(s) load completed at %s", now);
		export_vars.innodb_buffer_pool_load_incomplete = 0;
	} else if (buf_load_abort_flag) {
		/* The loop above stopped early and an abort request is
		pending. */
		buf_load_status(STATUS_INFO,
			"Buffer pool(s) load aborted due to user instigated abort at %s",
			now);
		/* intentionally don't reset innodb_buffer_pool_load_incomplete
		as we don't want a shutdown to save the buffer pool */
	} else {
		/* No abort was requested, so the loop stopped early because
		SHUTTING_DOWN() became true. */
		buf_load_status(STATUS_INFO,
			"Buffer pool(s) load aborted due to shutdown at %s",
			now);
		/* intentionally don't reset innodb_buffer_pool_load_incomplete
		as we want to abort without saving the buffer pool */
	}

	/* Make sure that estimated = completed when we end. */
	mysql_stage_set_work_completed(pfs_stage_progress, dump_n);
	/* End the stage progress event. */
	mysql_end_stage();
}

/** Abort a currently running buffer pool load. */
void buf_load_abort()
{
	buf_load_abort_flag= true;
}

/*****************************************************************//**
This is the main task for buffer pool dump/load. When scheduled, it
either performs a dump or a load, depending on server state, state of the
variables, etc. On its first invocation it performs the startup load if
srv_buffer_pool_load_at_startup is set; once shutdown has started it
performs the shutdown dump if that is enabled. */
static void buf_dump_load_func(void *)
{
	ut_ad(!srv_read_only_mode);
	static bool first_time = true;
	if (first_time && srv_buffer_pool_load_at_startup) {

#ifdef WITH_WSREP
		if (!get_wsrep_recovery()) {
#endif /* WITH_WSREP */
			buf_load();
#ifdef WITH_WSREP
		}
#endif /* WITH_WSREP */
	}
	first_time = false;

	while (!SHUTTING_DOWN()) {
		if (buf_dump_should_start) {
			buf_dump_should_start = false;
			buf_dump(true);
		}
		if (buf_load_should_start) {
			buf_load_should_start = false;
			buf_load();
		}

		/* Nothing was requested in the meantime: the task is done
		until it is submitted again. */
		if (!buf_dump_should_start && !buf_load_should_start) {
			return;
		}
	}

	/* In shutdown */
	if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
		if (export_vars.innodb_buffer_pool_load_incomplete) {
			buf_dump_status(STATUS_INFO,
				"Dumping of buffer pool not started"
				" as load was incomplete");
#ifdef WITH_WSREP
		} else if (get_wsrep_recovery()) {
			/* No dump is made in wsrep recovery mode. */
#endif /* WITH_WSREP */
		} else {
			buf_dump(false/* do complete dump at shutdown */);
		}
	}
}

/* Execute task with max.concurrency */
static tpool::task_group tpool_group(1);
static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group);

/* Set by buf_load_at_startup(); until then buf_do_load_dump() does not
submit the task. */
static bool load_dump_enabled;

/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set.*/
void buf_load_at_startup()
{
	load_dump_enabled= true;
	if (srv_buffer_pool_load_at_startup)
		buf_do_load_dump();
}

/** Submit the dump/load task to srv_thread_pool, unless dump/load has not
been enabled yet or the task is already running. */
static void buf_do_load_dump()
{
	if (load_dump_enabled && !buf_dump_load_task.is_running())
		srv_thread_pool->submit_task(&buf_dump_load_task);
}

/** Wait for currently running load/dumps to finish*/
void buf_load_dump_end()
{
	ut_ad(SHUTTING_DOWN());
	buf_dump_load_task.wait();
}