summaryrefslogtreecommitdiffstats
path: root/sql/log.cc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--sql/log.cc12168
1 files changed, 12168 insertions, 0 deletions
diff --git a/sql/log.cc b/sql/log.cc
new file mode 100644
index 00000000..d3879aad
--- /dev/null
+++ b/sql/log.cc
@@ -0,0 +1,12168 @@
+/* Copyright (c) 2000, 2018, Oracle and/or its affiliates.
+ Copyright (c) 2009, 2022, MariaDB Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+
+/**
+ @file
+
+ @brief
+ logging of commands
+
+ @todo
+ Abort logging when we get an error in reading or writing log files
+*/
+
+#include "mariadb.h" /* NO_EMBEDDED_ACCESS_CHECKS */
+#include "sql_priv.h"
+#include "log.h"
+#include "sql_base.h" // open_log_table
+#include "sql_repl.h"
+#include "sql_delete.h" // mysql_truncate
+#include "sql_parse.h" // command_name
+#include "sql_time.h" // calc_time_from_sec, my_time_compare
+#include "tztime.h" // my_tz_OFFSET0, struct Time_zone
+#include "log_event.h" // Query_log_event
+#include "rpl_filter.h"
+#include "rpl_rli.h"
+#include "sql_audit.h"
+#include "mysqld.h"
+#include "ddl_log.h"
+
+#include <my_dir.h>
+#include <m_ctype.h> // For test_if_number
+
+#include <set_var.h> // for Sys_last_gtid_ptr
+
+#ifdef _WIN32
+#include "message.h"
+#endif
+
+#include "sql_plugin.h"
+#include "debug_sync.h"
+#include "sql_show.h"
+#include "my_pthread.h"
+#include "semisync_master.h"
+#include "sp_rcontext.h"
+#include "sp_head.h"
+#include "sql_table.h"
+
+#include "wsrep_mysqld.h"
+#ifdef WITH_WSREP
+#include "wsrep_trans_observer.h"
+#include "wsrep_status.h"
+#endif /* WITH_WSREP */
+
+#ifdef HAVE_REPLICATION
+#include "semisync_master.h"
+#include "semisync_slave.h"
+#include <utility> // pair
+#endif
+
+/* Maximum size of a single log message (presumably in bytes -- TODO confirm) */
+#define MAX_LOG_BUFFER_SIZE 1024
+#define MAX_TIME_SIZE 32 /* NOTE(review): looks like a time-string buffer length -- confirm at use sites */
+#define MY_OFF_T_UNDEF (~(my_off_t)0UL) /* all-bits-set my_off_t; used below as the "no position recorded" marker */
+/* Cache log files whose write position exceeds this are truncated on reset */
+#define CACHE_FILE_TRUNC_SIZE 65536
+
+#define FLAGSTR(V,F) ((V)&(F)?#F" ":"") /* "<name of F> " when (V)&(F) is non-zero, otherwise "" */
+
+handlerton *binlog_hton; // NOTE(review): presumably filled in by binlog_init() -- confirm
+LOGGER logger; // global logger; queried by check_if_log_table() and touched by Log_to_csv_event_handler::cleanup()
+
+const char *log_bin_index= 0;
+const char *log_bin_basename= 0;
+
+MYSQL_BIN_LOG mysql_bin_log(&sync_binlog_period); // the binary log object, constructed with the address of sync_binlog_period
+
+static bool test_if_number(const char *str, // forward declarations of file-local (static) helpers start here
+ ulong *res, bool allow_wildcards);
+static int binlog_init(void *p);
+static int binlog_close_connection(handlerton *hton, THD *thd);
+static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv);
+static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv);
+static bool binlog_savepoint_rollback_can_release_mdl(handlerton *hton,
+ THD *thd);
+static int binlog_rollback(handlerton *hton, THD *thd, bool all);
+static int binlog_prepare(handlerton *hton, THD *thd, bool all);
+static int binlog_start_consistent_snapshot(handlerton *hton, THD *thd);
+static int binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
+ Log_event *end_ev, bool all, bool using_stmt,
+ bool using_trx, bool is_ro_1pc);
+
+static const LEX_CSTRING write_error_msg= // fixed message text paired with its length
+ { STRING_WITH_LEN("error writing to the binary log") };
+
+static my_bool opt_optimize_thread_scheduling= TRUE;
+ulong binlog_checksum_options;
+#ifndef DBUG_OFF
+ulong opt_binlog_dbug_fsync_sleep= 0; // exists only in debug builds (DBUG_OFF not defined)
+#endif
+
+mysql_mutex_t LOCK_prepare_ordered;
+mysql_cond_t COND_prepare_ordered;
+mysql_mutex_t LOCK_after_binlog_sync;
+mysql_mutex_t LOCK_commit_ordered;
+
+static ulonglong binlog_status_var_num_commits; // this and the following values are published through binlog_status_vars_detail[]
+static ulonglong binlog_status_var_num_group_commits;
+static ulonglong binlog_status_group_commit_trigger_count;
+static ulonglong binlog_status_group_commit_trigger_lock_wait;
+static ulonglong binlog_status_group_commit_trigger_timeout;
+static char binlog_snapshot_file[FN_REFLEN];
+static ulonglong binlog_snapshot_position;
+
+static const char *fatal_log_error= // printf-style format: expects a string (%s) followed by an int (%d)
+ "Could not use %s for logging (error %d). "
+ "Turning logging off for the whole duration of the MariaDB server process. "
+ "To turn it on again: fix the cause, shutdown the MariaDB server and "
+ "restart it.";
+
+
+static SHOW_VAR binlog_status_vars_detail[]= // name / value pointer / SHOW type triples for the file-local binlog counters
+{
+ {"commits",
+ (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
+ {"group_commits",
+ (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG},
+ {"group_commit_trigger_count",
+ (char *)&binlog_status_group_commit_trigger_count, SHOW_LONGLONG},
+ {"group_commit_trigger_lock_wait",
+ (char *)&binlog_status_group_commit_trigger_lock_wait, SHOW_LONGLONG},
+ {"group_commit_trigger_timeout",
+ (char *)&binlog_status_group_commit_trigger_timeout, SHOW_LONGLONG},
+ {"snapshot_file",
+ (char *)&binlog_snapshot_file, SHOW_CHAR}, // character buffer, hence SHOW_CHAR rather than SHOW_LONGLONG
+ {"snapshot_position",
+ (char *)&binlog_snapshot_position, SHOW_LONGLONG},
+ {NullS, NullS, SHOW_LONG} // all-null entry terminating the array
+};
+
+/*
+ Variables for the binlog background thread.
+ Protected by the MYSQL_BIN_LOG::LOCK_binlog_background_thread mutex.
+
+ All three start out cleared (false / false / NULL).
+ */
+static bool binlog_background_thread_started= false;
+static bool binlog_background_thread_stop= false;
+static MYSQL_BIN_LOG::xid_count_per_binlog *
+ binlog_background_thread_queue= NULL;
+
+static bool start_binlog_background_thread(); // forward declaration of a file-local helper
+
+static rpl_binlog_state rpl_global_gtid_binlog_state; // initialised by setup_log_handling()
+
+void setup_log_handling() // one-time setup hook: initialises the file-local GTID binlog state
+{
+ rpl_global_gtid_binlog_state.init();
+}
+
+
+/**
+  Translate an internal log-purge status into a user-level error number.
+
+  Used for both the master and the slave side of log purging; per the
+  original note it is called from @c purge_error_message() and
+  @c MYSQL_BIN_LOG::reset_logs().
+
+  @param res  status code internal to the purging routines
+              (0 or one of the LOG_INFO_* values)
+
+  @return the matching ER_* code; 0 when @c res is 0 and
+          ER_LOG_PURGE_UNKNOWN_ERR for any value not listed below
+*/
+uint purge_log_get_error_code(int res)
+{
+  switch (res) {
+  case 0:                 return 0;
+  case LOG_INFO_EOF:      return ER_UNKNOWN_TARGET_BINLOG;
+  case LOG_INFO_IO:       return ER_IO_ERR_LOG_INDEX_READ;
+  case LOG_INFO_INVALID:  return ER_BINLOG_PURGE_PROHIBITED;
+  case LOG_INFO_SEEK:     return ER_FSEEK_FAIL;
+  case LOG_INFO_MEM:      return ER_OUT_OF_RESOURCES;
+  case LOG_INFO_FATAL:    return ER_BINLOG_PURGE_FATAL_ERR;
+  case LOG_INFO_IN_USE:   return ER_LOG_IN_USE;
+  case LOG_INFO_EMFILE:   return ER_BINLOG_PURGE_EMFILE;
+  default:                return ER_LOG_PURGE_UNKNOWN_ERR;
+  }
+}
+
+/**
+  Internal error handler that swallows every error and warning raised
+  while a log table is being written.
+
+  Nothing is passed on to the client or to SQL exception handlers, so
+  having logging enabled cannot interfere with the logic of an
+  application. The text of the most recent condition is retained and can
+  be read back through message().
+*/
+class Silence_log_table_errors : public Internal_error_handler
+{
+  /* Text of the last condition handled; empty until one arrives. */
+  char m_message[MYSQL_ERRMSG_SIZE];
+
+public:
+  Silence_log_table_errors()
+  {
+    *m_message= '\0';
+  }
+
+  virtual ~Silence_log_table_errors() = default;
+
+  /* Remembers msg and reports the condition as handled. */
+  virtual bool handle_condition(THD *thd,
+                                uint sql_errno,
+                                const char *sql_state,
+                                Sql_condition::enum_warning_level *level,
+                                const char *msg,
+                                Sql_condition **cond_hdl);
+
+  const char *message() const
+  {
+    return m_message;
+  }
+};
+
+/**
+  Swallow a condition raised during a log table write.
+
+  Only @c msg and @c cond_hdl are looked at; the other arguments are
+  ignored.
+
+  @param      msg       condition text, copied into m_message with
+                        strmake_buf()
+  @param[out] cond_hdl  always set to NULL
+
+  @return TRUE in every case
+*/
+bool
+Silence_log_table_errors::handle_condition(THD * /* thd */,
+                                           uint /* sql_errno */,
+                                           const char * /* sql_state */,
+                                           Sql_condition::enum_warning_level *
+                                             /* level */,
+                                           const char *msg,
+                                           Sql_condition **cond_hdl)
+{
+  strmake_buf(m_message, msg);
+  *cond_hdl= NULL;
+  return TRUE;
+}
+
+sql_print_message_func sql_print_message_handlers[3] = // information, warning, error printers, in that order
+{
+ sql_print_information, // [0]
+ sql_print_warning, // [1]
+ sql_print_error // [2]  NOTE(review): index presumably follows enum loglevel -- confirm
+};
+
+
+/**
+  Build the default name of a log file from opt_log_basename.
+
+  The name is formatted with fn_format() using @c log_ext as the
+  replacement extension and a copy of it is stored in @c *out.
+
+  @param[OUT] out      receives a pointer to the newly allocated name
+  @param[IN]  log_ext  extension for the file (e.g. .log)
+  @param[IN]  once     true:  copy with my_once_strdup(), @c *out is simply
+                              overwritten
+                       false: the previous @c *out is released with my_free()
+                              and the copy is made with my_strdup()
+*/
+void make_default_log_name(char **out, const char* log_ext, bool once)
+{
+  char path[FN_REFLEN+10];
+
+  fn_format(path, opt_log_basename, "", log_ext, MYF(MY_REPLACE_EXT));
+
+  if (!once)
+  {
+    /* Heap-managed name: drop the old value before replacing it. */
+    my_free(*out);
+    *out= my_strdup(PSI_INSTRUMENT_ME, path, MYF(MY_WME));
+    return;
+  }
+
+  *out= my_once_strdup(path, MYF(MY_WME));
+}
+
+
+/*
+ Helper classes to store non-transactional and transactional data
+ before copying it to the binary log.
+
+ binlog_cache_data wraps one IO_CACHE (cache_log) together with the
+ pending rows event, the "what has been logged" status bits, the cache
+ position at which the current statement started and an incident flag.
+ Copying and assignment are declared private and left undefined.
+*/
+class binlog_cache_data
+{
+public:
+ binlog_cache_data(): m_pending(0), status(0),
+ before_stmt_pos(MY_OFF_T_UNDEF),
+ incident(FALSE),
+ saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
+ ptr_binlog_cache_disk_use(0)
+ { }
+
+ ~binlog_cache_data()
+ {
+ DBUG_ASSERT(empty()); // the cache is expected to have been reset before destruction
+ close_cached_file(&cache_log);
+ }
+
+ /*
+ Return true if there are no relevant entries in the cache.
+
+ That is the case when no rows event is pending and either:
+ - nothing has been written to the cache, or
+ - no row or critical (DDL?) event has been logged into it, i.e.
+ neither LOGGED_ROW_EVENT nor LOGGED_CRITICAL is set in status
+
+ The status test is needed to avoid writing entries with only
+ a table map entry, which would crash in do_apply_event() on the slave
+ as it assumes that there is always a row entry after a table map.
+ */
+ bool empty() const
+ {
+ return (pending() == NULL &&
+ (my_b_write_tell(&cache_log) == 0 ||
+ ((status & (LOGGED_ROW_EVENT | LOGGED_CRITICAL)) == 0)));
+ }
+
+ Rows_log_event *pending() const
+ {
+ return m_pending;
+ }
+
+ void set_pending(Rows_log_event *const pending_arg)
+ {
+ m_pending= pending_arg;
+ }
+
+ void set_incident(void)
+ {
+ incident= TRUE;
+ }
+
+ bool has_incident(void)
+ {
+ return(incident);
+ }
+
+ void reset() // empty the cache and clear status, incident flag and statement position
+ {
+ bool cache_was_empty= empty(); // sampled before truncate() discards the content
+ bool truncate_file= (cache_log.file != -1 &&
+ my_b_write_tell(&cache_log) > CACHE_FILE_TRUNC_SIZE);
+ truncate(0,1); // Forget what's in cache
+ if (!cache_was_empty)
+ compute_statistics();
+ if (truncate_file)
+ my_chsize(cache_log.file, 0, 0, MYF(MY_WME)); // return value is not checked here
+
+ status= 0;
+ incident= FALSE;
+ before_stmt_pos= MY_OFF_T_UNDEF;
+ DBUG_ASSERT(empty());
+ }
+
+ my_off_t get_byte_position() const
+ {
+ return my_b_tell(&cache_log);
+ }
+
+ my_off_t get_prev_position()
+ {
+ return(before_stmt_pos);
+ }
+
+ void set_prev_position(my_off_t pos)
+ {
+ before_stmt_pos= pos;
+ }
+
+ void restore_prev_position() // roll the cache back to the recorded statement start
+ {
+ truncate(before_stmt_pos);
+ }
+
+ void restore_savepoint(my_off_t pos)
+ {
+ truncate(pos);
+ if (pos < before_stmt_pos) // the recorded statement start now lies beyond the end of the cache
+ before_stmt_pos= MY_OFF_T_UNDEF;
+ }
+
+ void set_binlog_cache_info(my_off_t param_max_binlog_cache_size,
+ ulong *param_ptr_binlog_cache_use,
+ ulong *param_ptr_binlog_cache_disk_use)
+ {
+ /*
+ The assertions guarantee that the set_binlog_cache_info is
+ called just once and information passed as parameters are
+ never zero.
+
+ This is done while calling the constructor binlog_cache_mngr.
+ We cannot set information in the constructor binlog_cache_data
+ because the space for binlog_cache_mngr is allocated through
+ a placement new.
+
+ In the future, we can refactor this and change it to avoid
+ the set_binlog_cache_info.
+ */
+ DBUG_ASSERT(saved_max_binlog_cache_size == 0);
+ DBUG_ASSERT(param_max_binlog_cache_size != 0);
+ DBUG_ASSERT(ptr_binlog_cache_use == 0);
+ DBUG_ASSERT(param_ptr_binlog_cache_use != 0);
+ DBUG_ASSERT(ptr_binlog_cache_disk_use == 0);
+ DBUG_ASSERT(param_ptr_binlog_cache_disk_use != 0);
+
+ saved_max_binlog_cache_size= param_max_binlog_cache_size;
+ ptr_binlog_cache_use= param_ptr_binlog_cache_use;
+ ptr_binlog_cache_disk_use= param_ptr_binlog_cache_disk_use;
+ cache_log.end_of_file= saved_max_binlog_cache_size; // the configured maximum becomes the cache's end_of_file limit
+ }
+
+ void add_status(enum_logged_status status_arg)
+ {
+ status|= status_arg;
+ }
+
+ /*
+ Cache to store data before copying it to the binary log.
+ */
+ IO_CACHE cache_log;
+
+private:
+ /*
+ Pending binrows event. This event is the event where the rows are currently
+ written.
+ */
+ Rows_log_event *m_pending;
+
+ /*
+ Bit flags for what has been written to the cache. Used to
+ discard logs without any data changes.
+ see enum_logged_status;
+ */
+ uint32 status;
+
+ /*
+ Binlog position before the start of the current statement.
+ MY_OFF_T_UNDEF when no position is recorded.
+ */
+ my_off_t before_stmt_pos;
+
+ /*
+ This indicates that some events did not get into the cache and most likely
+ it is corrupted.
+ */
+ bool incident;
+
+ /**
+ This function computes binlog cache and disk usage.
+
+ The cache-use counter is always incremented. The disk-use counter is
+ updated only when the cache has performed disk writes, after which
+ cache_log.disk_writes is cleared.
+ */
+ void compute_statistics()
+ {
+ statistic_increment(*ptr_binlog_cache_use, &LOCK_status);
+ if (cache_log.disk_writes != 0)
+ {
+#ifdef REAL_STATISTICS
+ statistic_add(*ptr_binlog_cache_disk_use,
+ cache_log.disk_writes, &LOCK_status);
+#else
+ statistic_increment(*ptr_binlog_cache_disk_use, &LOCK_status);
+#endif
+ cache_log.disk_writes= 0;
+ }
+ }
+
+ /*
+ Stores the values of maximum size of the cache allowed when this cache
+ is configured. This corresponds to either
+ . max_binlog_cache_size or max_binlog_stmt_cache_size.
+ */
+ my_off_t saved_max_binlog_cache_size;
+
+ /*
+ Stores a pointer to the status variable that keeps track of the in-memory
+ cache usage. This corresponds to either
+ . binlog_cache_use or binlog_stmt_cache_use.
+ */
+ ulong *ptr_binlog_cache_use;
+
+ /*
+ Stores a pointer to the status variable that keeps track of the disk
+ cache usage. This corresponds to either
+ . binlog_cache_disk_use or binlog_stmt_cache_disk_use.
+ */
+ ulong *ptr_binlog_cache_disk_use;
+
+ /*
+ It truncates the cache to a certain position. This includes deleting the
+ pending event.
+
+ The cache error flag is cleared, the cache is re-initialised as a
+ WRITE_CACHE at pos, and the size limit is re-applied afterwards.
+ */
+ void truncate(my_off_t pos, bool reset_cache=0)
+ {
+ DBUG_PRINT("info", ("truncating to position %lu", (ulong) pos)); // pos is narrowed to ulong for the trace output only
+ cache_log.error=0;
+ if (pending())
+ {
+ delete pending();
+ set_pending(0);
+ }
+ reinit_io_cache(&cache_log, WRITE_CACHE, pos, 0, reset_cache);
+ cache_log.end_of_file= saved_max_binlog_cache_size;
+ }
+
+ binlog_cache_data& operator=(const binlog_cache_data& info); // declared only: not assignable
+ binlog_cache_data(const binlog_cache_data& info); // declared only: not copyable
+};
+
+
+/*
+  Pass a logged-status flag on to the attached cache data.
+  Does nothing when the writer has no cache data.
+*/
+void Log_event_writer::add_status(enum_logged_status status)
+{
+  if (!likely(cache_data))
+    return;
+
+  cache_data->add_status(status);
+}
+
+/*
+  Flag the attached cache data as having suffered an incident.
+
+  cache_data is treated as optional by add_status(); apply the same guard
+  here so that a writer without cache data becomes a no-op instead of
+  dereferencing a null pointer.
+*/
+void Log_event_writer::set_incident()
+{
+  if (likely(cache_data))
+    cache_data->set_incident();
+}
+
+
+/*
+  Bundles the statement cache and the transaction cache together with the
+  bookkeeping needed while a transaction is being committed to the binlog.
+
+  Copying and assignment are declared private and left undefined.
+*/
+class binlog_cache_mngr {
+public:
+  /*
+    Wire both caches to their size limits and status counters.
+
+    Every scalar member is initialised explicitly (in declaration order)
+    so that the object does not depend on its storage having been zeroed
+    before construction.
+  */
+  binlog_cache_mngr(my_off_t param_max_binlog_stmt_cache_size,
+                    my_off_t param_max_binlog_cache_size,
+                    ulong *param_ptr_binlog_stmt_cache_use,
+                    ulong *param_ptr_binlog_stmt_cache_disk_use,
+                    ulong *param_ptr_binlog_cache_use,
+                    ulong *param_ptr_binlog_cache_disk_use)
+    : last_commit_pos_offset(0), using_xa(FALSE), xa_xid(0),
+      need_unlog(FALSE), binlog_id(0), delayed_error(FALSE),
+      gtid_flags3(0), sa_seq_no(0)
+  {
+    stmt_cache.set_binlog_cache_info(param_max_binlog_stmt_cache_size,
+                                     param_ptr_binlog_stmt_cache_use,
+                                     param_ptr_binlog_stmt_cache_disk_use);
+    trx_cache.set_binlog_cache_info(param_max_binlog_cache_size,
+                                    param_ptr_binlog_cache_use,
+                                    param_ptr_binlog_cache_disk_use);
+    last_commit_pos_file[0]= 0;
+  }
+
+  /*
+    Reset the selected caches. Resetting the transaction cache also
+    clears using_xa and the last commit position.
+  */
+  void reset(bool do_stmt, bool do_trx)
+  {
+    if (do_stmt)
+      stmt_cache.reset();
+    if (do_trx)
+    {
+      trx_cache.reset();
+      using_xa= FALSE;
+      last_commit_pos_file[0]= 0;
+      last_commit_pos_offset= 0;
+    }
+  }
+
+  /* Select the transaction cache (true) or the statement cache (false). */
+  binlog_cache_data* get_binlog_cache_data(bool is_transactional)
+  {
+    return (is_transactional ? &trx_cache : &stmt_cache);
+  }
+
+  /* Same selection as above, returning the underlying IO_CACHE. */
+  IO_CACHE* get_binlog_cache_log(bool is_transactional)
+  {
+    return (is_transactional ? &trx_cache.cache_log : &stmt_cache.cache_log);
+  }
+
+  binlog_cache_data stmt_cache;
+
+  binlog_cache_data trx_cache;
+
+  /*
+    Binlog position for current transaction.
+    For START TRANSACTION WITH CONSISTENT SNAPSHOT, this is the binlog
+    position corresponding to the snapshot taken. During (and after) commit,
+    this is set to the binlog position corresponding to just after the
+    commit (so storage engines can store it in their transaction log).
+  */
+  char last_commit_pos_file[FN_REFLEN];
+  my_off_t last_commit_pos_offset;
+
+  /*
+    Flag set true if this transaction is committed with log_xid() as part of
+    XA, false if not.
+  */
+  bool using_xa;
+  my_xid xa_xid;
+  bool need_unlog;
+  /*
+    Id of binlog that transaction was written to; only needed if need_unlog is
+    true.
+  */
+  ulong binlog_id;
+  /* Set if we get an error during commit that must be returned from unlog(). */
+  bool delayed_error;
+  /* Will be reset when gtid is written into binlog */
+  uchar gtid_flags3;
+  decltype (rpl_gtid::seq_no) sa_seq_no;
+private:
+
+  binlog_cache_mngr& operator=(const binlog_cache_mngr& info);
+  binlog_cache_mngr(const binlog_cache_mngr& info);
+};
+
+/**
+ The function handles the first phase of two-phase binlogged ALTER.
+ On master binlogs START ALTER when that is configured to do so.
+ On slave START ALTER gets binlogged and its gtid committed into gtid slave pos
+ table.
+
+ @param thd Thread handle.
+ @param[out]
+ partial_alter Is set to true when Start Alter phase is completed.
+ It is never set to false here.
+ @param start_alter_id Start Alter identifier or zero. Only looked at
+ when built with HAVE_REPLICATION.
+ @param if_exists True indicates the binary logging of the query
+ should be done with "if exists" option.
+
+ @return false on success, true on failure. On the slave path true is
+ also returned when the master info was marked as shutting down
+ at registration time.
+ @return @c partial_alter set to @c true when START ALTER phase
+ has been completed
+*/
+bool write_bin_log_start_alter(THD *thd, bool& partial_alter,
+ uint64 start_alter_id, bool if_exists)
+{
+#if defined(HAVE_REPLICATION)
+ if (thd->variables.option_bits & OPTION_BIN_TMP_LOG_OFF)
+ return false;
+
+ if (start_alter_id) // non-zero id: slave-side START ALTER handling
+ {
+ if (thd->rgi_slave->get_finish_event_group_called())
+ return false; // can get here through retrying
+
+ DBUG_EXECUTE_IF("at_write_start_alter", {
+ debug_sync_set_action(thd,
+ STRING_WITH_LEN("now wait_for alter_cont"));
+ });
+
+ Master_info *mi= thd->rgi_slave->rli->mi;
+ start_alter_info *info= thd->rgi_slave->sa_info;
+ bool is_shutdown= false;
+
+ info->sa_seq_no= start_alter_id;
+ info->domain_id= thd->variables.gtid_domain_id;
+ mysql_mutex_lock(&mi->start_alter_list_lock);
+ // possible stop-slave's marking of the whole alter state list is checked
+ is_shutdown= mi->is_shutdown;
+ mi->start_alter_list.push_back(info, &mi->mem_root);
+ mysql_mutex_unlock(&mi->start_alter_list_lock);
+ info->state= start_alter_state::REGISTERED;
+ thd->rgi_slave->commit_orderer.wait_for_prior_commit(thd);
+ thd->rgi_slave->start_alter_ev->update_pos(thd->rgi_slave);
+ if (mysql_bin_log.is_open())
+ {
+ Write_log_with_flags wlwf (thd, Gtid_log_event::FL_START_ALTER_E1);
+ if (write_bin_log(thd, true, thd->query(), thd->query_length()))
+ {
+ DBUG_ASSERT(thd->is_error());
+ return true;
+ }
+ }
+ thd->rgi_slave->mark_start_commit();
+ thd->wakeup_subsequent_commits(0);
+ thd->rgi_slave->finish_start_alter_event_group();
+
+ if (is_shutdown)
+ {
+ /* SA exits abruptly and will notify any CA|RA waiter. */
+ mysql_mutex_lock(&mi->start_alter_lock);
+ /*
+ If there is (or will be) unlikely any CA it will execute
+ the whole query before to stop itself.
+ */
+ info->direct_commit_alter= true;
+ info->state= start_alter_state::ROLLBACK_ALTER;
+ mysql_mutex_unlock(&mi->start_alter_lock);
+
+ return true;
+ }
+
+ return false;
+ }
+#endif
+
+#ifndef WITH_WSREP
+ rpl_group_info *rgi= thd->rgi_slave ? thd->rgi_slave : thd->rgi_fake;
+#else
+ rpl_group_info *rgi= thd->slave_thread ? thd->rgi_slave :
+ WSREP(thd) ? (thd->wsrep_rgi ? thd->wsrep_rgi : thd->rgi_fake) :
+ thd->rgi_fake;
+#endif
+
+ if (!rgi && thd->variables.binlog_alter_two_phase) // no group info and two-phase ALTER logging requested
+ {
+ /* slave applier can handle here only regular ALTER */
+ DBUG_ASSERT(!rgi || !(rgi->gtid_ev_flags_extra & // NOTE(review): rgi is always NULL in this branch, so this holds trivially
+ (Gtid_log_event::FL_START_ALTER_E1 |
+ Gtid_log_event::FL_COMMIT_ALTER_E1 |
+ Gtid_log_event::FL_ROLLBACK_ALTER_E1)));
+
+ /*
+ After logging binlog state stays flagged with SA flags3 and seq_no.
+ The state is not reset after write_bin_log() is done which is
+ deferred for the second logging phase.
+ */
+ thd->set_binlog_flags_for_alter(Gtid_log_event::FL_START_ALTER_E1);
+ if(write_bin_log_with_if_exists(thd, false, false, if_exists, false))
+ {
+ DBUG_ASSERT(thd->is_error());
+
+ thd->set_binlog_flags_for_alter(0); // logging failed: clear the START ALTER flag again
+ return true;
+ }
+ partial_alter= true;
+ }
+ else if (rgi && rgi->direct_commit_alter)
+ {
+ DBUG_ASSERT(rgi->gtid_ev_flags_extra &
+ Gtid_log_event::FL_COMMIT_ALTER_E1);
+
+ partial_alter= true;
+ }
+
+ return false;
+}
+
+/*
+  Tell whether logging to the given log table is currently active.
+
+  That requires a table log handler, the corresponding log being switched
+  on (sql_log_slow for QUERY_LOG_SLOW, opt_log for QUERY_LOG_GENERAL) and
+  LOG_TABLE being part of log_output_options. Any other type asserts in
+  debug builds and yields FALSE.
+*/
+bool LOGGER::is_log_table_enabled(uint log_table_type)
+{
+  bool log_switched_on;
+
+  switch (log_table_type) {
+  case QUERY_LOG_SLOW:
+    log_switched_on= global_system_variables.sql_log_slow;
+    break;
+  case QUERY_LOG_GENERAL:
+    log_switched_on= opt_log;
+    break;
+  default:
+    DBUG_ASSERT(0);
+    return FALSE;                             /* make compiler happy */
+  }
+
+  return (table_log_handler != NULL) && log_switched_on
+         && (log_output_options & LOG_TABLE);
+}
+
+/**
+  Check whether a table is one of the log tables mysql.general_log or
+  mysql.slow_log.
+
+  @param table            Table to check
+  @param check_if_opened  When true, the table only counts as a log table
+                          if logging to it is currently enabled
+  @param error_msg        String to put in the ER_BAD_LOG_STATEMENT error
+                          raised when the table is reported as a log table.
+                          No error is raised if 0
+
+  @return 0  not a log table, or a log table that is not in use while
+             @c check_if_opened is set
+  @return #  Type of log (QUERY_LOG_GENERAL or QUERY_LOG_SLOW)
+*/
+
+int check_if_log_table(const TABLE_LIST *table,
+                       bool check_if_opened,
+                       const char *error_msg)
+{
+  /* Log tables live in the "mysql" schema only. */
+  if (table->db.length != 5 ||
+      my_strcasecmp(table_alias_charset, table->db.str, "mysql"))
+    return 0;
+
+  const char *name= table->table_name.str;
+  int log_type;
+
+  if (table->table_name.length == 11 &&
+      !my_strcasecmp(table_alias_charset, name, "general_log"))
+    log_type= QUERY_LOG_GENERAL;
+  else if (table->table_name.length == 8 &&
+           !my_strcasecmp(table_alias_charset, name, "slow_log"))
+    log_type= QUERY_LOG_SLOW;
+  else
+    return 0;
+
+  /* An idle log table is acceptable when only open ones matter. */
+  if (check_if_opened && !logger.is_log_table_enabled(log_type))
+    return 0;
+
+  if (error_msg)
+    my_error(ER_BAD_LOG_STATEMENT, MYF(0), error_msg);
+  return log_type;
+}
+
+
+Log_to_csv_event_handler::Log_to_csv_event_handler() = default; // nothing to set up
+
+
+Log_to_csv_event_handler::~Log_to_csv_event_handler() = default; // nothing to release
+
+
+void Log_to_csv_event_handler::cleanup() // marks the log tables as no longer initialised in the global logger
+{
+ logger.is_log_tables_initialized= FALSE;
+}
+
+/* log event handlers */
+
+/**
+ Log command to the general log table
+
+ Log given command to the general log table.
+
+ @param thd thread handle; used to open the log table and
+ to install the error-silencing handler
+ @param event_time command start timestamp
+ @param user_host the pointer to the string with user@host info
+ @param user_host_len length of the user_host string. this is computed
+ once and passed to all general log event handlers
+ @param thread_id_arg Id of the thread, issued a query
+ @param command_type the type of the command being logged
+ @param command_type_len the length of the string above
+ @param sql_text the very text of the query being executed
+ @param sql_text_len the length of sql_text string
+ @param client_cs character set handed to store() for the
+ user_host, command_type and sql_text columns
+
+
+ @return This function attempts to never call my_error(). This is
+ necessary, because general logging happens already after a statement
+ status has been sent to the client, so the client can not see the
+ error anyway. Besides, the error is not related to the statement
+ being executed and is internal, and thus should be handled
+ internally (@todo: how?).
+ If a write to the table has failed, the function attempts to
+ write a short error message with sql_print_error() (unless the
+ thread was killed). The failure is also indicated in the return value.
+
+ @retval FALSE OK
+ @retval TRUE error occurred
+*/
+
+bool Log_to_csv_event_handler::
+ log_general(THD *thd, my_hrtime_t event_time, const char *user_host, size_t user_host_len, my_thread_id thread_id_arg,
+ const char *command_type, size_t command_type_len,
+ const char *sql_text, size_t sql_text_len,
+ CHARSET_INFO *client_cs)
+{
+ TABLE_LIST table_list;
+ TABLE *table;
+ bool result= TRUE; // pessimistic default; cleared only after a successful ha_write_row()
+ bool need_close= FALSE;
+ bool need_pop= FALSE;
+ bool need_rnd_end= FALSE;
+ uint field_index;
+ Silence_log_table_errors error_handler;
+ Open_tables_backup open_tables_backup;
+ THD::used_t save_time_zone_used= thd->used & THD::TIME_ZONE_USED;
+ DBUG_ENTER("log_general");
+
+ /*
+ CSV uses TIME_to_timestamp() internally if table needs to be repaired
+ which will set TIME_ZONE_USED. The flag saved above is put back before
+ returning.
+ */
+
+ table_list.init_one_table(&MYSQL_SCHEMA_NAME, &GENERAL_LOG_NAME, 0,
+ TL_WRITE_CONCURRENT_INSERT);
+
+ /*
+ 1) open_log_table generates an error if the
+ table can not be opened or is corrupted.
+ 2) "INSERT INTO general_log" can generate warning sometimes.
+
+ Suppress these warnings and errors, they can't be dealt with
+ properly anyway.
+
+ QQ: this problem needs to be studied in more detail.
+ Comment these 2 lines and run "cast.test" to see what's happening.
+ */
+ thd->push_internal_handler(& error_handler);
+ need_pop= TRUE;
+
+ if (!(table= open_log_table(thd, &table_list, &open_tables_backup)))
+ goto err;
+
+ need_close= TRUE;
+
+ if (table->file->extra(HA_EXTRA_MARK_AS_LOG_TABLE) ||
+ table->file->ha_rnd_init_with_error(0))
+ goto err;
+
+ need_rnd_end= TRUE;
+
+ /* Honor next number columns if present */
+ table->next_number_field= table->found_next_number_field;
+
+ /*
+ NOTE: we do not call restore_record() here, as all fields are
+ filled by the Logger (=> no need to load default ones).
+ */
+
+ /*
+ table->field[0] (the event time) is not left to a column default:
+ it is stored explicitly from event_time below.
+ */
+
+ /* check that all columns exist */
+ if (table->s->fields < 6)
+ goto err;
+
+ DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);
+
+ table->field[0]->store_timestamp(
+ hrtime_to_my_time(event_time), hrtime_sec_part(event_time));
+
+ /* do a write: user_host, thread id, server id and command type */
+ if (table->field[1]->store(user_host, user_host_len, client_cs) ||
+ table->field[2]->store((longlong) thread_id_arg, TRUE) ||
+ table->field[3]->store((longlong) global_system_variables.server_id,
+ TRUE) ||
+ table->field[4]->store(command_type, command_type_len, client_cs))
+ goto err;
+
+ /*
+ A positive return value in store() means truncation.
+ Still logging a message in the log in this case.
+ */
+ table->field[5]->flags|= FIELDFLAG_HEX_ESCAPE;
+ if (table->field[5]->store(sql_text, sql_text_len, client_cs) < 0)
+ goto err;
+
+ /* mark all fields as not null */
+ table->field[1]->set_notnull();
+ table->field[2]->set_notnull();
+ table->field[3]->set_notnull();
+ table->field[4]->set_notnull();
+ table->field[5]->set_notnull();
+
+ /* Set any extra columns to their default values */
+ for (field_index= 6 ; field_index < table->s->fields ; field_index++)
+ {
+ table->field[field_index]->set_default();
+ }
+
+ if (table->file->ha_write_row(table->record[0]))
+ goto err;
+
+ result= FALSE;
+
+err: // common exit: also reached on success, with result == FALSE
+ if (result && !thd->killed)
+ sql_print_error("Failed to write to mysql.general_log: %s",
+ error_handler.message());
+
+ if (need_rnd_end)
+ {
+ table->file->ha_rnd_end();
+ table->file->ha_release_auto_increment();
+ }
+ if (need_pop)
+ thd->pop_internal_handler();
+ if (need_close)
+ close_log_table(thd, &open_tables_backup);
+
+ thd->used= (thd->used & ~THD::TIME_ZONE_USED) | save_time_zone_used; // restore the caller's TIME_ZONE_USED bit
+ DBUG_RETURN(result);
+}
+
+
+/*
+ Log a query to the slow log table
+
+ SYNOPSIS
+ log_slow()
+ thd THD of the query; also the source of the client character
+ set, row counts, current database, insert-id data and
+ thread id stored in the row
+ current_time current timestamp
+ user_host the pointer to the string with user@host info
+ user_host_len length of the user_host string. this is computed once
+ and passed to all general log event handlers
+ query_utime Amount of time the query took to execute (in microseconds)
+ lock_utime Amount of time the query was locked (in microseconds)
+ is_command The flag, which determines, whether the sql_text is a
+ query or an administrator command (these are treated
+ differently by the old logging routines).
+ Not used by this handler.
+ sql_text the very text of the query or administrator command
+ processed
+ sql_text_len the length of sql_text string
+
+ DESCRIPTION
+
+ Log a query to the slow log table. Errors raised while writing are
+ silenced; on failure a message is written with sql_print_error()
+ unless the thread was killed.
+
+ RETURN
+ FALSE - OK
+ TRUE - error occurred
+*/
+
+bool Log_to_csv_event_handler::
+ log_slow(THD *thd, my_hrtime_t current_time,
+ const char *user_host, size_t user_host_len,
+ ulonglong query_utime, ulonglong lock_utime, bool is_command,
+ const char *sql_text, size_t sql_text_len)
+{
+ TABLE_LIST table_list;
+ TABLE *table;
+ bool result= TRUE; // pessimistic default; cleared only after a successful ha_write_row()
+ bool need_close= FALSE;
+ bool need_rnd_end= FALSE;
+ Silence_log_table_errors error_handler;
+ Open_tables_backup open_tables_backup;
+ CHARSET_INFO *client_cs= thd->variables.character_set_client;
+ THD::used_t save_time_zone_used= thd->used & THD::TIME_ZONE_USED;
+ ulong query_time= (ulong) MY_MIN(query_utime/1000000, TIME_MAX_VALUE_SECONDS); // whole seconds, capped
+ ulong lock_time= (ulong) MY_MIN(lock_utime/1000000, TIME_MAX_VALUE_SECONDS); // whole seconds, capped
+ ulong query_time_micro= (ulong) (query_utime % 1000000); // sub-second remainder
+ ulong lock_time_micro= (ulong) (lock_utime % 1000000); // sub-second remainder
+ DBUG_ENTER("Log_to_csv_event_handler::log_slow");
+
+ thd->push_internal_handler(& error_handler);
+
+ table_list.init_one_table(&MYSQL_SCHEMA_NAME, &SLOW_LOG_NAME, 0,
+ TL_WRITE_CONCURRENT_INSERT);
+
+ if (!(table= open_log_table(thd, &table_list, &open_tables_backup)))
+ goto err;
+
+ need_close= TRUE;
+
+ if (table->file->extra(HA_EXTRA_MARK_AS_LOG_TABLE) ||
+ table->file->ha_rnd_init_with_error(0))
+ goto err;
+
+ need_rnd_end= TRUE;
+
+ /* Honor next number columns if present */
+ table->next_number_field= table->found_next_number_field;
+
+ restore_record(table, s->default_values); // Get empty record
+
+ /* check that all columns exist */
+ if (table->s->fields < 13)
+ goto err;
+
+ /* store the time and user values */
+ DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);
+ table->field[0]->store_timestamp(
+ hrtime_to_my_time(current_time), hrtime_sec_part(current_time));
+ if (table->field[1]->store(user_host, user_host_len, client_cs))
+ goto err;
+
+ /*
+ A TIME field can not hold the full longlong range; query_time or
+ lock_time may be truncated without warning here, if greater than
+ 839 hours (~35 days)
+ */
+ MYSQL_TIME t;
+ t.neg= 0;
+
+ /* fill in query_time field */
+ calc_time_from_sec(&t, query_time, query_time_micro);
+ if (table->field[2]->store_time(&t))
+ goto err;
+ /* lock_time */
+ calc_time_from_sec(&t, lock_time, lock_time_micro);
+ if (table->field[3]->store_time(&t))
+ goto err;
+ /* rows_sent */
+ if (table->field[4]->store((longlong) thd->get_sent_row_count(), TRUE))
+ goto err;
+ /* rows_examined */
+ if (table->field[5]->store((longlong) thd->get_examined_row_count(), TRUE))
+ goto err;
+
+ /* fill database field; left at its default when no database is selected */
+ if (thd->db.str)
+ {
+ if (table->field[6]->store(thd->db.str, thd->db.length, client_cs))
+ goto err;
+ table->field[6]->set_notnull();
+ }
+
+ if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
+ {
+ if (table->
+ field[7]->store((longlong)
+ thd->first_successful_insert_id_in_prev_stmt_for_binlog,
+ TRUE))
+ goto err;
+ table->field[7]->set_notnull();
+ }
+
+ /*
+ Set value if we do an insert on autoincrement column. Note that for
+ some engines (those for which get_auto_increment() does not leave a
+ table lock until the statement ends), this is just the first value and
+ the next ones used may not be contiguous to it.
+ */
+ if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
+ {
+ if (table->
+ field[8]->store((longlong)
+ thd->auto_inc_intervals_in_cur_stmt_for_binlog.minimum(), TRUE))
+ goto err;
+ table->field[8]->set_notnull();
+ }
+
+ if (table->field[9]->store((longlong)global_system_variables.server_id, TRUE))
+ goto err;
+ table->field[9]->set_notnull();
+
+ /*
+ Column sql_text.
+ A positive return value in store() means truncation.
+ Still logging a message in the log in this case.
+ */
+ if (table->field[10]->store(sql_text, sql_text_len, client_cs) < 0)
+ goto err;
+
+ if (table->field[11]->store((longlong) thd->thread_id, TRUE)) // thread id
+ goto err;
+
+ /* Rows_affected: taken from the diagnostics area when it is OK, otherwise 0 */
+ if (table->field[12]->store(thd->get_stmt_da()->is_ok() ?
+ (longlong) thd->get_stmt_da()->affected_rows() :
+ 0, TRUE))
+ goto err;
+
+ if (table->file->ha_write_row(table->record[0]))
+ goto err;
+
+ result= FALSE;
+
+err: // common exit: also reached on success, with result == FALSE
+ thd->pop_internal_handler();
+
+ if (result && !thd->killed)
+ sql_print_error("Failed to write to mysql.slow_log: %s",
+ error_handler.message());
+
+ if (need_rnd_end)
+ {
+ table->file->ha_rnd_end();
+ table->file->ha_release_auto_increment();
+ }
+ if (need_close)
+ close_log_table(thd, &open_tables_backup);
+ thd->used= (thd->used & ~THD::TIME_ZONE_USED) | save_time_zone_used; // restore the caller's TIME_ZONE_USED bit
+ DBUG_RETURN(result);
+}
+
+/*
+  Verify that the requested log table can be opened.
+
+  The table (mysql.general_log for QUERY_LOG_GENERAL, mysql.slow_log for
+  QUERY_LOG_SLOW) is opened and closed again straight away; nothing is
+  written to it.
+
+  Returns 0 when the table could be opened, 1 otherwise.
+*/
+int Log_to_csv_event_handler::
+  activate_log(THD *thd, uint log_table_type)
+{
+  TABLE_LIST log_table;
+  Open_tables_backup backup;
+
+  DBUG_ENTER("Log_to_csv_event_handler::activate_log");
+
+  DBUG_ASSERT(log_table_type == QUERY_LOG_GENERAL ||
+              log_table_type == QUERY_LOG_SLOW);
+
+  LEX_CSTRING *name= (log_table_type == QUERY_LOG_GENERAL) ?
+                     &GENERAL_LOG_NAME : &SLOW_LOG_NAME;
+
+  log_table.init_one_table(&MYSQL_SCHEMA_NAME, name, 0,
+                           TL_WRITE_CONCURRENT_INSERT);
+
+  if (!open_log_table(thd, &log_table, &backup))
+    DBUG_RETURN(1);
+
+  close_log_table(thd, &backup);
+  DBUG_RETURN(0);
+}
+
+bool Log_to_csv_event_handler::
+  log_error(enum loglevel level, const char *format, va_list args)
+{
+  /*
+    There is no error-log table, so this handler must never be reached:
+    debug builds trap here, other builds return FALSE.
+  */
+  DBUG_ASSERT(0);
+  return FALSE;
+}
+
+bool Log_to_file_event_handler::
+ log_error(enum loglevel level, const char *format,
+ va_list args)
+{
+ return vprint_msg_to_log(level, format, args);
+}
+
+/** Initialize the pthread objects of the general and the slow query log. */
+void Log_to_file_event_handler::init_pthread_objects()
+{
+  mysql_log.init_pthread_objects();
+  mysql_slow_log.init_pthread_objects();
+}
+
+
+/**
+  Write one entry to the slow query log file.
+
+  Thin wrapper around mysql_slow_log.write(); a Silence_log_table_errors
+  handler is installed on the THD for the duration of the write.
+
+  @return the value returned by mysql_slow_log.write()
+*/
+
+bool Log_to_file_event_handler::
+  log_slow(THD *thd, my_hrtime_t current_time,
+           const char *user_host, size_t user_host_len,
+           ulonglong query_utime, ulonglong lock_utime, bool is_command,
+           const char *sql_text, size_t sql_text_len)
+{
+  Silence_log_table_errors silencer;
+
+  thd->push_internal_handler(&silencer);
+  const bool failed= mysql_slow_log.write(thd, hrtime_to_my_time(current_time),
+                                          user_host, user_host_len,
+                                          query_utime, lock_utime, is_command,
+                                          sql_text, sql_text_len);
+  thd->pop_internal_handler();
+
+  return failed;
+}
+
+
+/**
+  Write one entry to the general query log file.
+
+  Thin wrapper around mysql_log.write(); it exists so that all log event
+  handlers share the same signature. The client_cs argument is not used by
+  the file handler.
+
+  @return the value returned by mysql_log.write()
+*/
+
+bool Log_to_file_event_handler::
+  log_general(THD *thd, my_hrtime_t event_time, const char *user_host, size_t user_host_len, my_thread_id thread_id_arg,
+              const char *command_type, size_t command_type_len,
+              const char *sql_text, size_t sql_text_len,
+              CHARSET_INFO *client_cs)
+{
+  Silence_log_table_errors silencer;
+
+  thd->push_internal_handler(&silencer);
+  const bool failed= mysql_log.write(hrtime_to_time(event_time),
+                                     user_host, user_host_len,
+                                     thread_id_arg,
+                                     command_type, command_type_len,
+                                     sql_text, sql_text_len);
+  thd->pop_internal_handler();
+
+  return failed;
+}
+
+
+/**
+  Open the slow and general log files that are currently enabled.
+
+  Only the first call does any work; later calls are no-ops.
+
+  @return always FALSE
+*/
+bool Log_to_file_event_handler::init()
+{
+  if (is_initialized)
+    return FALSE;
+
+  if (global_system_variables.sql_log_slow)
+    mysql_slow_log.open_slow_log(opt_slow_logname);
+
+  if (opt_log)
+    mysql_log.open_query_log(opt_logname);
+
+  is_initialized= TRUE;
+  return FALSE;
+}
+
+
+/** Clean up the general log first, then the slow query log. */
+void Log_to_file_event_handler::cleanup()
+{
+  mysql_log.cleanup();
+  mysql_slow_log.cleanup();
+}
+
+/** Reopen the log files of every query log that is currently enabled. */
+void Log_to_file_event_handler::flush()
+{
+  const bool general_enabled= opt_log;
+  if (general_enabled)
+    mysql_log.reopen_file();
+
+  const bool slow_enabled= global_system_variables.sql_log_slow;
+  if (slow_enabled)
+    mysql_slow_log.reopen_file();
+}
+
+/*
+  Log error with all enabled log event handlers
+
+  SYNOPSIS
+    error_log_print()
+
+    level             The level of the error significance: NOTE,
+                      WARNING or ERROR.
+    format            format string for the error message
+    args              list of arguments for the format string
+
+  NOTES
+    If there is a current THD, its error_printed_to_log flag is set.
+
+  RETURN
+    FALSE - OK
+    TRUE - error occurred (in at least one handler)
+*/
+
+bool LOGGER::error_log_print(enum loglevel level, const char *format,
+                             va_list args)
+{
+  bool error= FALSE;
+  Log_event_handler **current_handler;
+  THD *thd= current_thd;
+
+  if (likely(thd))
+    thd->error_printed_to_log= 1;
+
+  /* currently we don't need locking here as there is no error_log table */
+  for (current_handler= error_log_handler_list ; *current_handler ;
+       current_handler++)
+  {
+    /*
+      A va_list must not be reused once a callee has consumed it, so every
+      handler gets its own copy of the argument list.
+    */
+    va_list handler_args;
+    va_copy(handler_args, args);
+    error= (*current_handler)->log_error(level, format, handler_args) || error;
+    va_end(handler_args);
+  }
+
+  return error;
+}
+
+/**
+ First stage of logger shutdown: destroy LOCK_logger, clean up and delete
+ the table log handler, and clean up (but do not delete) the file log
+ handler. The logger must have been initialized (asserted).
+*/
+void LOGGER::cleanup_base()
+{
+ DBUG_ASSERT(inited == 1);
+ mysql_rwlock_destroy(&LOCK_logger);
+ if (table_log_handler)
+ {
+ table_log_handler->cleanup();
+ delete table_log_handler;
+ table_log_handler= NULL;
+ }
+ if (file_log_handler)
+ file_log_handler->cleanup(); /* the object itself is deleted in cleanup_end() */
+}
+
+
+/**
+  Final stage of logger shutdown: delete the file log handler and mark the
+  logger as not initialized. The logger must have been initialized (asserted).
+*/
+void LOGGER::cleanup_end()
+{
+  DBUG_ASSERT(inited == 1);
+
+  /* Deleting a NULL pointer is a no-op, so no explicit check is needed. */
+  delete file_log_handler;
+  file_log_handler= NULL;
+
+  inited= 0;
+}
+
+
+/**
+ Perform basic log initialization: create the file-based log handler (unless
+ one already exists), route the error log to it, and initialize the
+ handler's pthread objects and LOCK_logger.
+
+ Must only be called while the logger is not initialized (asserted); marks
+ the logger as initialized.
+*/
+void LOGGER::init_base()
+{
+ DBUG_ASSERT(inited == 0);
+ inited= 1;
+
+ /*
+ Here we create file log handler. We don't do it for the table log handler
+ here as it cannot be created so early. The reason is THD initialization,
+ which depends on the system variables (parsed later).
+ */
+ if (!file_log_handler)
+ file_log_handler= new Log_to_file_event_handler;
+
+ /* by default we use traditional error log */
+ init_error_log(LOG_FILE);
+
+ file_log_handler->init_pthread_objects();
+ mysql_rwlock_init(key_rwlock_LOCK_logger, &LOCK_logger);
+}
+
+
+/**
+  Create the table log handler if needed and initialize both handlers.
+
+  is_log_tables_initialized is set only when the table handler and the file
+  handler both report success; the file handler is not initialized if the
+  table handler fails.
+*/
+void LOGGER::init_log_tables()
+{
+  if (!table_log_handler)
+    table_log_handler= new Log_to_csv_event_handler;
+
+  if (is_log_tables_initialized)
+    return;
+
+  if (table_log_handler->init())
+    return;
+
+  if (file_log_handler->init())
+    return;
+
+  is_log_tables_initialized= TRUE;
+}
+
+
+/**
+  Close and reopen the slow log file while holding the logger lock
+  exclusively. Nothing is reopened when the slow log is disabled.
+
+  @returns FALSE.
+*/
+bool LOGGER::flush_slow_log()
+{
+  /*
+    Nobody may use the logging routines while the log is being reopened,
+    hence the exclusive lock.
+  */
+  logger.lock_exclusive();
+
+  const bool slow_enabled= global_system_variables.sql_log_slow;
+  if (slow_enabled)
+    file_log_handler->get_mysql_slow_log()->reopen_file();
+
+  logger.unlock();
+
+  return FALSE;
+}
+
+
+/**
+  Close and reopen the general log file while holding the logger lock
+  exclusively. Nothing is reopened when the general log is disabled.
+
+  @returns FALSE.
+*/
+bool LOGGER::flush_general_log()
+{
+  /*
+    Nobody may use the logging routines while the log is being reopened,
+    hence the exclusive lock.
+  */
+  logger.lock_exclusive();
+
+  const bool general_enabled= opt_log;
+  if (general_enabled)
+    file_log_handler->get_mysql_log()->reopen_file();
+
+  logger.unlock();
+
+  return FALSE;
+}
+
+
+/*
+ Log slow query with all enabled log event handlers
+
+ SYNOPSIS
+ slow_log_print()
+
+ thd THD of the query being logged
+ query The query being logged; if NULL (or the command is
+ COM_STMT_PREPARE) the command name is logged instead
+ query_length The length of the query string
+ current_utime Current time in microseconds (from undefined start)
+
+ NOTES
+ Nothing is logged (and FALSE is returned) when no slow log handler is
+ installed, when the session variable sql_log_slow is off, or when the
+ global sql_log_slow is off. The handlers are called under the shared
+ logger lock.
+
+ RETURN
+ FALSE OK
+ TRUE error occurred
+*/
+
+bool LOGGER::slow_log_print(THD *thd, const char *query, size_t query_length,
+ ulonglong current_utime)
+
+{
+ bool error= FALSE;
+ Log_event_handler **current_handler;
+ bool is_command= FALSE;
+ char user_host_buff[MAX_USER_HOST_SIZE + 1];
+ Security_context *sctx= thd->security_ctx;
+ uint user_host_len= 0;
+ ulonglong query_utime, lock_utime;
+
+ DBUG_ASSERT(thd->enable_slow_log);
+ /*
+ Print the message to the buffer if we have slow log enabled
+ */
+
+ if (*slow_log_handler_list)
+ {
+ /* do not log when slow logging is switched off for this session */
+ if (!thd->variables.sql_log_slow)
+ return 0;
+
+ lock_shared();
+ if (!global_system_variables.sql_log_slow) /* re-checked under the lock */
+ {
+ unlock();
+ return 0;
+ }
+
+ /* fill in user_host value: the format is "%s[%s] @ %s [%s]" */
+ user_host_len= (uint)(strxnmov(user_host_buff, MAX_USER_HOST_SIZE,
+ sctx->priv_user, "[",
+ sctx->user ? sctx->user : (thd->slave_thread ? "SQL_SLAVE" : ""), "] @ ",
+ sctx->host ? sctx->host : "", " [",
+ sctx->ip ? sctx->ip : "", "]", NullS) -
+ user_host_buff);
+
+ DBUG_ASSERT(thd->start_utime);
+ DBUG_ASSERT(thd->start_time);
+ query_utime= (current_utime - thd->start_utime); /* total execution time */
+ lock_utime= (thd->utime_after_lock - thd->start_utime); /* time until locks were taken */
+ my_hrtime_t current_time= { hrtime_from_time(thd->start_time) +
+ thd->start_time_sec_part + query_utime }; /* statement start + duration */
+
+ if (!query || thd->get_command() == COM_STMT_PREPARE)
+ {
+ is_command= TRUE;
+ query= command_name[thd->get_command()].str;
+ query_length= (uint)command_name[thd->get_command()].length;
+ }
+
+ for (current_handler= slow_log_handler_list; *current_handler ;)
+ error= (*current_handler++)->log_slow(thd, current_time,
+ user_host_buff, user_host_len,
+ query_utime, lock_utime, is_command,
+ query, query_length) || error;
+
+ unlock();
+ }
+ return error;
+}
+
+/**
+  Report a command to the audit interface and, if general logging applies
+  to it, to every installed general log handler.
+
+  @param thd           current thread (must not be NULL)
+  @param command       server command being logged
+  @param query         text to log
+  @param query_length  length of the text
+
+  @return FALSE on success (or when nothing was logged), TRUE if at least
+          one handler reported an error
+*/
+bool LOGGER::general_log_write(THD *thd, enum enum_server_command command,
+                               const char *query, size_t query_length)
+{
+  DBUG_ASSERT(thd);
+
+  char user_host_buff[MAX_USER_HOST_SIZE + 1];
+  const uint user_host_len= make_user_name(thd, user_host_buff);
+  const my_hrtime_t now= my_hrtime();
+  const char *cmd_str= command_name[(uint) command].str;
+  const size_t cmd_len= command_name[(uint) command].length;
+
+  /* The audit interface is notified even if the general log is disabled. */
+  mysql_audit_general_log(thd, hrtime_to_time(now),
+                          user_host_buff, user_host_len,
+                          cmd_str, (uint) cmd_len,
+                          query, (uint) query_length);
+
+  if (!opt_log || !log_command(thd, command))
+    return FALSE;
+
+  bool error= FALSE;
+  lock_shared();
+  for (Log_event_handler **handler= general_log_handler_list;
+       *handler;
+       handler++)
+  {
+    error= (*handler)->log_general(thd, now, user_host_buff,
+                                   user_host_len, thd->thread_id,
+                                   cmd_str, cmd_len,
+                                   query, query_length,
+                                   thd->variables.character_set_client) ||
+           error;
+  }
+  unlock();
+
+  return error;
+}
+
+/**
+  Format a message and pass it on to general_log_write().
+
+  A NULL format yields an empty message.
+
+  @return the value returned by general_log_write()
+*/
+bool LOGGER::general_log_print(THD *thd, enum enum_server_command command,
+                               const char *format, va_list args)
+{
+  char message_buff[MAX_LOG_BUFFER_SIZE];
+  size_t message_buff_len;
+
+  if (!format)
+  {
+    message_buff[0]= '\0';
+    message_buff_len= 0;
+  }
+  else
+  {
+    message_buff_len= my_vsnprintf(message_buff, sizeof(message_buff),
+                                   format, args);
+  }
+
+  return general_log_write(thd, command, message_buff, message_buff_len);
+}
+
+/**
+  Install the error log handlers selected by error_log_printer.
+
+  Only LOG_NONE and LOG_FILE are supported; the table-based variants trap
+  in debug builds and leave the handler list untouched.
+*/
+void LOGGER::init_error_log(ulonglong error_log_printer)
+{
+  if (error_log_printer & LOG_NONE)
+  {
+    error_log_handler_list[0]= 0;
+    return;
+  }
+
+  if (error_log_printer == LOG_FILE)
+  {
+    error_log_handler_list[0]= file_log_handler;
+    error_log_handler_list[1]= 0;
+    return;
+  }
+
+  /* logging errors to a table is disabled for now */
+  DBUG_ASSERT(error_log_printer != LOG_TABLE &&
+              error_log_printer != (LOG_TABLE|LOG_FILE));
+}
+
+/**
+  Install the slow log handlers selected by slow_log_printer.
+
+  The list is 0-terminated. Values other than LOG_NONE, LOG_FILE, LOG_TABLE
+  and LOG_TABLE|LOG_FILE leave the list unchanged.
+*/
+void LOGGER::init_slow_log(ulonglong slow_log_printer)
+{
+  if (slow_log_printer & LOG_NONE)
+  {
+    slow_log_handler_list[0]= 0;
+    return;
+  }
+
+  if (slow_log_printer == LOG_FILE)
+  {
+    slow_log_handler_list[0]= file_log_handler;
+    slow_log_handler_list[1]= 0;
+  }
+  else if (slow_log_printer == LOG_TABLE)
+  {
+    slow_log_handler_list[0]= table_log_handler;
+    slow_log_handler_list[1]= 0;
+  }
+  else if (slow_log_printer == (LOG_TABLE|LOG_FILE))
+  {
+    /* the file handler is called before the table handler */
+    slow_log_handler_list[0]= file_log_handler;
+    slow_log_handler_list[1]= table_log_handler;
+    slow_log_handler_list[2]= 0;
+  }
+}
+
+/**
+  Install the general log handlers selected by general_log_printer.
+
+  The list is 0-terminated. Values other than LOG_NONE, LOG_FILE, LOG_TABLE
+  and LOG_TABLE|LOG_FILE leave the list unchanged.
+*/
+void LOGGER::init_general_log(ulonglong general_log_printer)
+{
+  if (general_log_printer & LOG_NONE)
+  {
+    general_log_handler_list[0]= 0;
+    return;
+  }
+
+  if (general_log_printer == LOG_FILE)
+  {
+    general_log_handler_list[0]= file_log_handler;
+    general_log_handler_list[1]= 0;
+  }
+  else if (general_log_printer == LOG_TABLE)
+  {
+    general_log_handler_list[0]= table_log_handler;
+    general_log_handler_list[1]= 0;
+  }
+  else if (general_log_printer == (LOG_TABLE|LOG_FILE))
+  {
+    /* the file handler is called before the table handler */
+    general_log_handler_list[0]= file_log_handler;
+    general_log_handler_list[1]= table_log_handler;
+    general_log_handler_list[2]= 0;
+  }
+}
+
+/**
+ Enable the slow or the general query log.
+
+ Runs under the exclusive logger lock. If the log is already enabled
+ nothing is done. Otherwise the log file is opened and the log table is
+ probed via activate_log(); when the probe fails the file is closed again
+ and TRUE is returned, otherwise the handler list is rebuilt from
+ log_output_options and the enabling flag is set.
+
+ NOTE(review): the return value of open_slow_log()/open_query_log() is not
+ examined here, and table_log_handler is used without a NULL check -
+ presumably init_log_tables() has run before; confirm against callers.
+
+ @param thd current thread
+ @param log_type QUERY_LOG_SLOW or QUERY_LOG_GENERAL
+
+ @return FALSE on success (or if already enabled), TRUE on error
+*/
+bool LOGGER::activate_log_handler(THD* thd, uint log_type)
+{
+ MYSQL_QUERY_LOG *file_log;
+ bool res= FALSE;
+ lock_exclusive();
+ switch (log_type) {
+ case QUERY_LOG_SLOW:
+ if (!global_system_variables.sql_log_slow)
+ {
+ file_log= file_log_handler->get_mysql_slow_log();
+
+ file_log->open_slow_log(opt_slow_logname);
+ if (table_log_handler->activate_log(thd, QUERY_LOG_SLOW))
+ {
+ /* Error printed by open table in activate_log() */
+ res= TRUE;
+ file_log->close(0);
+ }
+ else
+ {
+ init_slow_log(log_output_options);
+ global_system_variables.sql_log_slow= TRUE;
+ }
+ }
+ break;
+ case QUERY_LOG_GENERAL:
+ if (!opt_log)
+ {
+ file_log= file_log_handler->get_mysql_log();
+
+ file_log->open_query_log(opt_logname);
+ if (table_log_handler->activate_log(thd, QUERY_LOG_GENERAL))
+ {
+ /* Error printed by open table in activate_log() */
+ res= TRUE;
+ file_log->close(0);
+ }
+ else
+ {
+ init_general_log(log_output_options);
+ opt_log= TRUE;
+ }
+ }
+ break;
+ default:
+ DBUG_ASSERT(0);
+ }
+ unlock();
+ return res;
+}
+
+/**
+ Disable the slow or the general query log.
+
+ Selects the enabling flag and the file log for log_type; if the flag is
+ already off nothing is done. Otherwise the file log is closed and the flag
+ cleared under the exclusive logger lock.
+
+ NOTE(review): the flag is tested before the lock is taken - confirm that
+ callers serialize enabling/disabling.
+
+ @param thd current thread (not used in this function)
+ @param log_type QUERY_LOG_SLOW or QUERY_LOG_GENERAL
+*/
+void LOGGER::deactivate_log_handler(THD *thd, uint log_type)
+{
+ my_bool *tmp_opt= 0;
+ MYSQL_LOG *UNINIT_VAR(file_log);
+
+ switch (log_type) {
+ case QUERY_LOG_SLOW:
+ tmp_opt= &global_system_variables.sql_log_slow;
+ file_log= file_log_handler->get_mysql_slow_log();
+ break;
+ case QUERY_LOG_GENERAL:
+ tmp_opt= &opt_log;
+ file_log= file_log_handler->get_mysql_log();
+ break;
+ default:
+ MY_ASSERT_UNREACHABLE();
+ }
+
+ if (!(*tmp_opt)) /* already disabled */
+ return;
+
+ lock_exclusive();
+ file_log->close(0);
+ *tmp_opt= FALSE;
+ unlock();
+}
+
+
+/* The table log handler needs no initialization; always reports success. */
+bool Log_to_csv_event_handler::init()
+{
+  return FALSE;
+}
+
+/**
+  Install the slow and general log handlers under the exclusive logger lock.
+
+  If table logging is requested for either log while the log tables are not
+  initialized, both logs fall back to file logging and an error is printed.
+
+  @return always 0
+*/
+int LOGGER::set_handlers(ulonglong slow_log_printer,
+                         ulonglong general_log_printer)
+{
+  lock_exclusive();
+
+  const bool table_requested= (slow_log_printer & LOG_TABLE) ||
+                              (general_log_printer & LOG_TABLE);
+
+  if (table_requested && !is_log_tables_initialized)
+  {
+    slow_log_printer= (slow_log_printer & ~LOG_TABLE) | LOG_FILE;
+    general_log_printer= (general_log_printer & ~LOG_TABLE) | LOG_FILE;
+
+    sql_print_error("Failed to initialize log tables. "
+                    "Falling back to the old-fashioned logs");
+  }
+
+  init_slow_log(slow_log_printer);
+  init_general_log(general_log_printer);
+
+  unlock();
+
+  return 0;
+}
+
+ /*
+  Save position of binary log transaction cache.
+
+  SYNOPSIS
+    binlog_trans_log_savepos()
+
+    thd      The thread to take the binlog data from
+    pos      Pointer to variable where the position will be stored
+             (must not be NULL)
+
+  DESCRIPTION
+
+    Save the current position in the binary log transaction cache into
+    the variable pointed to by 'pos'
+ */
+
+static void
+binlog_trans_log_savepos(THD *thd, my_off_t *pos)
+{
+  DBUG_ENTER("binlog_trans_log_savepos");
+  DBUG_ASSERT(pos != NULL);
+  binlog_cache_mngr *const cache_mngr= thd->binlog_setup_trx_data();
+  DBUG_ASSERT((WSREP(thd) && wsrep_emulate_bin_log) || mysql_bin_log.is_open());
+  *pos= cache_mngr->trx_cache.get_byte_position();
+  /* my_off_t may be wider than ulong, so trace it as ulonglong */
+  DBUG_PRINT("return", ("*pos: %llu", (ulonglong) *pos));
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Truncate the binary log transaction cache.
+
+  SYNOPSIS
+    binlog_trans_log_truncate()
+
+    thd      The thread to take the binlog data from
+    pos      Position to truncate to
+
+  DESCRIPTION
+
+    Restore the transaction cache to the savepoint position 'pos'.
+    Will not change anything else.
+
+*/
+static void
+binlog_trans_log_truncate(THD *thd, my_off_t pos)
+{
+  DBUG_ENTER("binlog_trans_log_truncate");
+  /* my_off_t may be wider than ulong, so trace it as ulonglong */
+  DBUG_PRINT("enter", ("pos: %llu", (ulonglong) pos));
+
+  DBUG_ASSERT(thd_get_ha_data(thd, binlog_hton) != NULL);
+  /* Only true if binlog_trans_log_savepos() wasn't called before */
+  DBUG_ASSERT(pos != ~(my_off_t) 0);
+
+  binlog_cache_mngr *const cache_mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+  cache_mngr->trx_cache.restore_savepoint(pos);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+ Initialize the binlog handlerton: remember it in binlog_hton and fill in
+ its callbacks and flags. Always returns 0.
+
+ this function is mostly a placeholder.
+ conceptually, binlog initialization (now mostly done in MYSQL_BIN_LOG::open)
+ should be moved here.
+*/
+
+int binlog_init(void *p)
+{
+ binlog_hton= (handlerton *)p;
+ binlog_hton->savepoint_offset= sizeof(my_off_t); /* a savepoint stores one cache position */
+ binlog_hton->close_connection= binlog_close_connection;
+ binlog_hton->savepoint_set= binlog_savepoint_set;
+ binlog_hton->savepoint_rollback= binlog_savepoint_rollback;
+ binlog_hton->savepoint_rollback_can_release_mdl=
+ binlog_savepoint_rollback_can_release_mdl;
+ binlog_hton->commit= [](handlerton *, THD *thd, bool all) { return 0; }; /* no-op, always succeeds */
+ binlog_hton->rollback= binlog_rollback;
+ binlog_hton->drop_table= [](handlerton *, const char*) { return -1; }; /* always returns -1 */
+ if (WSREP_ON || opt_bin_log)
+ {
+ binlog_hton->prepare= binlog_prepare;
+ binlog_hton->start_consistent_snapshot= binlog_start_consistent_snapshot;
+ }
+
+ binlog_hton->flags= HTON_NOT_USER_SELECTABLE | HTON_HIDDEN | HTON_NO_ROLLBACK;
+ return 0;
+}
+
+#ifdef WITH_WSREP
+#include "wsrep_binlog.h"
+#endif /* WITH_WSREP */
+static int binlog_close_connection(handlerton *hton, THD *thd)
+{ /*
+ Handlerton close_connection callback: destroys and frees the
+ binlog_cache_mngr attached to the THD. Both caches are expected to be
+ empty (asserted); under WSREP a non-empty trx cache is dumped first.
+ Always returns 0.
+
+ NOTE(review): the WSREP branch tests cache_mngr for NULL, but the code
+ after it dereferences cache_mngr unconditionally - confirm that this
+ callback is only invoked when the ha_data slot is set.
+ NOTE(review): the "stmt cache not empty" warning is printed whenever the
+ trx cache is non-empty, even if len turns out to be 0.
+ */
+ DBUG_ENTER("binlog_close_connection");
+ binlog_cache_mngr *const cache_mngr=
+ (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+#ifdef WITH_WSREP
+ if (WSREP(thd) && cache_mngr && !cache_mngr->trx_cache.empty()) {
+ IO_CACHE* cache= cache_mngr->get_binlog_cache_log(true); /* trx cache */
+ uchar *buf;
+ size_t len=0;
+ wsrep_write_cache_buf(cache, &buf, &len);
+ WSREP_WARN("binlog trx cache not empty (%zu bytes) @ connection close %lld",
+ len, (longlong) thd->thread_id);
+ if (len > 0) wsrep_dump_rbr_buf(thd, buf, len);
+
+ cache = cache_mngr->get_binlog_cache_log(false); /* stmt cache */
+ wsrep_write_cache_buf(cache, &buf, &len);
+ WSREP_WARN("binlog stmt cache not empty (%zu bytes) @ connection close %lld",
+ len, (longlong) thd->thread_id);
+ if (len > 0) wsrep_dump_rbr_buf(thd, buf, len);
+ }
+#endif /* WITH_WSREP */
+ DBUG_ASSERT(cache_mngr->trx_cache.empty());
+ DBUG_ASSERT(cache_mngr->stmt_cache.empty());
+ cache_mngr->~binlog_cache_mngr(); /* explicit destructor call, memory released by my_free() below */
+ my_free(cache_mngr);
+ DBUG_RETURN(0);
+}
+
+/*
+ This function flushes a cache upon commit/rollback.
+
+ SYNOPSIS
+ binlog_flush_cache()
+
+ thd The thread whose transaction should be ended
+ cache_mngr Pointer to the binlog_cache_mngr to use
+ end_ev The end event to use (COMMIT, ROLLBACK, or commit XID)
+ all True if the entire transaction should be ended, false if
+ only the statement transaction should be ended.
+ using_stmt True if the statement cache should be flushed
+ using_trx True if the transaction cache should be flushed
+ is_ro_1pc Passed through to write_transaction_to_binlog()
+ (defaults to false)
+
+ DESCRIPTION
+
+ End the current transaction or statement. The transaction can be either
+ a real transaction or a statement transaction.
+
+ This can be to commit a transaction, with a COMMIT query event or an XA
+ commit XID event. But it can also be to rollback a transaction with a
+ ROLLBACK query event, used for rolling back transactions which also
+ contain updates to non-transactional tables. Or it can be a flush of
+ a statement cache.
+
+ RETURN
+ 0 on success; 1 if flushing a pending rows event failed (in that case
+ the caches are not reset); otherwise the result of
+ write_transaction_to_binlog().
+ */
+
+static int
+binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
+ Log_event *end_ev, bool all, bool using_stmt,
+ bool using_trx, bool is_ro_1pc= false)
+{
+ int error= 0;
+ DBUG_ENTER("binlog_flush_cache");
+ DBUG_PRINT("enter", ("end_ev: %p", end_ev));
+
+ if ((using_stmt && !cache_mngr->stmt_cache.empty()) ||
+ (using_trx && !cache_mngr->trx_cache.empty()) ||
+ thd->transaction->xid_state.is_explicit_XA())
+ {
+ if (using_stmt && thd->binlog_flush_pending_rows_event(TRUE, FALSE))
+ DBUG_RETURN(1);
+ if (using_trx && thd->binlog_flush_pending_rows_event(TRUE, TRUE))
+ DBUG_RETURN(1);
+
+ /*
+ Doing a commit or a rollback including non-transactional tables,
+ i.e., ending a transaction where we might write the transaction
+ cache to the binary log.
+
+ We can always end the statement when ending a transaction since
+ transactions are not allowed inside stored functions. If they
+ were, we would have to ensure that we're not ending a statement
+ inside a stored function.
+ */
+ error= mysql_bin_log.write_transaction_to_binlog(thd, cache_mngr,
+ end_ev, all,
+ using_stmt, using_trx,
+ is_ro_1pc);
+ }
+ else
+ {
+ /*
+ This can happen in row-format binlog with something like
+ BEGIN; INSERT INTO nontrans_table; INSERT IGNORE INTO trans_table;
+ The nontrans_table is written directly into the binlog before commit,
+ and if the trans_table is ignored there will be no rows to write when
+ we get here.
+
+ So there is no work to do. Therefore, we will not increment any XID
+ count, so we must not decrement any XID count in unlog().
+ */
+ cache_mngr->need_unlog= 0;
+ }
+ cache_mngr->reset(using_stmt, using_trx); /* reset exactly the caches that were flushed */
+
+ DBUG_ASSERT(!using_stmt || cache_mngr->stmt_cache.empty());
+ DBUG_ASSERT(!using_trx || cache_mngr->trx_cache.empty());
+ DBUG_RETURN(error);
+}
+
+
+/**
+  This function flushes the stmt-cache upon commit.
+
+  The cache is terminated with a COMMIT query event. Under WSREP nothing is
+  flushed while thd->wsrep_mysql_replicated is positive.
+
+  @param thd                The thread whose transaction should be flushed
+  @param all                Passed through to binlog_flush_cache()
+  @param cache_mngr         Pointer to the cache manager
+
+  @return
+    nonzero if an error pops up when flushing the cache.
+*/
+static inline int
+binlog_commit_flush_stmt_cache(THD *thd, bool all,
+                               binlog_cache_mngr *cache_mngr)
+{
+  DBUG_ENTER("binlog_commit_flush_stmt_cache");
+#ifdef WITH_WSREP
+  if (thd->wsrep_mysql_replicated > 0)
+  {
+    DBUG_ASSERT(WSREP(thd));
+    WSREP_DEBUG("avoiding binlog_commit_flush_stmt_cache: %d",
+                thd->wsrep_mysql_replicated);
+    /* leave through DBUG_RETURN so that the DBUG_ENTER above is paired */
+    DBUG_RETURN(0);
+  }
+#endif
+
+  Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
+                          FALSE, TRUE, TRUE, 0);
+  DBUG_RETURN(binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, FALSE));
+}
+
+
+/**
+  Build "<query><serialized xid>" in buf.
+
+  Copies q_len bytes of query into buf and lets the XID serialize itself
+  right behind them. The caller must provide a buffer large enough for both
+  parts.
+
+  @return length of the resulting string (query part plus serialized xid)
+*/
+inline size_t serialize_with_xid(XID *xid, char *buf,
+                                 const char *query, size_t q_len)
+{
+  memcpy(buf, query, q_len);
+
+  char *xid_start= buf + q_len;
+  const char *xid_text= static_cast<event_xid_t*>(xid)->serialize(xid_start);
+
+  return q_len + strlen(xid_text);
+}
+
+
+/**
+  This function flushes the trx-cache upon commit.
+
+  The cache is terminated with a COMMIT query event, or with
+  "XA COMMIT <xid>" when the statement is XA COMMIT without ONE PHASE.
+
+  @param thd                The thread whose transaction should be flushed
+  @param all                Passed through to binlog_flush_cache()
+  @param cache_mngr         Pointer to the cache manager
+  @param ro_1pc             Passed through to binlog_flush_cache()
+
+  @return
+    nonzero if an error pops up when flushing the cache.
+*/
+static inline int
+binlog_commit_flush_trx_cache(THD *thd, bool all, binlog_cache_mngr *cache_mngr,
+                              bool ro_1pc)
+{
+  DBUG_ENTER("binlog_commit_flush_trx_cache");
+
+  const char xa_prefix[]= "XA COMMIT ";
+  const size_t xa_prefix_len= sizeof(xa_prefix) - 1; // without the trailing 0
+  char text[xa_prefix_len + ser_buf_size]= "COMMIT";
+  size_t text_len= sizeof("COMMIT") - 1;
+
+  const bool two_phase_xa_commit= thd->lex->sql_command == SQLCOM_XA_COMMIT &&
+                                  thd->lex->xa_opt != XA_ONE_PHASE;
+  if (two_phase_xa_commit)
+  {
+    DBUG_ASSERT(thd->transaction->xid_state.is_explicit_XA());
+    DBUG_ASSERT(thd->transaction->xid_state.get_state_code() ==
+                XA_PREPARED);
+
+    text_len= serialize_with_xid(thd->transaction->xid_state.get_xid(),
+                                 text, xa_prefix, xa_prefix_len);
+  }
+
+  Query_log_event commit_ev(thd, text, text_len, TRUE, TRUE, TRUE, 0);
+
+  DBUG_RETURN(binlog_flush_cache(thd, cache_mngr, &commit_ev, all, FALSE, TRUE, ro_1pc));
+}
+
+
+/**
+  This function flushes the trx-cache upon rollback.
+
+  The cache is terminated with a ROLLBACK query event, or with
+  "XA ROLLBACK <xid>" when an explicit XA transaction in the prepared state
+  is rolled back.
+
+  @param thd                The thread whose transaction should be flushed
+  @param all                Passed through to binlog_flush_cache()
+  @param cache_mngr         Pointer to the cache manager
+
+  @return
+    nonzero if an error pops up when flushing the cache.
+*/
+static inline int
+binlog_rollback_flush_trx_cache(THD *thd, bool all,
+                                binlog_cache_mngr *cache_mngr)
+{
+  const char xa_prefix[]= "XA ROLLBACK ";
+  const size_t xa_prefix_len= sizeof(xa_prefix) - 1; // without the trailing 0
+  char text[xa_prefix_len + ser_buf_size]= "ROLLBACK";
+  size_t text_len= sizeof("ROLLBACK") - 1;
+
+  /* a not yet prepared XA transaction is rolled back with plain ROLLBACK */
+  if (thd->transaction->xid_state.is_explicit_XA() &&
+      thd->transaction->xid_state.get_state_code() == XA_PREPARED)
+  {
+    text_len= serialize_with_xid(thd->transaction->xid_state.get_xid(),
+                                 text, xa_prefix, xa_prefix_len);
+  }
+
+  Query_log_event rollback_ev(thd, text, text_len, TRUE, TRUE, TRUE, 0);
+
+  return binlog_flush_cache(thd, cache_mngr, &rollback_ev, all, FALSE, TRUE);
+}
+
+/**
+  This function flushes both the stmt-cache and the trx-cache upon commit,
+  terminating them with an XID event.
+
+  @param thd                The thread whose transaction should be flushed
+  @param cache_mngr         Pointer to the cache manager
+  @param all                Passed through to binlog_flush_cache()
+  @param xid                Transaction Id (must be nonzero)
+
+  @return
+    nonzero if an error pops up when flushing the cache.
+*/
+static inline int
+binlog_commit_flush_xid_caches(THD *thd, binlog_cache_mngr *cache_mngr,
+                               bool all, my_xid xid)
+{
+  DBUG_ASSERT(xid); // replaced former treatment of ONE-PHASE XA
+
+  Xid_log_event xid_ev(thd, xid, TRUE);
+  const int rc= binlog_flush_cache(thd, cache_mngr, &xid_ev, all, TRUE, TRUE);
+  return rc;
+}
+
+/**
+ This function truncates the transactional cache upon committing or rolling
+ back either a transaction or a statement.
+
+ Any pending rows event of the transactional cache is removed first. When
+ the transaction ends, the cache is reset (after writing an incident event
+ if the cache has one); otherwise the cache is restored to its previous
+ position.
+
+ @param thd The thread whose transaction should be flushed
+ @param cache_mngr Pointer to the cache data to be flushed
+ @param all @c true means truncate the transaction, otherwise the
+ statement must be truncated.
+
+ @return
+ nonzero if an error pops up when truncating the transactional cache.
+*/
+static int
+binlog_truncate_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
+{
+ DBUG_ENTER("binlog_truncate_trx_cache");
+ int error=0;
+ /*
+ This function handles transactional changes and as such this flag
+ equals to true.
+ */
+ bool const is_transactional= TRUE;
+
+ DBUG_PRINT("info", ("thd->options={ %s %s}, transaction: %s",
+ FLAGSTR(thd->variables.option_bits, OPTION_NOT_AUTOCOMMIT),
+ FLAGSTR(thd->variables.option_bits, OPTION_BEGIN),
+ all ? "all" : "stmt"));
+
+ thd->binlog_remove_pending_rows_event(TRUE, is_transactional);
+ /*
+ If rolling back an entire transaction or a single statement not
+ inside a transaction, we reset the transaction cache.
+ */
+ if (ending_trans(thd, all))
+ {
+ if (cache_mngr->trx_cache.has_incident())
+ error= mysql_bin_log.write_incident(thd); /* the only source of a nonzero return value */
+
+ thd->reset_binlog_for_next_statement();
+
+ cache_mngr->reset(false, true); /* reset the trx cache only */
+ }
+ /*
+ If rolling back a statement in a transaction, we truncate the
+ transaction cache to remove the statement.
+ */
+ else
+ cache_mngr->trx_cache.restore_prev_position();
+
+ DBUG_ASSERT(thd->binlog_get_pending_rows_event(is_transactional) == NULL);
+ DBUG_RETURN(error);
+}
+
+
+/** @return true if thd is executing XA PREPARE in an explicit XA transaction */
+inline bool is_preparing_xa(THD *thd)
+{
+  if (!thd->transaction->xid_state.is_explicit_XA())
+    return false;
+
+  return thd->lex->sql_command == SQLCOM_XA_PREPARE;
+}
+
+
+/**
+  Handlerton prepare callback of the binlog.
+
+  Does nothing unless a user XA transaction is being prepared, in which
+  case the work is done by binlog_commit().
+*/
+static int binlog_prepare(handlerton *hton, THD *thd, bool all)
+{
+  if (!is_preparing_xa(thd))
+    return 0;
+
+  return binlog_commit(thd, all, FALSE);
+}
+
+/**
+ Write the commit record of a recovered user XA transaction to the binlog.
+
+ When binlogging is disabled for the current statement, only
+ wait_for_prior_commit() is performed. Otherwise a temporary binlog
+ transaction is registered, binlog_commit() is called for it, and the
+ registration is reset again.
+
+ The xid argument is not used in this function.
+
+ NOTE(review): registration uses hton->slot while set_trx_read_write() and
+ reset() use binlog_hton->slot; presumably hton == binlog_hton here.
+
+ @return result of wait_for_prior_commit() or of binlog_commit()
+*/
+int binlog_commit_by_xid(handlerton *hton, XID *xid)
+{
+ int rc= 0;
+ THD *thd= current_thd;
+
+ if (thd->is_current_stmt_binlog_disabled())
+ {
+ return thd->wait_for_prior_commit();
+ }
+
+ /* the asserted state can't be reachable with xa commit */
+ DBUG_ASSERT(!thd->get_stmt_da()->is_error() ||
+ thd->get_stmt_da()->sql_errno() != ER_XA_RBROLLBACK);
+ /*
+ This is a recovered user xa transaction commit.
+ Create a "temporary" binlog transaction to write the commit record
+ into binlog.
+ */
+ THD_TRANS trans;
+ trans.ha_list= NULL;
+
+ thd->ha_data[hton->slot].ha_info[1].register_ha(&trans, hton);
+ thd->ha_data[binlog_hton->slot].ha_info[1].set_trx_read_write();
+ (void) thd->binlog_setup_trx_data();
+
+ DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_COMMIT);
+
+ rc= binlog_commit(thd, TRUE, FALSE);
+ thd->ha_data[binlog_hton->slot].ha_info[1].reset(); /* drop the temporary registration */
+
+ return rc;
+}
+
+/**
+ Write the rollback record of a user XA transaction to the binlog.
+
+ When binlogging is disabled for the current statement, only
+ wait_for_prior_commit() is performed. Nothing is written (0 is returned)
+ if the statement already failed with ER_XA_RBROLLBACK. Otherwise a
+ temporary binlog transaction is registered, binlog_rollback() is called
+ for it, and the registration is reset again.
+
+ The xid argument is not used in this function.
+
+ @return result of wait_for_prior_commit() or of binlog_rollback(), or 0
+*/
+int binlog_rollback_by_xid(handlerton *hton, XID *xid)
+{
+ int rc= 0;
+ THD *thd= current_thd;
+
+ if (thd->is_current_stmt_binlog_disabled())
+ {
+ return thd->wait_for_prior_commit();
+ }
+
+ if (thd->get_stmt_da()->is_error() &&
+ thd->get_stmt_da()->sql_errno() == ER_XA_RBROLLBACK)
+ return rc;
+
+ THD_TRANS trans;
+ trans.ha_list= NULL;
+
+ thd->ha_data[hton->slot].ha_info[1].register_ha(&trans, hton);
+ thd->ha_data[hton->slot].ha_info[1].set_trx_read_write();
+ (void) thd->binlog_setup_trx_data();
+
+ DBUG_ASSERT(thd->lex->sql_command == SQLCOM_XA_ROLLBACK ||
+ (thd->transaction->xid_state.get_state_code() == XA_ROLLBACK_ONLY));
+
+ rc= binlog_rollback(hton, thd, TRUE);
+ thd->ha_data[hton->slot].ha_info[1].reset(); /* drop the temporary registration */
+
+ return rc;
+}
+
+
+/** @return true if thd has an explicit XA transaction in the prepared state */
+inline bool is_prepared_xa(THD *thd)
+{
+  if (!thd->transaction->xid_state.is_explicit_XA())
+    return false;
+
+  return thd->transaction->xid_state.get_state_code() == XA_PREPARED;
+}
+
+
+/*
+  We flush the cache wrapped in a begin/rollback if:
+    . aborting a single or multi-statement transaction and;
+    . the OPTION_BINLOG_THIS_TRX is active or;
+    . the format is STMT and a non-trans table was updated or;
+    . the format is MIXED and a temporary non-trans table was
+      updated or;
+    . the format is MIXED, non-trans table was updated and
+      aborting a single statement transaction or;
+    . the transaction is an explicit XA transaction in prepared state;
+*/
+static bool trans_cannot_safely_rollback(THD *thd, bool all)
+{
+  DBUG_ASSERT(ending_trans(thd, all));
+  ulong binlog_format= thd->wsrep_binlog_format(thd->variables.binlog_format);
+
+  if (thd->variables.option_bits & OPTION_BINLOG_THIS_TRX)
+    return true;
+
+  if (trans_has_updated_non_trans_table(thd) &&
+      binlog_format == BINLOG_FORMAT_STMT)
+    return true;
+
+  if (thd->transaction->all.has_modified_non_trans_temp_table() &&
+      binlog_format == BINLOG_FORMAT_MIXED)
+    return true;
+
+  if (trans_has_updated_non_trans_table(thd) &&
+      ending_single_stmt_trans(thd,all) &&
+      binlog_format == BINLOG_FORMAT_MIXED)
+    return true;
+
+  return is_prepared_xa(thd);
+}
+
+
+/**
+  Specific log flusher invoked through log_xa_prepare().
+
+  Writes an "XA END <xid>" query event into the transaction cache and then
+  flushes both caches terminated by an XA_prepare_log_event.
+
+  @param thd         The thread that prepares the XA transaction
+  @param all         Passed through to binlog_flush_cache()
+  @param cache_mngr  Pointer to the cache manager
+
+  @return
+    nonzero if writing the XA END event or flushing the caches failed.
+*/
+static int binlog_commit_flush_xa_prepare(THD *thd, bool all,
+                                          binlog_cache_mngr *cache_mngr)
+{
+  XID *xid= thd->transaction->xid_state.get_xid();
+  {
+    // todo assert wsrep_simulate || is_open()
+
+    /*
+      Log the XA END event first.
+      We don't do that in trans_xa_end() as XA COMMIT ONE PHASE
+      is logged as simple BEGIN/COMMIT so the XA END should
+      not get to the log.
+    */
+    const char query[]= "XA END ";
+    const size_t q_len= sizeof(query) - 1; // do not count trailing 0
+    char buf[q_len + ser_buf_size];
+    const size_t buflen= serialize_with_xid(xid, buf, query, q_len);
+    binlog_cache_data *cache_data= cache_mngr->get_binlog_cache_data(true);
+    IO_CACHE *file= &cache_data->cache_log;
+
+    /* The event is created while the statement poses as XA END ... */
+    thd->lex->sql_command= SQLCOM_XA_END;
+    Query_log_event xa_end(thd, buf, buflen, true, false, true, 0);
+    const bool write_failed=
+      mysql_bin_log.write_event(&xa_end, cache_data, file);
+    /*
+      ... and the real command is put back on every path, so that a failed
+      write does not leave the statement marked as XA END.
+    */
+    thd->lex->sql_command= SQLCOM_XA_PREPARE;
+    if (write_failed)
+      return 1;
+  }
+
+  cache_mngr->using_xa= FALSE;
+  XA_prepare_log_event end_evt(thd, xid, FALSE);
+
+  return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, TRUE));
+}
+
+/**
+ This function is called once after each statement.
+
+ It has the responsibility to flush the caches to the binary log on commits.
+
+ @param thd The client thread that executes the transaction.
+ @param all This is @c true if this is a real transaction commit, and
+ @c false otherwise.
+ @param ro_1pc read-only one-phase commit transaction
+
+ @return 0 on success (also when there is no cache manager or
+ OPTION_BIN_COMMIT_OFF is set), nonzero if flushing a cache or
+ unlog() reported an error.
+*/
+int binlog_commit(THD *thd, bool all, bool ro_1pc)
+{
+ int error= 0;
+ PSI_stage_info org_stage;
+ DBUG_ENTER("binlog_commit");
+
+ binlog_cache_mngr *const cache_mngr=
+ (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+ if (!cache_mngr) /* nothing was cached for this connection */
+ {
+ DBUG_ASSERT(WSREP(thd) ||
+ (thd->lex->sql_command != SQLCOM_XA_PREPARE &&
+ !(thd->lex->sql_command == SQLCOM_XA_COMMIT &&
+ thd->lex->xa_opt == XA_ONE_PHASE)));
+
+ DBUG_RETURN(0);
+ }
+ /*
+ This is true if we are doing an alter table that is replicated as
+ CREATE TABLE ... SELECT
+ */
+ if (thd->variables.option_bits & OPTION_BIN_COMMIT_OFF)
+ DBUG_RETURN(0);
+
+ DBUG_PRINT("debug",
+ ("all: %d, in_transaction: %s, all.modified_non_trans_table: %s, stmt.modified_non_trans_table: %s",
+ all,
+ YESNO(thd->in_multi_stmt_transaction_mode()),
+ YESNO(thd->transaction->all.modified_non_trans_table),
+ YESNO(thd->transaction->stmt.modified_non_trans_table)));
+
+ thd->backup_stage(&org_stage); /* restored via THD_STAGE_INFO(thd, org_stage) before returning */
+ THD_STAGE_INFO(thd, stage_binlog_write);
+#ifdef WITH_WSREP
+ // DON'T clear stmt cache in case we are in transaction
+ if (!cache_mngr->stmt_cache.empty() &&
+ (!wsrep_on(thd) || ending_trans(thd, all)))
+#else
+ if (!cache_mngr->stmt_cache.empty())
+#endif
+ {
+ error= binlog_commit_flush_stmt_cache(thd, all, cache_mngr);
+ }
+
+ if (cache_mngr->trx_cache.empty() &&
+ (thd->transaction->xid_state.get_state_code() != XA_PREPARED ||
+ !(thd->ha_data[binlog_hton->slot].ha_info[1].is_started() &&
+ thd->ha_data[binlog_hton->slot].ha_info[1].is_trx_read_write())))
+ {
+ /*
+ This is an empty transaction commit (both the regular and xa),
+ or such transaction xa-prepare or
+ either one's statement having no effect on the transactional cache
+ as any prior to it.
+ The empty xa-prepare sinks in only when binlog is read-only.
+ */
+ cache_mngr->reset(false, true);
+ THD_STAGE_INFO(thd, org_stage);
+ DBUG_RETURN(error);
+ }
+
+ /*
+ We commit the transaction if:
+ - We are not in a transaction and committing a statement, or
+ - We are in a transaction and a full transaction is committed.
+ Otherwise, we accumulate the changes.
+ */
+ if (likely(!error) && ending_trans(thd, all))
+ {
+ bool is_xa_prepare= is_preparing_xa(thd);
+
+ error= is_xa_prepare ?
+ binlog_commit_flush_xa_prepare(thd, all, cache_mngr) :
+ binlog_commit_flush_trx_cache (thd, all, cache_mngr, ro_1pc);
+ // the user xa is unlogged on common exec path with the "empty" xa case
+ if (cache_mngr->need_unlog && !is_xa_prepare)
+ {
+ error= /* note: replaces the result of the flush above */
+ mysql_bin_log.unlog(BINLOG_COOKIE_MAKE(cache_mngr->binlog_id,
+ cache_mngr->delayed_error), 1);
+ cache_mngr->need_unlog= false;
+ }
+ }
+ /*
+ This is part of the stmt rollback.
+ */
+ if (!all)
+ cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);
+
+ THD_STAGE_INFO(thd, org_stage);
+ DBUG_RETURN(error);
+}
+
+/**
+ This function is called when a transaction or a statement is rolled back.
+
+ The statement cache is handled first (an incident is written, or the cache
+ is flushed), then the transaction cache is either flushed to the binary
+ log or truncated, depending on the checks made in the body.
+
+ @param hton The binlog handlerton. Not referenced in the body; the global
+ binlog_hton is used for the cache manager lookup.
+ @param thd The client thread that executes the transaction.
+ @param all This is @c true if this is a real transaction rollback, and
+ @c false otherwise.
+
+ @return 0 if nothing failed (also when the thread has no binlog cache
+ manager), otherwise the nonzero result collected from the
+ incident write, cache flush or cache truncation calls.
+
+ @see handlerton::rollback
+*/
+static int binlog_rollback(handlerton *hton, THD *thd, bool all)
+{
+ DBUG_ENTER("binlog_rollback");
+
+ int error= 0;
+ binlog_cache_mngr *const cache_mngr=
+ (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+ if (!cache_mngr)
+ {
+ DBUG_ASSERT(WSREP(thd));
+ DBUG_ASSERT(thd->lex->sql_command != SQLCOM_XA_ROLLBACK);
+
+ DBUG_RETURN(0); /* no cache manager attached to this thread: nothing to do */
+ }
+
+ DBUG_PRINT("debug", ("all: %s, all.modified_non_trans_table: %s, stmt.modified_non_trans_table: %s",
+ YESNO(all),
+ YESNO(thd->transaction->all.modified_non_trans_table),
+ YESNO(thd->transaction->stmt.modified_non_trans_table)));
+
+ /*
+ If an incident event is set we do not flush the content of the statement
+ cache because it may be corrupted.
+ Instead an incident is written to the binary log and the cache manager is
+ reset. Without an incident a non-empty statement cache is flushed.
+ */
+ if (cache_mngr->stmt_cache.has_incident())
+ {
+ error |= static_cast<int>(mysql_bin_log.write_incident(thd));
+ cache_mngr->reset(true, false); /* presumably (stmt cache, trx cache) -- TODO confirm */
+ }
+ else if (!cache_mngr->stmt_cache.empty())
+ {
+ error |= binlog_commit_flush_stmt_cache(thd, all, cache_mngr);
+ }
+
+ if (!cache_mngr->trx_cache.has_incident() && cache_mngr->trx_cache.empty() &&
+ (thd->transaction->xid_state.get_state_code() != XA_PREPARED ||
+ !(thd->ha_data[binlog_hton->slot].ha_info[1].is_started() &&
+ thd->ha_data[binlog_hton->slot].ha_info[1].is_trx_read_write())))
+ {
+ /*
+ The same comments apply as in the binlog commit method's branch.
+ Nothing is pending in the transaction cache, so only the bookkeeping
+ is reset and the error collected so far is returned.
+ */
+ cache_mngr->reset(false, true);
+ thd->reset_binlog_for_next_statement();
+ DBUG_RETURN(error);
+ }
+ if (!wsrep_emulate_bin_log && mysql_bin_log.check_write_error(thd))
+ {
+ /*
+ "all == true" means that a "rollback statement" triggered the error and
+ this function was called. However, this must not happen as a rollback
+ is written directly to the binary log. And in auto-commit mode, a single
+ statement that is rolled back has the flag all == false.
+ */
+ DBUG_ASSERT(!all);
+ /*
+ We reach this point if the effect of a statement did not properly get into
+ a cache and need to be rolled back.
+ */
+ error |= binlog_truncate_trx_cache(thd, cache_mngr, all);
+ }
+ else if (likely(!error)) /* skipped entirely if the statement cache step failed */
+ {
+ ulong binlog_format= thd->wsrep_binlog_format(thd->variables.binlog_format);
+ if (ending_trans(thd, all) && trans_cannot_safely_rollback(thd, all))
+ error= binlog_rollback_flush_trx_cache(thd, all, cache_mngr);
+ /*
+ Truncate the cache if:
+ . aborting a single or multi-statement transaction or;
+ . the current statement created or dropped a temporary table
+ while having actual STATEMENT format;
+ . the format is not STMT or no non-trans table was
+ updated and;
+ . the format is not MIXED or no temporary non-trans table
+ was updated.
+ If neither this branch nor the flush above applies, the cache content
+ is left as it is.
+ */
+ else if (ending_trans(thd, all) ||
+ (!(thd->transaction->stmt.has_created_dropped_temp_table() &&
+ !thd->is_current_stmt_binlog_format_row()) &&
+ (!stmt_has_updated_non_trans_table(thd) ||
+ binlog_format != BINLOG_FORMAT_STMT) &&
+ (!thd->transaction->stmt.has_modified_non_trans_temp_table() ||
+ binlog_format != BINLOG_FORMAT_MIXED)))
+ error= binlog_truncate_trx_cache(thd, cache_mngr, all);
+ }
+
+ /*
+ This is part of the stmt rollback.
+ The remembered previous position of the transaction cache is cleared
+ (set to MY_OFF_T_UNDEF) only for a statement-level call.
+ */
+ if (!all)
+ cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);
+ thd->reset_binlog_for_next_statement();
+
+ DBUG_RETURN(error);
+}
+
+
+/**
+  Drop everything the thread has buffered for the binary log.
+
+  Does nothing when opt_bin_log is off or when the thread has no binlog
+  cache manager attached. Otherwise the pending rows event is removed and
+  the cache manager is reset with both flags set.
+
+  @param thd  Thread whose binlog caches are reset.
+*/
+void binlog_reset_cache(THD *thd)
+{
+  DBUG_ENTER("binlog_reset_cache");
+
+  binlog_cache_mngr *mngr= 0;
+  if (opt_bin_log)
+    mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+  if (!mngr)
+    DBUG_VOID_RETURN;
+
+  thd->binlog_remove_pending_rows_event(TRUE, TRUE);
+  mngr->reset(true, true);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Record a binlog write failure and raise the matching client error.
+
+  write_error is always set. If check_write_error() reports that the
+  statement already carries one of the binlog write errors, nothing else is
+  done. Otherwise a cache-full error is raised when my_errno is EFBIG and
+  ER_ERROR_ON_WRITE in every other case.
+
+  @param thd               Thread that receives the error.
+  @param is_transactional  Selects ER_TRANS_CACHE_FULL (true) or
+                           ER_STMT_CACHE_FULL (false) for EFBIG. Under
+                           WSREP binlog emulation it also decides whether
+                           trans_register_ha() is additionally called
+                           with TRUE.
+*/
+void MYSQL_BIN_LOG::set_write_error(THD *thd, bool is_transactional)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::set_write_error");
+
+  write_error= 1;
+
+  const bool already_reported= check_write_error(thd);
+  if (unlikely(already_reported))
+    DBUG_VOID_RETURN;
+
+  if (my_errno != EFBIG)
+    my_error(ER_ERROR_ON_WRITE, MYF(0), name, errno);
+  else if (is_transactional)
+    my_message(ER_TRANS_CACHE_FULL, ER_THD(thd, ER_TRANS_CACHE_FULL), MYF(0));
+  else
+    my_message(ER_STMT_CACHE_FULL, ER_THD(thd, ER_STMT_CACHE_FULL), MYF(0));
+
+#ifdef WITH_WSREP
+  /*
+    With an active wsrep transaction and binlog emulation, a binlog write
+    error may leave the transaction without any registered handlerton.
+    The wsrep rollback hooks would then be skipped and the transaction
+    would stay alive on the wsrep side after rollback. Registering the
+    binlog handlerton here makes sure the rollback runs in full.
+  */
+  if (WSREP_EMULATE_BINLOG(thd))
+  {
+    if (is_transactional)
+      trans_register_ha(thd, TRUE, binlog_hton, 0);
+    trans_register_ha(thd, FALSE, binlog_hton, 0);
+  }
+#endif /* WITH_WSREP */
+  DBUG_VOID_RETURN;
+}
+
+/**
+  Tell whether the statement's current error is a binlog write error.
+
+  @param thd  Thread whose diagnostics area is inspected.
+
+  @return TRUE if thd is in error state and the error number is one of
+          ER_TRANS_CACHE_FULL, ER_STMT_CACHE_FULL, ER_ERROR_ON_WRITE or
+          ER_BINLOG_LOGGING_IMPOSSIBLE; FALSE otherwise.
+*/
+bool MYSQL_BIN_LOG::check_write_error(THD *thd)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::check_write_error");
+
+  if (likely(!thd->is_error()))
+    DBUG_RETURN(FALSE);
+
+  switch (thd->get_stmt_da()->sql_errno())
+  {
+  case ER_TRANS_CACHE_FULL:
+  case ER_STMT_CACHE_FULL:
+  case ER_ERROR_ON_WRITE:
+  case ER_BINLOG_LOGGING_IMPOSSIBLE:
+    DBUG_RETURN(TRUE);
+  default:
+    DBUG_RETURN(FALSE);
+  }
+}
+
+
+/**
+ @note
+ How do we handle this (unlikely but legal) case:
+ @verbatim
+ [transaction] + [update to non-trans table] + [rollback to savepoint] ?
+ @endverbatim
+ The problem occurs when a savepoint is before the update to the
+ non-transactional table. Then when there's a rollback to the savepoint, if we
+ simply truncate the binlog cache, we lose the part of the binlog cache where
+ the update is. If we want to not lose it, we need to write the SAVEPOINT
+ command and the ROLLBACK TO SAVEPOINT command to the binlog cache. The latter
+ is easy: it is simply written at the end of the binlog cache, but the former
+ should be *inserted* to the place where the user called SAVEPOINT. The
+ solution is that when the user calls SAVEPOINT, we write it to the binlog
+ cache (so no need to later insert it). As transactions are never intermixed
+ in the binary log (i.e. they are serialized), we won't have conflicts with
+ savepoint names when using mysqlbinlog or in the slave SQL thread.
+ Then when ROLLBACK TO SAVEPOINT is called, if we updated some
+ non-transactional table, we don't truncate the binlog cache but instead write
+ ROLLBACK TO SAVEPOINT to it; otherwise we truncate the binlog cache (which
+ will chop the SAVEPOINT command from the binlog cache, which is good as in
+ that case there is no need to have it in the binlog).
+*/
+
+/**
+  Log a "SAVEPOINT <name>" statement and remember the resulting position.
+
+  @param hton  Not referenced in the body.
+  @param thd   Thread issuing the savepoint; the name is thd->lex->ident.
+  @param sv    Storage for a my_off_t position, filled in by
+               binlog_trans_log_savepos() after a successful write.
+
+  @return 0 on success, 1 if the statement text could not be built,
+          otherwise the nonzero result of mysql_bin_log.write().
+*/
+static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv)
+{
+  DBUG_ENTER("binlog_savepoint_set");
+
+  char stmt_buf[1024];
+  String stmt(stmt_buf, sizeof(stmt_buf), &my_charset_bin);
+
+  if (stmt.copy(STRING_WITH_LEN("SAVEPOINT "), &my_charset_bin))
+    DBUG_RETURN(1);
+  if (append_identifier(thd, &stmt, &thd->lex->ident))
+    DBUG_RETURN(1);
+
+  int errcode= query_error_code(thd, thd->killed == NOT_KILLED);
+  Query_log_event qinfo(thd, stmt.c_ptr_safe(), stmt.length(),
+                        TRUE, FALSE, TRUE, errcode);
+
+  /*
+    The position is recorded only after the statement has been written.
+    Recording it before would let a rollback to this savepoint (call it
+    "S") cut "SAVEPOINT S" out of the cache, although further rollbacks to
+    "S" may still follow. The savepoint stays valid until the transaction
+    commits or the savepoint is released, so it has to be logged;
+    otherwise the binary log could contain "ROLLBACK TO S" or "RELEASE S"
+    without a preceding "SAVEPOINT S".
+  */
+  const int write_result= mysql_bin_log.write(&qinfo);
+  if (likely(!write_result))
+    binlog_trans_log_savepos(thd, (my_off_t*) sv);
+
+  DBUG_RETURN(write_result);
+}
+
+/**
+  Handle ROLLBACK TO SAVEPOINT for the binlog transaction cache.
+
+  Either "ROLLBACK TO <name>" is written to the binary log, or the
+  transaction cache is truncated back to the position saved in sv.
+
+  @param hton  Not referenced in the body.
+  @param thd   Thread executing the rollback; the name is thd->lex->ident.
+  @param sv    Points to the my_off_t position stored when the savepoint
+               was set.
+
+  @return 0 after truncation, 1 if the statement text could not be built,
+          otherwise the result of mysql_bin_log.write().
+*/
+static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv)
+{
+  DBUG_ENTER("binlog_savepoint_rollback");
+
+  /*
+    The statement has to be logged when a non-transactional table was
+    updated or OPTION_BINLOG_THIS_TRX is set. With WSREP, streaming
+    replication also requires it so that slaves can maintain SR
+    transactions.
+  */
+  bool must_log_statement;
+#ifdef WITH_WSREP
+  must_log_statement= thd->wsrep_trx().is_streaming() ||
+                      (trans_has_updated_non_trans_table(thd)) ||
+                      (thd->variables.option_bits & OPTION_BINLOG_THIS_TRX);
+#else
+  must_log_statement= trans_has_updated_non_trans_table(thd) ||
+                      (thd->variables.option_bits & OPTION_BINLOG_THIS_TRX);
+#endif /* WITH_WSREP */
+
+  if (likely(!must_log_statement))
+  {
+    /* Nothing must survive: cut the cache back to the SAVEPOINT position. */
+    binlog_trans_log_truncate(thd, *(my_off_t*)sv);
+
+    /*
+      Inside a stored function or trigger the pending row-based event has
+      to be finalized and the binlog state reset, so that the following
+      DMLs start from a clean state.
+    */
+    if (thd->in_sub_stmt)
+      thd->reset_binlog_for_next_statement();
+
+    DBUG_RETURN(0);
+  }
+
+  char stmt_buf[1024];
+  String stmt(stmt_buf, sizeof(stmt_buf), &my_charset_bin);
+  if (stmt.copy(STRING_WITH_LEN("ROLLBACK TO "), &my_charset_bin))
+    DBUG_RETURN(1);
+  if (append_identifier(thd, &stmt, &thd->lex->ident))
+    DBUG_RETURN(1);
+
+  int errcode= query_error_code(thd, thd->killed == NOT_KILLED);
+  Query_log_event qinfo(thd, stmt.ptr(), stmt.length(),
+                        TRUE, FALSE, TRUE, errcode);
+  DBUG_RETURN(mysql_bin_log.write(&qinfo));
+}
+
+
+/**
+  Tell whether the binlog state allows MDL locks to be released after a
+  rollback to savepoint.
+
+  @param hton  The binlog handlerton. Not referenced in the body.
+  @param thd   The client thread that executes the transaction.
+
+  @return true  - It is safe to release MDL locks.
+          false - It is not.
+*/
+static bool binlog_savepoint_rollback_can_release_mdl(handlerton *hton,
+                                                      THD *thd)
+{
+  DBUG_ENTER("binlog_savepoint_rollback_can_release_mdl");
+  /*
+    When the transaction can be rolled back safely, the rollback to
+    savepoint only truncates the binlog cache from the SAVEPOINT command
+    onwards, so MDL acquired after that command is no longer needed.
+  */
+  const bool rollback_is_unsafe= trans_cannot_safely_rollback(thd, true);
+  DBUG_RETURN(!rollback_is_unsafe);
+}
+
+
+/**
+  Read the first four bytes of a binary log and compare them with
+  BINLOG_MAGIC.
+
+  The cache must be positioned at offset 0 (asserted in debug builds).
+
+  @param log          Cache to read from.
+  @param[out] errmsg  Set to a static message when 1 is returned.
+
+  @return 0 if the magic matches, 1 on read error or mismatch.
+*/
+int check_binlog_magic(IO_CACHE* log, const char** errmsg)
+{
+  uchar header[4];
+  DBUG_ASSERT(my_b_tell(log) == 0);
+
+  if (!my_b_read(log, header, sizeof(header)))
+  {
+    if (!bcmp(header, BINLOG_MAGIC, sizeof(header)))
+      return 0;
+
+    *errmsg = "Binlog has bad magic number; It's not a binary log file that can be used by this version of MariaDB";
+    return 1;
+  }
+
+  *errmsg = "I/O error reading the header from the binary log";
+  sql_print_error("%s, errno=%d, io cache code=%d", *errmsg, my_errno,
+                  log->error);
+  return 1;
+}
+
+
+/**
+  Open a binary log file for reading and check its magic header.
+
+  @param log            Cache initialised as a READ_CACHE on the opened file.
+  @param log_file_name  Path of the file to open.
+  @param[out] errmsg    Set to a static message on failure.
+
+  @return The open file descriptor, or -1 on failure. If the file itself
+          had been opened, it is closed and the cache is ended before
+          returning -1.
+*/
+File open_binlog(IO_CACHE *log, const char *log_file_name, const char **errmsg)
+{
+  DBUG_ENTER("open_binlog");
+
+  File fd= mysql_file_open(key_file_binlog,
+                           log_file_name, O_RDONLY | O_BINARY | O_SHARE,
+                           MYF(MY_WME));
+  if (fd < 0)
+  {
+    sql_print_error("Failed to open log (file '%s', errno %d)",
+                    log_file_name, my_errno);
+    *errmsg = "Could not open log file";
+    DBUG_RETURN(-1);
+  }
+
+  if (init_io_cache_ext(log, fd, (size_t)binlog_file_cache_size, READ_CACHE,
+                        0, 0, MYF(MY_WME|MY_DONT_CHECK_FILESIZE),
+                        key_file_binlog_cache))
+  {
+    sql_print_error("Failed to create a cache on log (file '%s')",
+                    log_file_name);
+    *errmsg = "Could not open log file";
+  }
+  else if (!check_binlog_magic(log,errmsg))
+    DBUG_RETURN(fd);
+
+  /* Cache setup or magic check failed after the file was opened. */
+  mysql_file_close(fd, MYF(0));
+  end_io_cache(log);
+  DBUG_RETURN(-1);
+}
+
+#ifdef _WIN32
+static int eventSource = 0;
+
+/**
+  Register "MariaDB" as an event source for the Windows Application event
+  log.
+
+  Creates the registry key below
+  HKLM\SYSTEM\CurrentControlSet\Services\EventLog\Application and stores the
+  path of the running module as EventMessageFile together with the
+  supported event types. The work is attempted only once per process.
+
+  Registration is best effort: if the key cannot be created, or the module
+  path cannot be obtained, the corresponding step is skipped instead of
+  being run on an invalid key or an unset path buffer.
+*/
+static void setup_windows_event_source()
+{
+  HKEY hRegKey= NULL;
+  DWORD dwError= 0;
+  TCHAR szPath[MAX_PATH];
+  DWORD dwTypes;
+  DWORD dwPathLen;
+
+  if (eventSource)               // Ensure that we are only called once
+    return;
+  eventSource= 1;
+
+  // Create the event source registry key
+  dwError= RegCreateKey(HKEY_LOCAL_MACHINE,
+                        "SYSTEM\\CurrentControlSet\\Services\\EventLog\\Application\\MariaDB",
+                        &hRegKey);
+  if (dwError != ERROR_SUCCESS)
+    return;                      // No key was opened: nothing to set or close
+
+  /* Name of the PE module that contains the message resource */
+  dwPathLen= GetModuleFileName(NULL, szPath, MAX_PATH);
+
+  /*
+    Register EventMessageFile. A return of 0 means failure and a return of
+    MAX_PATH means the path was truncated; in both cases szPath is not a
+    usable string.
+  */
+  if (dwPathLen > 0 && dwPathLen < MAX_PATH)
+    (void) RegSetValueEx(hRegKey, "EventMessageFile", 0, REG_EXPAND_SZ,
+                         (PBYTE) szPath, (DWORD) (dwPathLen + 1));
+
+  /* Register supported event types */
+  dwTypes= (EVENTLOG_ERROR_TYPE | EVENTLOG_WARNING_TYPE |
+            EVENTLOG_INFORMATION_TYPE);
+  (void) RegSetValueEx(hRegKey, "TypesSupported", 0, REG_DWORD,
+                       (LPBYTE) &dwTypes, sizeof dwTypes);
+
+  RegCloseKey(hRegKey);
+}
+
+#endif /* _WIN32 */
+
+
+/**
+ Find a unique filename for 'filename.#'.
+
+ Set '#' to the number next to the maximum found in the most
+ recent log file extension.
+
+ Unless *last_used_log_number is given, the maximum is taken over the
+ directory entries whose name starts with the base name followed by '.'
+ and whose remainder is accepted by test_if_number().
+ On success the extension is written into name in place as '.' followed
+ by the number, zero-padded to at least six digits.
+
+ This function will return nonzero if: (i) the generated name
+ exceeds FN_REFLEN; (ii) if the number of extensions is exhausted;
+ or (iii) some other error happened while examining the filesystem.
+ In case (iii) name is given the extension ".1" before returning.
+
+ @param name Base name of file. Modified in place.
+ @param min_log_number_to_use minimum log number to choose. Set by
+ CHANGE MASTER .. TO
+ Only used when the directory is scanned.
+ @param last_used_log_number If 0, find log number based on files.
+ If not 0, then use *last_used_log_number +1
+ Will be updated to the newly generated number
+ @return
+ 0 ok
+ nonzero if not possible to get unique filename.
+*/
+
+static int find_uniq_filename(char *name, ulong min_log_number_to_use,
+ ulong *last_used_log_number)
+{
+ char buff[FN_REFLEN], ext_buf[FN_REFLEN];
+ struct st_my_dir *dir_info;
+ struct fileinfo *file_info;
+ ulong max_found= 0, next= 0, number= 0;
+ size_t i, buf_length, length;
+ char *start, *end;
+ int error= 0;
+ DBUG_ENTER("find_uniq_filename");
+
+ length= dirname_part(buff, name, &buf_length);
+ start= name + length; /* start of the file name part inside name */
+ end= strend(start);
+
+ *end='.'; /* overwrites the terminator; start is only compared by length below */
+ length= (size_t) (end - start + 1); /* base name length including the '.' */
+
+ /* The following matches the code for my_dir () below */
+ DBUG_EXECUTE_IF("error_unique_log_filename",
+ {
+ strmov(end,".1");
+ DBUG_RETURN(1);
+ });
+
+ if (*last_used_log_number)
+ max_found= *last_used_log_number;
+ else
+ {
+ if (unlikely(!(dir_info= my_dir(buff, MYF(MY_DONT_SORT)))))
+ { // This shouldn't happen
+ strmov(end,".1"); // use name+1
+ DBUG_RETURN(1);
+ }
+ file_info= dir_info->dir_entry;
+ max_found= min_log_number_to_use ? min_log_number_to_use-1 : 0;
+ for (i= dir_info->number_of_files ; i-- ; file_info++)
+ {
+ if (strncmp(file_info->name, start, length) == 0 &&
+ test_if_number(file_info->name+length, &number,0))
+ {
+ set_if_bigger(max_found, number);
+ }
+ }
+ my_dirend(dir_info);
+ }
+
+ /* check if reached the maximum possible extension number */
+ if (max_found >= MAX_LOG_UNIQUE_FN_EXT)
+ {
+ sql_print_error("Log filename extension number exhausted: %06lu. \
+Please fix this by archiving old logs and \
+updating the index files.", max_found);
+ error= 1;
+ goto end;
+ }
+
+ next= max_found + 1;
+ if (sprintf(ext_buf, "%06lu", next)<0) /* formatted into a scratch buffer first, to measure it */
+ {
+ error= 1;
+ goto end;
+ }
+ *end++='.'; /* end now points just past the '.' */
+
+ /*
+ Check if the generated extension size + the file name exceeds the
+ buffer size used. If one did not check this, then the filename might be
+ truncated, resulting in error.
+ */
+ if (((strlen(ext_buf) + (end - name)) >= FN_REFLEN))
+ {
+ sql_print_error("Log filename too large: %s%s (%zu). \
+Please fix this by archiving old logs and updating the \
+index files.", name, ext_buf, (strlen(ext_buf) + (end - name)));
+ error= 1;
+ goto end;
+ }
+
+ if (sprintf(end, "%06lu", next)<0)
+ {
+ error= 1;
+ goto end;
+ }
+ *last_used_log_number= next; /* updated only after the name was written */
+
+ /* print warning if reaching the end of available extensions. */
+ if ((next > (MAX_LOG_UNIQUE_FN_EXT - LOG_WARN_UNIQUE_FN_EXT_LEFT)))
+ sql_print_warning("Next log extension: %lu. \
+Remaining log filename extensions: %lu. \
+Please consider archiving some logs.", next, (MAX_LOG_UNIQUE_FN_EXT - next));
+
+end:
+ DBUG_RETURN(error);
+}
+
+
+/**
+  Store the log type and cache type and fill in log_file_name.
+
+  @param log_name           Base name handed to generate_new_name() when no
+                            explicit name is given.
+  @param new_name           Explicit file name; copied as is when not NULL.
+  @param next_log_number    Passed through to generate_new_name().
+  @param log_type_arg       Stored in log_type.
+  @param io_cache_type_arg  Stored in io_cache_type.
+
+  @return FALSE on success, TRUE if generate_new_name() failed.
+*/
+bool MYSQL_LOG::init_and_set_log_file_name(const char *log_name,
+                                           const char *new_name,
+                                           ulong next_log_number,
+                                           enum_log_type log_type_arg,
+                                           enum cache_type io_cache_type_arg)
+{
+  io_cache_type= io_cache_type_arg;
+  log_type= log_type_arg;
+
+  if (!new_name)
+  {
+    if (generate_new_name(log_file_name, log_name, next_log_number))
+      return TRUE;
+    return FALSE;
+  }
+
+  strmov(log_file_name, new_name);
+  return FALSE;
+}
+
+
+/*
+  Open a (new) log file.
+
+  SYNOPSIS
+    open()
+
+    log_file_key        Instrumentation key for the file; only part of the
+                        signature when HAVE_PSI_INTERFACE is defined. It is
+                        kept in m_log_file_key for a later reopen.
+    log_name            The name of the log to open
+    log_type_arg        The type of the log. E.g. LOG_NORMAL
+    new_name            The new name for the logfile. This is only needed
+                        when the method is used to open the binlog file.
+    next_log_number     Passed on to init_and_set_log_file_name()
+    io_cache_type_arg   The type of the IO_CACHE to use for this log file
+
+  DESCRIPTION
+    Open the logfile, init IO_CACHE and write startup messages
+    (in case of general and slow query logs).
+
+    On failure the error is reported with sql_print_error(), the file is
+    closed if it had been opened, the copy of the name is released and
+    log_state is set to LOG_CLOSED. The caller's log_name is never freed.
+
+  RETURN VALUES
+    0   ok
+    1   error
+*/
+
+bool MYSQL_LOG::open(
+#ifdef HAVE_PSI_INTERFACE
+                     PSI_file_key log_file_key,
+#endif
+                     const char *log_name, enum_log_type log_type_arg,
+                     const char *new_name, ulong next_log_number,
+                     enum cache_type io_cache_type_arg)
+{
+  char buff[FN_REFLEN];
+  MY_STAT f_stat;
+  File file= -1;
+  my_off_t seek_offset;
+  bool is_fifo = false;
+  int open_flags= O_CREAT | O_BINARY | O_CLOEXEC;
+  DBUG_ENTER("MYSQL_LOG::open");
+  DBUG_PRINT("enter", ("log_type: %d", (int) log_type_arg));
+
+  write_error= 0;
+
+  if (!(name= my_strdup(key_memory_MYSQL_LOG_name, log_name, MYF(MY_WME))))
+  {
+    /*
+      Borrow the caller's string for the error message only. The error
+      path below recognises this alias and does not free it.
+    */
+    name= (char *)log_name;
+    goto err;
+  }
+
+  /*
+    log_type is LOG_UNKNOWN if we should not generate a new name
+    This is only used when called from MYSQL_BINARY_LOG::open, which
+    has already updated log_file_name.
+  */
+  if (log_type_arg != LOG_UNKNOWN &&
+      init_and_set_log_file_name(name, new_name, next_log_number,
+                                 log_type_arg, io_cache_type_arg))
+    goto err;
+
+  is_fifo = my_stat(log_file_name, &f_stat, MYF(0)) &&
+            MY_S_ISFIFO(f_stat.st_mode);
+
+  if (io_cache_type == SEQ_READ_APPEND)
+    open_flags |= O_RDWR | O_APPEND;
+  else
+    open_flags |= O_WRONLY | (log_type == LOG_BIN ? 0 : O_APPEND);
+
+  /* Opening a FIFO must not block waiting for the other end. */
+  if (is_fifo)
+    open_flags |= O_NONBLOCK;
+
+  db[0]= 0;
+
+#ifdef HAVE_PSI_INTERFACE
+  /* Keep the key for reopen */
+  m_log_file_key= log_file_key;
+#endif
+
+  if ((file= mysql_file_open(log_file_key, log_file_name, open_flags,
+                             MYF(MY_WME))) < 0)
+    goto err;
+
+  /* Any nonzero position reported for the freshly opened file is an error. */
+  if (is_fifo)
+    seek_offset= 0;
+  else if ((seek_offset= mysql_file_tell(file, MYF(MY_WME))))
+    goto err;
+
+  if (init_io_cache(&log_file, file, (log_type == LOG_NORMAL ? IO_SIZE :
+                                      LOG_BIN_IO_SIZE),
+                    io_cache_type, seek_offset, 0,
+                    MYF(MY_WME | MY_NABP |
+                        ((log_type == LOG_BIN) ? MY_WAIT_IF_FULL : 0))))
+    goto err;
+
+  if (log_type == LOG_NORMAL)
+  {
+    /* Startup banner followed by the column header line. */
+    char *end;
+    size_t len=my_snprintf(buff, sizeof(buff), "%s, Version: %s (%s). "
+#ifdef EMBEDDED_LIBRARY
+                        "embedded library\n",
+                        my_progname, server_version, MYSQL_COMPILATION_COMMENT
+#elif defined(_WIN32)
+                        "started with:\nTCP Port: %d, Named Pipe: %s\n",
+                        my_progname, server_version, MYSQL_COMPILATION_COMMENT,
+                        mysqld_port, mysqld_unix_port
+#else
+                        "started with:\nTcp port: %d Unix socket: %s\n",
+                        my_progname, server_version, MYSQL_COMPILATION_COMMENT,
+                        mysqld_port, mysqld_unix_port
+#endif
+                       );
+    end= strnmov(buff + len, "Time\t\t Id Command\tArgument\n",
+                 sizeof(buff) - len);
+    if (my_b_write(&log_file, (uchar*) buff, (uint) (end-buff)) ||
+        flush_io_cache(&log_file))
+      goto err;
+  }
+
+  log_state= LOG_OPENED;
+  DBUG_RETURN(0);
+
+err:
+  sql_print_error(fatal_log_error, name, errno);
+  if (file >= 0)
+    mysql_file_close(file, MYF(0));
+  end_io_cache(&log_file);
+  /*
+    When my_strdup() failed, name points at the caller-owned log_name.
+    Freeing it here would release memory this function does not own (and
+    that a caller such as reopen_file() frees itself afterwards).
+  */
+  if (name != log_name)
+    my_free(name);
+  name= NULL;
+  log_state= LOG_CLOSED;
+  DBUG_RETURN(1);
+}
+
+/**
+  Construct a closed log object with no name and a zeroed IO_CACHE.
+*/
+MYSQL_LOG::MYSQL_LOG()
+  : name(0),
+    write_error(FALSE),
+    inited(FALSE),
+    log_type(LOG_UNKNOWN),
+    log_state(LOG_CLOSED)
+{
+  /*
+    LOCK_log is deliberately not initialized here. Its initialization
+    depends on safe_mutex (when that is used), which in turn depends on
+    MY_INIT(), and MY_INIT() is only called in main(). Initializing the
+    mutex in this constructor would therefore run before main().
+  */
+  bzero((char*) &log_file, sizeof(log_file));
+}
+
+/**
+  Initialize LOCK_log and mark the object as inited.
+
+  Must not be called on an object that is already inited (asserted in debug
+  builds).
+*/
+void MYSQL_LOG::init_pthread_objects()
+{
+  DBUG_ASSERT(inited == 0);
+  mysql_mutex_init(key_LOG_LOCK_log, &LOCK_log, MY_MUTEX_INIT_SLOW);
+  inited= 1;
+}
+
+/*
+  Close the log file
+
+  SYNOPSIS
+    close()
+    exiting     Bitmask. With LOG_CLOSE_TO_BE_OPENED the state becomes
+                LOG_TO_BE_OPENED instead of LOG_CLOSED; it is used when
+                open is called right after close. With
+                LOG_CLOSE_DELAYED_CLOSE the file descriptor is left open;
+                it is used for binlog rotation, to delay the actual close
+                of the old file until the new file has been created.
+
+  NOTES
+    One can do an open on the object at once after doing a close.
+    The internal structures are not freed until cleanup() is called.
+    The first sync or close failure sets write_error and is reported;
+    later failures are not reported while write_error is set.
+*/
+
+void MYSQL_LOG::close(uint exiting)
+{                                       // One can't set log_type here!
+  DBUG_ENTER("MYSQL_LOG::close");
+  DBUG_PRINT("enter",("exiting: %d", (int) exiting));
+
+  const bool keep_file_open= (exiting & LOG_CLOSE_DELAYED_CLOSE) != 0;
+
+  if (log_state == LOG_OPENED)
+  {
+    end_io_cache(&log_file);
+
+    /* Only the binary log is synced to disk before closing. */
+    if (log_type == LOG_BIN && mysql_file_sync(log_file.file, MYF(MY_WME)))
+    {
+      if (!write_error)
+      {
+        write_error= 1;
+        sql_print_error(ER_DEFAULT(ER_ERROR_ON_WRITE), name, errno);
+      }
+    }
+
+    if (!keep_file_open && mysql_file_close(log_file.file, MYF(MY_WME)))
+    {
+      if (!write_error)
+      {
+        write_error= 1;
+        sql_print_error(ER_DEFAULT(ER_ERROR_ON_WRITE), name, errno);
+      }
+    }
+  }
+
+  if (exiting & LOG_CLOSE_TO_BE_OPENED)
+    log_state= LOG_TO_BE_OPENED;
+  else
+    log_state= LOG_CLOSED;
+
+  my_free(name);
+  name= NULL;
+  DBUG_VOID_RETURN;
+}
+
+/**
+  Destroy LOCK_log and close the log. This is called only once.
+
+  Does nothing if the object was never inited.
+*/
+
+void MYSQL_LOG::cleanup()
+{
+  DBUG_ENTER("cleanup");
+  if (!inited)
+    DBUG_VOID_RETURN;
+
+  inited= 0;
+  mysql_mutex_destroy(&LOCK_log);
+  close(0);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Build the log file name from log_name and mysql_data_home.
+
+  @param[out] new_name    Receives the name produced by fn_format().
+  @param log_name         Name to format.
+  @param next_log_number  Not used by this implementation.
+
+  @return Always 0.
+*/
+int MYSQL_LOG::generate_new_name(char *new_name, const char *log_name,
+                                 ulong next_log_number)
+{
+  (void) fn_format(new_name, log_name, mysql_data_home, "", 4);
+  return 0;
+}
+
+/**
+  Build the binary log file name, adding a numeric extension when log_name
+  has no extension of its own.
+
+  @param[out] new_name    Receives the formatted name; find_uniq_filename()
+                          extends it in place when a number is needed.
+  @param log_name         Name to format.
+  @param next_log_number  Passed to find_uniq_filename() as the minimum log
+                          number to use.
+
+  @return 0 on success, 1 if no unique name could be generated. In that
+          case ER_NO_UNIQUE_LOGFILE is written to the error log, and also
+          raised with my_error() when there is a current THD.
+*/
+int MYSQL_BIN_LOG::generate_new_name(char *new_name, const char *log_name,
+                                     ulong next_log_number)
+{
+  fn_format(new_name, log_name, mysql_data_home, "", 4);
+
+  /* log_name already has an extension: no number is appended. */
+  if (fn_ext(log_name)[0])
+    return 0;
+
+  const bool no_unique_name=
+    DBUG_IF("binlog_inject_new_name_error") ||
+    unlikely(find_uniq_filename(new_name, next_log_number,
+                                &last_used_log_number));
+  if (!no_unique_name)
+    return 0;
+
+  if (unlikely(current_thd))
+    my_error(ER_NO_UNIQUE_LOGFILE, MYF(ME_FATAL), log_name);
+  sql_print_error(ER_DEFAULT(ER_NO_UNIQUE_LOGFILE), log_name);
+  return 1;
+}
+
+
+/*
+  Reopen the log file
+
+  SYNOPSIS
+    reopen_file()
+
+  DESCRIPTION
+    Close the log and open it again under the same name, log type and
+    cache type. The method is used during FLUSH LOGS and holds the
+    LOCK_log mutex for the whole operation. Nothing is done if the log is
+    not open. The result of open() is not checked here.
+*/
+
+
+void MYSQL_QUERY_LOG::reopen_file()
+{
+  DBUG_ENTER("MYSQL_LOG::reopen_file");
+
+  mysql_mutex_lock(&LOCK_log);
+  if (is_open())
+  {
+    /* Take the name over so that close() does not free it. */
+    char *kept_name= name;
+    name= 0;
+    close(LOG_CLOSE_TO_BE_OPENED);
+
+    /*
+      Note that at this point, log_state != LOG_CLOSED (important for
+      is_open()).
+    */
+
+    open(
+#ifdef HAVE_PSI_INTERFACE
+         m_log_file_key,
+#endif
+         kept_name, log_type, 0, 0, io_cache_type);
+    my_free(kept_name);
+  }
+  else
+  {
+    DBUG_PRINT("info",("log is closed"));
+  }
+  mysql_mutex_unlock(&LOCK_log);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Write a command to traditional general log file
+
+  SYNOPSIS
+    write()
+
+    event_time        command start timestamp
+    user_host         the pointer to the string with user@host info
+                      (not written by this handler)
+    user_host_len     length of the user_host string. this is computed once
+                      and passed to all general log event handlers
+    thread_id_arg     Id of the thread, issued a query
+    command_type      the type of the command being logged
+    command_type_len  the length of the string above
+    sql_text          the very text of the query being executed
+    sql_text_len      the length of sql_text string
+
+  DESCRIPTION
+
+    Log given command to normal (not rotatable) log file.
+    The line starts with the timestamp when it differs from the one of the
+    previously logged event, and with two tabs otherwise. LOCK_log is held
+    while writing. Only the first write error is reported to the error log.
+
+  RETURN
+    FALSE - OK (also when the log is not open and nothing was written)
+    TRUE - error occurred
+*/
+
+bool MYSQL_QUERY_LOG::write(time_t event_time, const char *user_host,
+                            size_t user_host_len, my_thread_id thread_id_arg,
+                            const char *command_type, size_t command_type_len,
+                            const char *sql_text, size_t sql_text_len)
+{
+  char buff[32];
+  char local_time_buff[MAX_TIME_SIZE];
+  struct tm start;
+  size_t time_buff_len= 0;
+
+  mysql_mutex_lock(&LOCK_log);
+
+  /* Test if someone closed between the is_open test and lock */
+  if (is_open())
+  {
+    /* for testing output of timestamp and thread id */
+    DBUG_EXECUTE_IF("reset_log_last_time", last_time= 0;);
+
+    /* Note that my_b_write() assumes it knows the length for this */
+    if (event_time != last_time)
+    {
+      last_time= event_time;
+
+      localtime_r(&event_time, &start);
+
+      time_buff_len= my_snprintf(local_time_buff, MAX_TIME_SIZE,
+                                 "%02d%02d%02d %2d:%02d:%02d\t",
+                                 start.tm_year % 100, start.tm_mon + 1,
+                                 start.tm_mday, start.tm_hour,
+                                 start.tm_min, start.tm_sec);
+
+      if (my_b_write(&log_file, (uchar*) local_time_buff, time_buff_len))
+        goto err;
+    }
+    else
+      /*
+        Any nonzero result is a failure, as for every other write below.
+        Testing for "< 0" here would let a failed write go unnoticed.
+      */
+      if (my_b_write(&log_file, (uchar*) "\t\t" ,2))
+        goto err;
+
+    /* command_type, thread_id */
+    size_t length= my_snprintf(buff, sizeof(buff), "%6llu ", thread_id_arg);
+
+    if (my_b_write(&log_file, (uchar*) buff, length))
+      goto err;
+
+    if (my_b_write(&log_file, (uchar*) command_type, command_type_len))
+      goto err;
+
+    if (my_b_write(&log_file, (uchar*) "\t", 1))
+      goto err;
+
+    /* sql_text */
+    if (my_b_write(&log_file, (uchar*) sql_text, sql_text_len))
+      goto err;
+
+    if (my_b_write(&log_file, (uchar*) "\n", 1) ||
+        flush_io_cache(&log_file))
+      goto err;
+  }
+
+  mysql_mutex_unlock(&LOCK_log);
+  return FALSE;
+err:
+
+  /* Report only the first failure; write_error stays set afterwards. */
+  if (!write_error)
+  {
+    write_error= 1;
+    sql_print_error(ER_DEFAULT(ER_ERROR_ON_WRITE), name, errno);
+  }
+  mysql_mutex_unlock(&LOCK_log);
+  return TRUE;
+}
+
+
+/*
+ Log a query to the traditional slow log file
+
+ SYNOPSIS
+ write()
+
+ thd THD of the query
+ current_time current timestamp
+ user_host the pointer to the string with user@host info
+ user_host_len length of the user_host string. this is computed once
+ and passed to all general log event handlers
+ query_utime Amount of time the query took to execute (in microseconds)
+ lock_utime Amount of time the query was locked (in microseconds)
+ is_command The flag, which determines, whether the sql_text is a
+ query or an administrator command.
+ sql_text the very text of the query or administrator command
+ processed
+ sql_text_len the length of sql_text string
+
+ DESCRIPTION
+
+ Log a query to the slow log file.
+ LOCK_log is held while writing. The header lines ("# Time", "# User@Host",
+ statistics), the optional verbosity sections, a "use" line when the
+ database changed, a "SET ...timestamp=" line and finally the statement
+ text are written in this order. Only the first write error is reported
+ to the error log.
+ NOTE(review): as the braces stand, everything including the statement
+ text is inside the !(specialflag & SPECIAL_SHORT_LOG_FORMAT) block, so
+ nothing is written when that flag is set -- confirm this is intended.
+
+ RETURN
+ FALSE - OK (also when the log is not open)
+ TRUE - error occurred
+*/
+
+bool MYSQL_QUERY_LOG::write(THD *thd, time_t current_time,
+ const char *user_host, size_t user_host_len,
+ ulonglong query_utime,
+ ulonglong lock_utime, bool is_command,
+ const char *sql_text, size_t sql_text_len)
+{
+ bool error= 0;
+ char llbuff[22];
+ DBUG_ENTER("MYSQL_QUERY_LOG::write");
+
+ mysql_mutex_lock(&LOCK_log);
+ if (is_open())
+ { // Safety against reopen
+ char buff[80], *end; /* NOTE(review): the SET clause built below can in the worst case need more than 80 bytes (two 20-character ids plus the timestamp) -- confirm bounds */
+ char query_time_buff[22+7], lock_time_buff[22+7];
+ size_t buff_len;
+ ulonglong log_slow_verbosity= thd->variables.log_slow_verbosity;
+ if (log_slow_verbosity & LOG_SLOW_VERBOSITY_FULL)
+ log_slow_verbosity= ~(ulonglong) 0; /* FULL switches every verbosity bit on */
+
+ end= buff; /* the SET clause is accumulated at end further down */
+
+ if (!(specialflag & SPECIAL_SHORT_LOG_FORMAT))
+ {
+ if (current_time != last_time)
+ {
+ last_time= current_time;
+ struct tm start;
+ localtime_r(&current_time, &start);
+
+ buff_len= my_snprintf(buff, sizeof buff,
+ "# Time: %02d%02d%02d %2d:%02d:%02d\n",
+ start.tm_year % 100, start.tm_mon + 1,
+ start.tm_mday, start.tm_hour,
+ start.tm_min, start.tm_sec);
+
+ /* Note that my_b_write() assumes it knows the length for this */
+ if (my_b_write(&log_file, (uchar*) buff, buff_len))
+ goto err;
+ }
+ const uchar uh[]= "# User@Host: ";
+ if (my_b_write(&log_file, uh, sizeof(uh) - 1) ||
+ my_b_write(&log_file, (uchar*) user_host, user_host_len) ||
+ my_b_write(&log_file, (uchar*) "\n", 1))
+ goto err;
+
+ sprintf(query_time_buff, "%.6f", ulonglong2double(query_utime)/1000000.0);
+ sprintf(lock_time_buff, "%.6f", ulonglong2double(lock_utime)/1000000.0);
+ if (my_b_printf(&log_file,
+ "# Thread_id: %lu Schema: %s QC_hit: %s\n"
+ "# Query_time: %s Lock_time: %s Rows_sent: %lu Rows_examined: %lu\n"
+ "# Rows_affected: %lu Bytes_sent: %lu\n",
+ (ulong) thd->thread_id, thd->get_db(),
+ ((thd->query_plan_flags & QPLAN_QC) ? "Yes" : "No"),
+ query_time_buff, lock_time_buff,
+ (ulong) thd->get_sent_row_count(),
+ (ulong) thd->get_examined_row_count(),
+ (ulong) thd->get_affected_rows(),
+ (ulong) (thd->status_var.bytes_sent - thd->bytes_sent_old)))
+ goto err;
+
+ if (unlikely(log_slow_verbosity &
+ LOG_SLOW_VERBOSITY_ENGINE) &&
+ thd->handler_stats.has_stats())
+ {
+ ha_handler_stats *stats= &thd->handler_stats;
+ double tracker_frequency= timer_tracker_frequency();
+ sprintf(query_time_buff, "%.4f", /* the two time buffers are reused for the engine timings */
+ 1000.0 * ulonglong2double(stats->pages_read_time)/
+ tracker_frequency);
+ sprintf(lock_time_buff, "%.4f",
+ 1000.0 * ulonglong2double(stats->engine_time)/
+ tracker_frequency);
+
+ if (my_b_printf(&log_file,
+ "# Pages_accessed: %lu Pages_read: %lu "
+ "Pages_updated: %lu Old_rows_read: %lu\n"
+ "# Pages_read_time: %s Engine_time: %s\n",
+ (ulong) stats->pages_accessed,
+ (ulong) stats->pages_read_count,
+ (ulong) stats->pages_updated,
+ (ulong) stats->undo_records_read,
+ query_time_buff, lock_time_buff))
+ goto err;
+ }
+
+ if ((log_slow_verbosity & LOG_SLOW_VERBOSITY_QUERY_PLAN) &&
+ thd->tmp_tables_used &&
+ my_b_printf(&log_file,
+ "# Tmp_tables: %lu Tmp_disk_tables: %lu "
+ "Tmp_table_sizes: %s\n",
+ (ulong) thd->tmp_tables_used,
+ (ulong) thd->tmp_tables_disk_used,
+ llstr(thd->tmp_tables_size, llbuff)))
+ goto err;
+
+ if (thd->spcont &&
+ my_b_printf(&log_file, "# Stored_routine: %s\n",
+ ErrConvDQName(thd->spcont->m_sp).ptr()))
+ goto err;
+
+ if ((log_slow_verbosity & LOG_SLOW_VERBOSITY_QUERY_PLAN) &&
+ (thd->query_plan_flags &
+ (QPLAN_FULL_SCAN | QPLAN_FULL_JOIN | QPLAN_TMP_TABLE |
+ QPLAN_TMP_DISK | QPLAN_FILESORT | QPLAN_FILESORT_DISK |
+ QPLAN_FILESORT_PRIORITY_QUEUE)) &&
+ my_b_printf(&log_file,
+ "# Full_scan: %s Full_join: %s "
+ "Tmp_table: %s Tmp_table_on_disk: %s\n"
+ "# Filesort: %s Filesort_on_disk: %s Merge_passes: %lu "
+ "Priority_queue: %s\n",
+ ((thd->query_plan_flags & QPLAN_FULL_SCAN) ? "Yes" : "No"),
+ ((thd->query_plan_flags & QPLAN_FULL_JOIN) ? "Yes" : "No"),
+ (thd->tmp_tables_used ? "Yes" : "No"),
+ (thd->tmp_tables_disk_used ? "Yes" : "No"),
+ ((thd->query_plan_flags & QPLAN_FILESORT) ? "Yes" : "No"),
+ ((thd->query_plan_flags & QPLAN_FILESORT_DISK) ?
+ "Yes" : "No"),
+ thd->query_plan_fsort_passes,
+ ((thd->query_plan_flags & QPLAN_FILESORT_PRIORITY_QUEUE) ?
+ "Yes" : "No")
+ ))
+ goto err;
+ if (log_slow_verbosity & LOG_SLOW_VERBOSITY_EXPLAIN && thd->lex->explain)
+ {
+ StringBuffer<128> buf;
+ DBUG_ASSERT(!thd->free_list);
+ if (!print_explain_for_slow_log(thd->lex, thd, &buf))
+ if (my_b_printf(&log_file, "%s", buf.c_ptr_safe()))
+ goto err;
+ thd->free_items(); /* not reached when the write above jumps to err */
+ }
+ if ((log_slow_verbosity & LOG_SLOW_VERBOSITY_WARNINGS) &&
+ thd->get_stmt_da()->unsafe_statement_warn_count())
+ {
+ Diagnostics_area::Sql_condition_iterator it=
+ thd->get_stmt_da()->sql_conditions();
+ ulong idx, max_warnings= thd->variables.log_slow_max_warnings;
+ const Sql_condition *err;
+ my_b_printf(&log_file, "# Warnings\n"); /* result not checked, unlike the writes above */
+ for (idx= 0; (err= it++) && idx < max_warnings; idx++)
+ {
+ my_b_printf(&log_file, "# %-15s %4u %.*s\n", /* result not checked */
+ warning_level_names[err->get_level()].str,
+ (uint) err->get_sql_errno(),
+ (int) err->get_message_octet_length(),
+ err->get_message_text());
+ }
+ }
+ if (thd->db.str && strcmp(thd->db.str, db))
+ { // Database changed
+ if (my_b_printf(&log_file,"use %s;\n",thd->db.str))
+ goto err;
+ strmov(db,thd->db.str); /* remember it so the next entry only logs a change */
+ }
+ if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
+ {
+ end=strmov(end, ",last_insert_id=");
+ end=longlong10_to_str((longlong)
+ thd->first_successful_insert_id_in_prev_stmt_for_binlog,
+ end, -10);
+ }
+ // Save value if we do an insert.
+ if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
+ {
+ if (!(specialflag & SPECIAL_SHORT_LOG_FORMAT)) /* already known to hold inside the enclosing block */
+ {
+ end=strmov(end,",insert_id=");
+ end=longlong10_to_str((longlong)
+ thd->auto_inc_intervals_in_cur_stmt_for_binlog.minimum(),
+ end, -10);
+ }
+ }
+ /*
+ This info used to show up randomly, depending on whether the query
+ checked the query start time or not. now we always write current
+ timestamp to the slow log
+ */
+ end= strmov(end, ",timestamp=");
+ end= int10_to_str((long) current_time, end, 10);
+
+ if (end != buff) /* always true here: ",timestamp=" was appended just above */
+ {
+ *end++=';';
+ *end='\n';
+ if (my_b_write(&log_file, (uchar*) "SET ", 4) ||
+ my_b_write(&log_file, (uchar*) buff + 1, (uint) (end-buff))) /* skips the leading ','; the length still covers the final '\n' */
+ goto err;
+ }
+ if (is_command)
+ {
+ end= strxmov(buff, "# administrator command: ", NullS);
+ buff_len= (ulong) (end - buff);
+ DBUG_EXECUTE_IF("simulate_slow_log_write_error",
+ {DBUG_SET("+d,simulate_file_write_error");});
+ if(my_b_write(&log_file, (uchar*) buff, buff_len))
+ goto err;
+ }
+ if (my_b_write(&log_file, (uchar*) sql_text, sql_text_len) ||
+ my_b_write(&log_file, (uchar*) ";\n",2) ||
+ flush_io_cache(&log_file))
+ goto err;
+
+ }
+ }
+end:
+ mysql_mutex_unlock(&LOCK_log);
+ DBUG_RETURN(error);
+
+err:
+ error= 1;
+ if (!write_error)
+ {
+ write_error= 1;
+ sql_print_error(ER_THD(thd, ER_ERROR_ON_WRITE), name, errno);
+ }
+ goto end; /* leave through the common unlock path */
+}
+
+
+/**
+  Work out the file name a log should use.
+
+  - If log_name is NULL or empty, the name is built in 'buff': pidfile_name
+    is copied there (leaving room for 'suffix') and run through fn_format()
+    with an empty directory, 'suffix' as extension and the
+    MY_REPLACE_EXT | MY_REPLACE_DIR flags.
+  - Else, if strip_ext is set, log_name is copied into 'buff' up to the
+    position returned by fn_ext(), capped at FN_REFLEN-1 characters.
+  - Else log_name itself is returned and 'buff' is not touched.
+
+  @param log_name   Requested log name; may be NULL or empty
+  @param suffix     Extension used when the name is derived from pidfile_name
+  @param strip_ext  Cut log_name at its extension
+  @param buff       Work buffer; assumed to hold at least FN_REFLEN bytes
+                    (not checked here)
+
+  @return Either 'buff' or 'log_name', as described above
+
+  @todo
+    The following should be using fn_format(); We just need to
+    first change fn_format() to cut the file name if it's too long.
+*/
+const char *MYSQL_LOG::generate_name(const char *log_name,
+                                     const char *suffix,
+                                     bool strip_ext, char *buff)
+{
+  const bool name_given= log_name && log_name[0];
+
+  if (name_given && !strip_ext)
+    return log_name;
+
+  if (name_given)
+  {
+    // get rid of extension if the log is binary to avoid problems
+    const char *ext_start= fn_ext(log_name);
+    uint stem_length= (uint) (ext_start - log_name);
+    strmake(buff, log_name, MY_MIN(stem_length, FN_REFLEN-1));
+    return (const char*) buff;
+  }
+
+  /* No usable name given: derive one from the pid file name. */
+  strmake(buff, pidfile_name, FN_REFLEN - strlen(suffix) - 1);
+  return (const char *)
+    fn_format(buff, buff, "", suffix, MYF(MY_REPLACE_EXT|MY_REPLACE_DIR));
+}
+
+
+/*
+  Print some additional information about addition/removal of
+  XID list entries.
+
+  X is the format string handed to WSREP_DEBUG(); it receives the binlog
+  name (as a NUL-terminated copy) followed by the entry's binlog_id.
+  Y is a pointer to the list entry. Y is evaluated more than once, so it
+  must not have side effects.
+
+  Both variants expand to exactly one statement (do { } while(0)), so the
+  macro can be used as the unbraced body of an if/else and always needs a
+  trailing ';', whether or not WITH_WSREP is defined.
+
+  TODO: Remove once MDEV-9510 is fixed.
+*/
+#ifdef WITH_WSREP
+#define WSREP_XID_LIST_ENTRY(X, Y)                                      \
+  do                                                                    \
+  {                                                                     \
+    if (wsrep_debug)                                                    \
+    {                                                                   \
+      char wsrep_xid_name_buf[FN_REFLEN];                               \
+      strmake(wsrep_xid_name_buf, (Y)->binlog_name,                     \
+              (Y)->binlog_name_len);                                    \
+      WSREP_DEBUG(X, wsrep_xid_name_buf, (Y)->binlog_id);               \
+    }                                                                   \
+  } while(0)
+#else
+#define WSREP_XID_LIST_ENTRY(X, Y) do { } while(0)
+#endif
+
+/**
+  Construct a binlog object with its counters, flags and pointers reset.
+
+  No mutex or condition variable is created here (see the comment in the
+  body).
+
+  @param sync_period  Stored in sync_period_ptr; not dereferenced here
+*/
+MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
+  : reset_master_pending(0),
+    mark_xid_done_waiting(0),
+    bytes_written(0),
+    last_used_log_number(0),
+    file_id(1),
+    open_count(1),
+    group_commit_queue(0),
+    group_commit_queue_busy(FALSE),
+    num_commits(0),
+    num_group_commits(0),
+    group_commit_trigger_count(0),
+    group_commit_trigger_timeout(0),
+    group_commit_trigger_lock_wait(0),
+    sync_period_ptr(sync_period),
+    sync_counter(0),
+    state_file_deleted(false),
+    binlog_state_recover_done(false),
+    is_relay_log(0),
+    relay_signal_cnt(0),
+    checksum_alg_reset(BINLOG_CHECKSUM_ALG_UNDEF),
+    relay_log_checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF),
+    description_event_for_exec(0),
+    description_event_for_queue(0),
+    current_binlog_id(0),
+    reset_master_count(0)
+{
+  /*
+    We don't want to initialize locks here as such initialization depends on
+    safe_mutex (when using safe_mutex) which depends on MY_INIT(), which is
+    called only in main(). Doing initialization here would make it happen
+    before main().
+  */
+  index_file_name[0]= 0;
+  /* Both IO_CACHE members start out zero-filled. */
+  bzero((char*) &index_file, sizeof(index_file));
+  bzero((char*) &purge_index_file, sizeof(purge_index_file));
+}
+
+/**
+  Ask the binlog background thread to stop and wait until it has done so.
+
+  Does nothing unless binlog_background_thread_started is set.
+*/
+void MYSQL_BIN_LOG::stop_background_thread()
+{
+  if (!binlog_background_thread_started)
+    return;
+
+  mysql_mutex_lock(&LOCK_binlog_background_thread);
+  binlog_background_thread_stop= true;
+  mysql_cond_signal(&COND_binlog_background_thread);
+  /*
+    Wait until the stop flag reads false again. Presumably the background
+    thread clears it and signals COND_binlog_background_thread_end when it
+    exits - confirm against the thread function.
+  */
+  while (binlog_background_thread_stop)
+  {
+    mysql_cond_wait(&COND_binlog_background_thread_end,
+                    &LOCK_binlog_background_thread);
+  }
+  mysql_mutex_unlock(&LOCK_binlog_background_thread);
+
+  binlog_background_thread_started= false;
+  binlog_background_thread_stop= true; // mark it's not going to restart
+}
+
+/**
+  Release what the binlog object owns. This is called only once.
+
+  When the object was inited: stops the background thread (binary log
+  only), closes the log and its index, deletes both description events,
+  empties binlog_xid_count_list and destroys the mutexes and condition
+  variables. For the binary log (not the relay log) the global GTID binlog
+  state is freed in any case.
+*/
+
+void MYSQL_BIN_LOG::cleanup()
+{
+  DBUG_ENTER("cleanup");
+  if (inited)
+  {
+    /* Wait for the binlog background thread to stop. */
+    if (!is_relay_log)
+      stop_background_thread();
+
+    inited= 0;
+
+    mysql_mutex_lock(&LOCK_log);
+    close(LOG_CLOSE_INDEX|LOG_CLOSE_STOP_EVENT);
+    mysql_mutex_unlock(&LOCK_log);
+
+    delete description_event_for_queue;
+    delete description_event_for_exec;
+
+    for (xid_count_per_binlog *entry= binlog_xid_count_list.get();
+         entry;
+         entry= binlog_xid_count_list.get())
+    {
+      /*
+        There should be no pending XIDs at shutdown, and only one entry (for
+        the active binlog file) in the list.
+      */
+      DBUG_ASSERT(entry->xid_count == 0);
+      DBUG_ASSERT(!binlog_xid_count_list.head());
+      WSREP_XID_LIST_ENTRY("MYSQL_BIN_LOG::cleanup(): Removing xid_list_entry "
+                           "for %s (%lu)", entry);
+      delete entry;
+    }
+
+    /* Mutexes first, then the condition variables. */
+    mysql_mutex_destroy(&LOCK_log);
+    mysql_mutex_destroy(&LOCK_index);
+    mysql_mutex_destroy(&LOCK_xid_list);
+    mysql_mutex_destroy(&LOCK_binlog_background_thread);
+    mysql_mutex_destroy(&LOCK_binlog_end_pos);
+
+    mysql_cond_destroy(&COND_relay_log_updated);
+    mysql_cond_destroy(&COND_bin_log_updated);
+    mysql_cond_destroy(&COND_queue_busy);
+    mysql_cond_destroy(&COND_xid_list);
+    mysql_cond_destroy(&COND_binlog_background_thread);
+    mysql_cond_destroy(&COND_binlog_background_thread_end);
+  }
+
+  /*
+    Free data for global binlog state.
+    We can't do that automatically as we need to do this before
+    safemalloc is shut down
+  */
+  if (!is_relay_log)
+    rpl_global_gtid_binlog_state.free();
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Init binlog-specific vars.
+
+  @param max_size_arg  Value stored in max_size
+*/
+void MYSQL_BIN_LOG::init(ulong max_size_arg)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::init");
+
+  this->max_size= max_size_arg;
+  DBUG_PRINT("info",("max_size: %lu", max_size));
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Create the mutexes and condition variables used by this log object.
+
+  MYSQL_LOG::init_pthread_objects() is called first for the objects owned
+  by the base class.
+*/
+void MYSQL_BIN_LOG::init_pthread_objects()
+{
+  MYSQL_LOG::init_pthread_objects();
+
+  /* Index file lock; deadlock detection is switched off for it. */
+  mysql_mutex_init(m_key_LOCK_index, &LOCK_index, MY_MUTEX_INIT_SLOW);
+  mysql_mutex_setflags(&LOCK_index, MYF_NO_DEADLOCK_DETECTION);
+
+  /* XID list lock. */
+  mysql_mutex_init(key_BINLOG_LOCK_xid_list, &LOCK_xid_list,
+                   MY_MUTEX_INIT_FAST);
+
+  /* Log update, commit queue and XID list conditions. */
+  mysql_cond_init(m_key_relay_log_update, &COND_relay_log_updated, 0);
+  mysql_cond_init(m_key_bin_log_update, &COND_bin_log_updated, 0);
+  mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0);
+  mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0);
+
+  /* Objects used to talk to the binlog background thread. */
+  mysql_mutex_init(key_BINLOG_LOCK_binlog_background_thread,
+                   &LOCK_binlog_background_thread, MY_MUTEX_INIT_FAST);
+  mysql_cond_init(key_BINLOG_COND_binlog_background_thread,
+                  &COND_binlog_background_thread, 0);
+  mysql_cond_init(key_BINLOG_COND_binlog_background_thread_end,
+                  &COND_binlog_background_thread_end, 0);
+
+  /* Lock for the binlog end position. */
+  mysql_mutex_init(m_key_LOCK_binlog_end_pos, &LOCK_binlog_end_pos,
+                   MY_MUTEX_INIT_SLOW);
+}
+
+
+/**
+  Open (creating it if needed) the index file and set up its IO_CACHE.
+
+  The index file name is produced by fn_format() with ".index" as the
+  extension argument. When index_file_name_arg is NULL the name is derived
+  from log_name and MY_REPLACE_EXT is requested as well.
+  With HAVE_REPLICATION, the purge index file is then processed to bring
+  the index and the files on disk back in sync.
+
+  @param index_file_name_arg  Index file name, or NULL to derive it from
+                              log_name
+  @param log_name             Base name used when no index name is given
+  @param need_mutex           Forwarded to purge_index_entry()
+
+  @retval FALSE ok
+  @retval TRUE  error
+*/
+bool MYSQL_BIN_LOG::open_index_file(const char *index_file_name_arg,
+                                    const char *log_name, bool need_mutex)
+{
+  File index_file_nr= -1;
+  bool open_failed= true;
+  myf opt= MY_UNPACK_FILENAME;
+  DBUG_ASSERT(!my_b_inited(&index_file));
+
+  /*
+    First open of this class instance
+    Create an index file that will hold all file names uses for logging.
+    Add new entries to the end of it.
+  */
+  if (!index_file_name_arg)
+  {
+    index_file_name_arg= log_name;    // Use same basename for index file
+    opt= MY_UNPACK_FILENAME | MY_REPLACE_EXT;
+  }
+  fn_format(index_file_name, index_file_name_arg, mysql_data_home,
+            ".index", opt);
+
+  /* Open, sync, then attach a write cache positioned at end of file. */
+  index_file_nr= mysql_file_open(m_key_file_log_index,
+                                 index_file_name,
+                                 O_RDWR | O_CREAT | O_BINARY | O_CLOEXEC,
+                                 MYF(MY_WME));
+  if (index_file_nr >= 0 && !mysql_file_sync(index_file_nr, MYF(MY_WME)))
+  {
+    my_off_t append_pos= mysql_file_seek(index_file_nr, 0L, MY_SEEK_END,
+                                         MYF(0));
+    open_failed= init_io_cache_ext(&index_file, index_file_nr,
+                                   IO_SIZE, WRITE_CACHE,
+                                   append_pos,
+                                   0, MYF(MY_WME | MY_WAIT_IF_FULL),
+                                   m_key_file_log_index_cache) ||
+                 DBUG_IF("fault_injection_openning_index");
+  }
+
+  if (open_failed)
+  {
+    /*
+      TODO: all operations creating/deleting the index file or a log, should
+      call my_sync_dir() or my_sync_dir_by_file() to be durable.
+      TODO: file creation should be done with mysql_file_create()
+      not mysql_file_open().
+    */
+    if (index_file_nr >= 0)
+      mysql_file_close(index_file_nr, MYF(0));
+    return TRUE;
+  }
+
+#ifdef HAVE_REPLICATION
+  /*
+    Sync the index by purging any binary log file that is not registered.
+    In other words, either purge binary log files that were removed from
+    the index but not purged from the file system due to a crash or purge
+    any binary log file that was created but not register in the index
+    due to a crash.
+  */
+
+  if (set_purge_index_file_name(index_file_name_arg) ||
+      open_purge_index_file(FALSE) ||
+      purge_index_entry(NULL, NULL, need_mutex) ||
+      close_purge_index_file() ||
+      DBUG_IF("fault_injection_recovering_index"))
+  {
+    sql_print_error("MYSQL_BIN_LOG::open_index_file failed to sync the index "
+                    "file.");
+    return TRUE;
+  }
+#endif
+
+  return FALSE;
+}
+
+
+/**
+  Open a (new) binlog file.
+
+  - Open the log file and the index file. Register the new
+    file name in it
+  - When calling this when the file is in use, you must have a locks
+    on LOCK_log and LOCK_index.
+
+  What gets written to a fresh file, in order: the magic header (only if
+  the file is empty), a Format_description event, optionally a
+  Start_encryption event, and for the binary log (not the relay log) a
+  Gtid_list event and a Binlog_checkpoint event. For a relay log with a
+  version >= 4 description_event_for_queue, that event is appended too.
+
+  @param log_name           Passed to init_and_set_log_file_name() and
+                            MYSQL_LOG::open()
+  @param new_name           Passed to init_and_set_log_file_name()
+  @param next_log_number    Passed to init_and_set_log_file_name()
+  @param io_cache_type_arg  IO_CACHE type to open the log file with
+  @param max_size_arg       Passed to init()
+  @param null_created_arg   Copied to dont_set_created of the format
+                            description event
+  @param need_mutex         Forwarded to purge_index_entry() on the error
+                            path
+
+  @retval
+    0	ok
+  @retval
+    1	error
+*/
+
+bool MYSQL_BIN_LOG::open(const char *log_name,
+                         const char *new_name,
+                         ulong next_log_number,
+                         enum cache_type io_cache_type_arg,
+                         ulong max_size_arg,
+                         bool null_created_arg,
+                         bool need_mutex)
+{
+  xid_count_per_binlog *new_xid_list_entry= NULL, *b;
+  DBUG_ENTER("MYSQL_BIN_LOG::open");
+
+  mysql_mutex_assert_owner(&LOCK_log);
+
+  if (!is_relay_log)
+  {
+    if (!binlog_state_recover_done)
+    {
+      binlog_state_recover_done= true;
+      if (do_binlog_recovery(opt_bin_logname, false))
+        DBUG_RETURN(1);
+    }
+
+    if ((!binlog_background_thread_started &&
+         !binlog_background_thread_stop) &&
+        start_binlog_background_thread())
+      DBUG_RETURN(1);
+  }
+
+  /* We need to calculate new log file name for purge to delete old */
+  if (init_and_set_log_file_name(log_name, new_name, next_log_number,
+                                 LOG_BIN, io_cache_type_arg))
+  {
+    sql_print_error("MYSQL_BIN_LOG::open failed to generate new file name.");
+    if (!is_relay_log)
+      goto err;
+    DBUG_RETURN(1);
+  }
+
+#ifdef HAVE_REPLICATION
+  if (open_purge_index_file(TRUE) ||
+      register_create_index_entry(log_file_name) ||
+      sync_purge_index_file() ||
+      DBUG_IF("fault_injection_registering_index"))
+  {
+    /**
+      TODO:
+      Although this was introduced to appease valgrind when
+      injecting emulated faults using
+      fault_injection_registering_index it may be good to consider
+      what actually happens when open_purge_index_file succeeds but
+      register or sync fails.
+
+      Perhaps we might need the code below in MYSQL_LOG_BIN::cleanup
+      for "real life" purposes as well?
+    */
+    DBUG_EXECUTE_IF("fault_injection_registering_index", {
+      if (my_b_inited(&purge_index_file))
+      {
+        end_io_cache(&purge_index_file);
+        my_close(purge_index_file.file, MYF(0));
+      }
+    });
+
+    sql_print_error("MYSQL_BIN_LOG::open failed to sync the index file.");
+    DBUG_RETURN(1);
+  }
+  DBUG_EXECUTE_IF("crash_create_non_critical_before_update_index", DBUG_SUICIDE(););
+#endif
+
+  write_error= 0;
+
+  /* open the main log file */
+  if (MYSQL_LOG::open(
+#ifdef HAVE_PSI_INTERFACE
+                      m_key_file_log,
+#endif
+                      log_name,
+                      LOG_UNKNOWN, /* Don't generate new name */
+                      0, 0, io_cache_type_arg))
+  {
+#ifdef HAVE_REPLICATION
+    close_purge_index_file();
+#endif
+    DBUG_RETURN(1);                            /* all warnings issued */
+  }
+
+  init(max_size_arg);
+
+  open_count++;
+
+  DBUG_ASSERT(log_type == LOG_BIN);
+
+  {
+    bool write_file_name_to_index_file=0;
+
+    if (!my_b_filelength(&log_file))
+    {
+      /*
+        The binary log file was empty (probably newly created)
+        This is the normal case and happens when the user doesn't specify
+        an extension for the binary log files.
+        In this case we write a standard header to it.
+      */
+      if (my_b_safe_write(&log_file, BINLOG_MAGIC,
+                          BIN_LOG_HEADER_SIZE))
+        goto err;
+      bytes_written+= BIN_LOG_HEADER_SIZE;
+      write_file_name_to_index_file= 1;
+    }
+
+    {
+      /*
+        In 4.x we put Start event only in the first binlog. But from 5.0 we
+        want a Start event even if this is not the very first binlog.
+      */
+      Format_description_log_event s(BINLOG_VERSION);
+      /*
+        don't set LOG_EVENT_BINLOG_IN_USE_F for SEQ_READ_APPEND io_cache
+        as we won't be able to reset it later
+      */
+      if (io_cache_type == WRITE_CACHE)
+        s.flags |= LOG_EVENT_BINLOG_IN_USE_F;
+
+      if (is_relay_log)
+      {
+        if (relay_log_checksum_alg == BINLOG_CHECKSUM_ALG_UNDEF)
+          relay_log_checksum_alg=
+            opt_slave_sql_verify_checksum ? (enum_binlog_checksum_alg) binlog_checksum_options
+                                          : BINLOG_CHECKSUM_ALG_OFF;
+        s.checksum_alg= relay_log_checksum_alg;
+        s.set_relay_log_event();
+      }
+      else
+        s.checksum_alg= (enum_binlog_checksum_alg)binlog_checksum_options;
+
+      crypto.scheme = 0;
+      DBUG_ASSERT(s.checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
+      if (!s.is_valid())
+        goto err;
+      s.dont_set_created= null_created_arg;
+      if (write_event(&s))
+        goto err;
+      bytes_written+= s.data_written;
+
+      if (encrypt_binlog)
+      {
+        uint key_version= encryption_key_get_latest_version(ENCRYPTION_KEY_SYSTEM_DATA);
+        if (key_version == ENCRYPTION_KEY_VERSION_INVALID)
+        {
+          sql_print_error("Failed to enable encryption of binary logs");
+          goto err;
+        }
+
+        if (key_version != ENCRYPTION_KEY_NOT_ENCRYPTED)
+        {
+          if (my_random_bytes(crypto.nonce, sizeof(crypto.nonce)))
+            goto err;
+
+          Start_encryption_log_event sele(1, key_version, crypto.nonce);
+          sele.checksum_alg= s.checksum_alg;
+          if (write_event(&sele))
+            goto err;
+
+          // Start_encryption_log_event is written, enable the encryption
+          if (crypto.init(sele.crypto_scheme, key_version))
+            goto err;
+        }
+      }
+
+      if (!is_relay_log)
+      {
+        char buf[FN_REFLEN];
+
+        /*
+          Output a Gtid_list_log_event at the start of the binlog file.
+
+          This is used to quickly determine which GTIDs are found in binlog
+          files earlier than this one, and which are found in this (or later)
+          binlogs.
+
+          The list gives a mapping from (domain_id, server_id) -> seq_no (so
+          this means that there is at most one entry for every unique pair
+          (domain_id, server_id) in the list). It indicates that this seq_no is
+          the last one found in an earlier binlog file for this (domain_id,
+          server_id) combination - so any higher seq_no should be search for
+          from this binlog file, or a later one.
+
+          This allows to locate the binlog file containing a given GTID by
+          scanning backwards, reading just the Gtid_list_log_event at the
+          start of each file, and scanning only the relevant binlog file when
+          found, not all binlog files.
+
+          The existence of a given entry (domain_id, server_id, seq_no)
+          guarantees only that this seq_no will not be found in this or any
+          later binlog file. It does not guarantee that it can be found it an
+          earlier binlog file, for example the file may have been purged.
+
+          If there is no entry for a given (domain_id, server_id) pair, then
+          it means that no such GTID exists in any earlier binlog. It is
+          permissible to remove such pair from future Gtid_list_log_events
+          if all previous binlog files containing such GTIDs have been purged
+          (though such optimization is not performed at the time of this
+          writing). So if there is no entry for given GTID it means that such
+          GTID should be search for in this or later binlog file, same as if
+          there had been an entry (domain_id, server_id, 0).
+        */
+
+        Gtid_list_log_event gl_ev(&rpl_global_gtid_binlog_state, 0);
+        if (write_event(&gl_ev))
+          goto err;
+
+        /* Output a binlog checkpoint event at the start of the binlog file. */
+
+        /*
+          Construct an entry in the binlog_xid_count_list for the new binlog
+          file (we will not link it into the list until we know the new file
+          is successfully created; otherwise we would have to remove it again
+          if creation failed, which gets tricky since other threads may have
+          seen the entry in the meantime - and we do not want to hold
+          LOCK_xid_list for long periods of time).
+
+          Write the current binlog checkpoint into the log, so XA recovery will
+          know from where to start recovery.
+        */
+        size_t off= dirname_length(log_file_name);
+        uint len= static_cast<uint>(strlen(log_file_name) - off);
+        new_xid_list_entry= new xid_count_per_binlog(log_file_name+off, len);
+        if (!new_xid_list_entry)
+          goto err;
+
+        /*
+          Find the name for the Initial binlog checkpoint.
+
+          Normally this will just be the first entry, as we delete entries
+          when their count drops to zero. But we scan the list to handle any
+          corner case, eg. for the first binlog file opened after startup, the
+          list will be empty.
+        */
+        mysql_mutex_lock(&LOCK_xid_list);
+        I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
+        while ((b= it++) && b->xid_count == 0)
+          ;
+        mysql_mutex_unlock(&LOCK_xid_list);
+        if (!b)
+          b= new_xid_list_entry;
+        if (b->binlog_name)
+          strmake(buf, b->binlog_name, b->binlog_name_len);
+        else
+          goto err;
+        /*
+          The checkpoint may name an older binlog than the one being opened,
+          so the length must be that entry's own name length ('len' is the
+          length of the new file's name and need not match).
+        */
+        Binlog_checkpoint_log_event ev(buf, b->binlog_name_len);
+        DBUG_EXECUTE_IF("crash_before_write_checkpoint_event",
+                        flush_io_cache(&log_file);
+                        mysql_file_sync(log_file.file, MYF(MY_WME));
+                        DBUG_SUICIDE(););
+        if (write_event(&ev))
+          goto err;
+        bytes_written+= ev.data_written;
+      }
+    }
+    if (description_event_for_queue &&
+        description_event_for_queue->binlog_version>=4)
+    {
+      /*
+        This is a relay log written to by the I/O slave thread.
+        Write the event so that others can later know the format of this relay
+        log.
+        Note that this event is very close to the original event from the
+        master (it has binlog version of the master, event types of the
+        master), so this is suitable to parse the next relay log's event. It
+        has been produced by
+        Format_description_log_event::Format_description_log_event(char* buf,).
+        Why don't we want to write the description_event_for_queue if this
+        event is for format<4 (3.23 or 4.x): this is because in that case, the
+        description_event_for_queue describes the data received from the
+        master, but not the data written to the relay log (*conversion*),
+        which is in format 4 (slave's).
+      */
+      /*
+        Set 'created' to 0, so that in next relay logs this event does not
+        trigger cleaning actions on the slave in
+        Format_description_log_event::apply_event_impl().
+      */
+      description_event_for_queue->created= 0;
+      /* Don't set log_pos in event header */
+      description_event_for_queue->set_artificial_event();
+
+      if (write_event(description_event_for_queue))
+        goto err;
+      bytes_written+= description_event_for_queue->data_written;
+    }
+    if (flush_io_cache(&log_file) ||
+        mysql_file_sync(log_file.file, MYF(MY_WME)))
+      goto err;
+
+    my_off_t offset= my_b_tell(&log_file);
+
+    if (!is_relay_log)
+    {
+      /* update binlog_end_pos so that it can be read by after sync hook */
+      reset_binlog_end_pos(log_file_name, offset);
+
+      mysql_mutex_lock(&LOCK_commit_ordered);
+      strmake_buf(last_commit_pos_file, log_file_name);
+      last_commit_pos_offset= offset;
+      mysql_mutex_unlock(&LOCK_commit_ordered);
+    }
+
+    if (write_file_name_to_index_file)
+    {
+#ifdef HAVE_REPLICATION
+#ifdef ENABLED_DEBUG_SYNC
+      if (current_thd)
+        DEBUG_SYNC(current_thd, "binlog_open_before_update_index");
+#endif
+      DBUG_EXECUTE_IF("crash_create_critical_before_update_index", DBUG_SUICIDE(););
+#endif
+
+      DBUG_ASSERT(my_b_inited(&index_file) != 0);
+      reinit_io_cache(&index_file, WRITE_CACHE,
+                      my_b_filelength(&index_file), 0, 0);
+      /*
+        As this is a new log file, we write the file name to the index
+        file. As every time we write to the index file, we sync it.
+      */
+      if (DBUG_IF("fault_injection_updating_index") ||
+          my_b_write(&index_file, (uchar*) log_file_name,
+                     strlen(log_file_name)) ||
+          my_b_write(&index_file, (uchar*) "\n", 1) ||
+          flush_io_cache(&index_file) ||
+          mysql_file_sync(index_file.file, MYF(MY_WME)))
+        goto err;
+
+#ifdef HAVE_REPLICATION
+      DBUG_EXECUTE_IF("crash_create_after_update_index", DBUG_SUICIDE(););
+#endif
+    }
+  }
+
+  if (!is_relay_log)
+  {
+    /*
+      Now the file was created successfully, so we can link in the entry for
+      the new binlog file in binlog_xid_count_list.
+    */
+    mysql_mutex_lock(&LOCK_xid_list);
+    ++current_binlog_id;
+    new_xid_list_entry->binlog_id= current_binlog_id;
+    /* Remove any initial entries with no pending XIDs.  */
+    while ((b= binlog_xid_count_list.head()) && b->xid_count == 0)
+    {
+      WSREP_XID_LIST_ENTRY("MYSQL_BIN_LOG::open(): Removing xid_list_entry for "
+                           "%s (%lu)", b);
+      delete binlog_xid_count_list.get();
+    }
+    mysql_cond_broadcast(&COND_xid_list);
+    WSREP_XID_LIST_ENTRY("MYSQL_BIN_LOG::open(): Adding new xid_list_entry for "
+                         "%s (%lu)", new_xid_list_entry);
+    binlog_xid_count_list.push_back(new_xid_list_entry);
+    mysql_mutex_unlock(&LOCK_xid_list);
+
+    /*
+      Now that we have synced a new binlog file with an initial Gtid_list
+      event, it is safe to delete the binlog state file. We will write out
+      a new, updated file at shutdown, and if we crash before we can recover
+      the state from the newly written binlog file.
+
+      Since the state file will contain out-of-date data as soon as the first
+      new GTID is binlogged, it is better to remove it, to avoid any risk of
+      accidentally reading incorrect data later.
+    */
+    if (!state_file_deleted)
+    {
+      char buf[FN_REFLEN];
+      fn_format(buf, opt_bin_logname, mysql_data_home, ".state",
+                MY_UNPACK_FILENAME);
+      my_delete(buf, MY_SYNC_DIR);
+      state_file_deleted= true;
+    }
+  }
+
+  log_state= LOG_OPENED;
+
+#ifdef HAVE_REPLICATION
+  close_purge_index_file();
+#endif
+
+  /* Notify the io thread that binlog is rotated to a new file */
+  if (is_relay_log)
+    signal_relay_log_update();
+  else
+    update_binlog_end_pos();
+  DBUG_RETURN(0);
+
+err:
+  /* Save errno first; the cleanup calls below may overwrite it. */
+  int tmp_errno= errno;
+#ifdef HAVE_REPLICATION
+  if (is_inited_purge_index_file())
+    purge_index_entry(NULL, NULL, need_mutex);
+  close_purge_index_file();
+#endif
+  sql_print_error(fatal_log_error, (name) ? name : log_name, tmp_errno);
+  /* The new entry was never linked into binlog_xid_count_list. */
+  if (new_xid_list_entry)
+    delete new_xid_list_entry;
+  close(LOG_CLOSE_INDEX);
+  DBUG_RETURN(1);
+}
+
+
+/**
+  Report the current log file name and position, taking LOCK_log around
+  the call to raw_get_current_log().
+
+  @param linfo  Filled in by raw_get_current_log()
+
+  @return The value returned by raw_get_current_log()
+*/
+int MYSQL_BIN_LOG::get_current_log(LOG_INFO* linfo)
+{
+  int result;
+
+  mysql_mutex_lock(&LOCK_log);
+  result= raw_get_current_log(linfo);
+  mysql_mutex_unlock(&LOCK_log);
+
+  return result;
+}
+
+/**
+  Copy the current log file name and the log_file position into linfo.
+
+  The caller must own LOCK_log (asserted).
+
+  @param linfo  Receives log_file_name and pos
+
+  @return Always 0
+*/
+int MYSQL_BIN_LOG::raw_get_current_log(LOG_INFO* linfo)
+{
+  mysql_mutex_assert_owner(&LOCK_log);
+
+  linfo->pos= my_b_tell(&log_file);
+  strmake_buf(linfo->log_file_name, log_file_name);
+
+  return 0;
+}
+
+/**
+  Move all data up in a file in a filename index file.
+
+  Everything from 'offset' to the end of the file is copied to the start
+  of the file, in chunks of at most IO_SIZE*2 bytes.
+
+  We do the copy outside of the IO_CACHE as the cache buffers would just
+  make things slower and more complicated.
+  In most cases the copy loop should only do one read.
+
+  @param index_file			File to move
+  @param offset			Move everything from here to beginning
+
+  @note
+    File will be truncated to be 'offset' shorter or filled up with newlines
+
+  @retval
+    0	ok
+  @retval
+    1	a read, write, resize or sync call failed
+*/
+
+#ifdef HAVE_REPLICATION
+
+static bool copy_up_file_and_fill(IO_CACHE *index_file, my_off_t offset)
+{
+  const my_off_t shift= offset;          // distance every byte moves up
+  my_off_t read_pos= offset;
+  File fd= index_file->file;
+  uchar chunk[IO_SIZE*2];
+  DBUG_ENTER("copy_up_file_and_fill");
+
+  for (;;)
+  {
+    int got;
+
+    mysql_file_seek(fd, read_pos, MY_SEEK_SET, MYF(0));
+    got= (int) mysql_file_read(fd, chunk, sizeof(chunk), MYF(MY_WME));
+    if (got < 0)
+      DBUG_RETURN(1);
+    if (got == 0)
+      break;					// end of file
+
+    mysql_file_seek(fd, read_pos - shift, MY_SEEK_SET, MYF(0));
+    if (mysql_file_write(fd, chunk, got,
+                         MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+      DBUG_RETURN(1);
+
+    read_pos+= got;
+  }
+
+  /* The following will either truncate the file or fill the end with \n' */
+  if (mysql_file_chsize(fd, read_pos - shift, '\n', MYF(MY_WME)) ||
+      mysql_file_sync(fd, MYF(MY_WME)))
+    DBUG_RETURN(1);
+
+  /* Reset data in old index cache */
+  reinit_io_cache(index_file, READ_CACHE, (my_off_t) 0, 0, 1);
+  DBUG_RETURN(0);
+}
+
+#endif /* HAVE_REPLICATION */
+
+/**
+  Find the position in the log-index-file for the given log name.
+
+  The index file is read from the start, one line at a time. Each line is
+  normalized into linfo->log_file_name and compared with the normalized
+  log_name; the scan stops at the first match.
+
+  @param linfo		Store here the found log file name and position to
+                       the NEXT log file name in the index file.
+  @param log_name	Filename to find in the index file.
+                       Is a null pointer if we want to read the first entry
+  @param need_lock	Set this to 1 if the parent doesn't already have a
+                       lock on LOCK_index
+
+  @note
+    On systems without the truncate function the file will end with one or
+    more empty lines.  These will be ignored when reading the file.
+    A line that does not end in a newline is skipped as well.
+
+  @retval
+    0			ok
+  @retval
+    LOG_INFO_EOF	        End of log-index-file found, or a name could
+                               not be normalized
+  @retval
+    LOG_INFO_IO		Got IO error while reading file
+*/
+
+int MYSQL_BIN_LOG::find_log_pos(LOG_INFO *linfo, const char *log_name,
+                                bool need_lock)
+{
+  int error= 0;
+  char *entry_path= linfo->log_file_name;
+  char wanted_path[FN_REFLEN], line[FN_REFLEN];
+  uint wanted_len= 0, entry_len= 0;
+  DBUG_ENTER("find_log_pos");
+  wanted_path[0]= entry_path[0]= 0;
+
+  /*
+    Mutex needed because we need to make sure the file pointer does not
+    move from under our feet
+  */
+  if (need_lock)
+    mysql_mutex_lock(&LOCK_index);
+  mysql_mutex_assert_owner(&LOCK_index);
+
+  // extend relative paths for log_name to be searched
+  if (log_name && normalize_binlog_name(wanted_path, log_name, is_relay_log))
+  {
+    error= LOG_INFO_EOF;
+    goto end;
+  }
+
+  if (log_name)
+    wanted_len= (uint) strlen(wanted_path);
+  else
+    wanted_len= 0;
+  DBUG_PRINT("enter", ("log_name: %s, full_log_name: %s",
+                       log_name ? log_name : "NULL", wanted_path));
+
+  /* As the file is flushed, we can't get an error here */
+  (void) reinit_io_cache(&index_file, READ_CACHE, (my_off_t) 0, 0, 0);
+
+  for (;;)
+  {
+    size_t length;
+    bool is_match;
+    my_off_t offset= my_b_tell(&index_file);
+
+    DBUG_EXECUTE_IF("simulate_find_log_pos_error",
+                    error= LOG_INFO_EOF; break;);
+    /* If we get 0 or 1 characters, this is the end of the file */
+    length= my_b_gets(&index_file, line, FN_REFLEN);
+    if (length <= 1)
+    {
+      /* Did not find the given entry; Return not found or error */
+      error= index_file.error ? LOG_INFO_IO : LOG_INFO_EOF;
+      break;
+    }
+    if (line[length-1] != '\n')
+      continue;                               // Not a log entry
+    line[length-1]= 0;                        // Remove end \n
+
+    // extend relative paths and match against full path
+    if (normalize_binlog_name(entry_path, line, is_relay_log))
+    {
+      error= LOG_INFO_EOF;
+      break;
+    }
+    entry_len= (uint) strlen(entry_path);
+
+    // if the log entry matches, null string matching anything
+    is_match= !log_name ||
+              (wanted_len == entry_len &&
+               !strncmp(entry_path, wanted_path, wanted_len));
+    if (!is_match)
+      continue;
+
+    DBUG_PRINT("info", ("Found log file entry"));
+    linfo->index_file_start_offset= offset;
+    linfo->index_file_offset = my_b_tell(&index_file);
+    break;
+  }
+
+end:
+  if (need_lock)
+    mysql_mutex_unlock(&LOCK_index);
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Read the next log file name from the log-index-file.
+
+  The line starting at linfo->index_file_offset is read, normalized into
+  linfo->log_file_name, and linfo->index_file_offset is moved past it.
+  linfo->index_file_start_offset is set to the offset the line was read
+  from.
+
+  @param
+    linfo		Store here the next log file name and position to
+			the file name after that.
+  @param
+    need_lock		Set this to 1 if the parent doesn't already have a
+			lock on LOCK_index
+
+  @note
+    - Before calling this function, one has to call find_log_pos()
+    to set up 'linfo'
+    - Mutex needed because we need to make sure the file pointer does not move
+    from under our feet
+    - The last character of the stored name is always overwritten with a
+    NUL to drop the newline; this relies on the line read ending in '\n'.
+
+  @retval
+    0			ok
+  @retval
+    LOG_INFO_EOF	        End of log-index-file found, or the name could
+                               not be normalized
+  @retval
+    LOG_INFO_IO		Got IO error while reading file
+*/
+
+int MYSQL_BIN_LOG::find_next_log(LOG_INFO* linfo, bool need_lock)
+{
+  int error= 0;
+  size_t length;
+  char line[FN_REFLEN];
+  char *entry_path= linfo->log_file_name;
+
+  if (need_lock)
+    mysql_mutex_lock(&LOCK_index);
+  mysql_mutex_assert_owner(&LOCK_index);
+
+  /* As the file is flushed, we can't get an error here */
+  (void) reinit_io_cache(&index_file, READ_CACHE, linfo->index_file_offset, 0,
+                         0);
+
+  linfo->index_file_start_offset= linfo->index_file_offset;
+  length= my_b_gets(&index_file, line, FN_REFLEN);
+
+  if (length <= 1)
+  {
+    /* 0 or 1 characters: end of file, or a read error */
+    error= index_file.error ? LOG_INFO_IO : LOG_INFO_EOF;
+  }
+  else if (line[0] != 0 &&
+           normalize_binlog_name(entry_path, line, is_relay_log))
+  {
+    error= LOG_INFO_EOF;
+  }
+  else
+  {
+    if (line[0] != 0)
+      length= strlen(entry_path);
+
+    entry_path[length-1]= 0;                  // kill \n
+    linfo->index_file_offset= my_b_tell(&index_file);
+  }
+
+  if (need_lock)
+    mysql_mutex_unlock(&LOCK_index);
+  return error;
+}
+
+
+/**
+ Delete all logs referred to in the index file.
+
+ The new index file will only contain this file.
+
+ @param thd Thread id. This can be zero in case of resetting
+ relay logs
+ @param create_new_log 1 if we should start writing to a new log file
+ @param next_log_number min number of next log file to use, if possible.
+
+ @note
+ If not called from slave thread, write start event to new log
+
+ @retval
+ 0 ok
+ @retval
+ 1 error
+*/
+
+bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool create_new_log,
+ rpl_gtid *init_state, uint32 init_state_len,
+ ulong next_log_number)
+{
+ LOG_INFO linfo;
+ bool error=0;
+ int err;
+ const char* save_name;
+ DBUG_ENTER("reset_logs");
+
+ if (!is_relay_log)
+ {
+ if (init_state && !is_empty_state())
+ {
+ my_error(ER_BINLOG_MUST_BE_EMPTY, MYF(0));
+ DBUG_RETURN(1);
+ }
+
+ /*
+ Mark that a RESET MASTER is in progress.
+ This ensures that a binlog checkpoint will not try to write binlog
+ checkpoint events, which would be useless (as we are deleting the binlog
+ anyway) and could deadlock, as we are holding LOCK_log.
+
+ Wait for any mark_xid_done() calls that might be already running to
+ complete (mark_xid_done_waiting counter to drop to zero); we need to
+ do this before we take the LOCK_log to not deadlock.
+ */
+ mysql_mutex_lock(&LOCK_xid_list);
+ reset_master_pending++;
+ while (mark_xid_done_waiting > 0)
+ mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
+ mysql_mutex_unlock(&LOCK_xid_list);
+ }
+
+ DEBUG_SYNC_C_IF_THD(thd, "reset_logs_after_set_reset_master_pending");
+ /*
+ We need to get both locks to be sure that no one is trying to
+ write to the index log file.
+ */
+ mysql_mutex_lock(&LOCK_log);
+ mysql_mutex_lock(&LOCK_index);
+
+ if (!is_relay_log)
+ {
+ /*
+ We are going to nuke all binary log files.
+ Without binlog, we cannot XA recover prepared-but-not-committed
+ transactions in engines. So force a commit checkpoint first.
+
+ Note that we take and immediately
+ release LOCK_after_binlog_sync/LOCK_commit_ordered. This has
+ the effect to ensure that any on-going group commit (in
+ trx_group_commit_leader()) has completed before we request the checkpoint,
+ due to the chaining of LOCK_log and LOCK_commit_ordered in that function.
+ (We are holding LOCK_log, so no new group commit can start).
+
+ Without this, it is possible (though perhaps unlikely) that the RESET
+ MASTER could run in-between the write to the binlog and the
+ commit_ordered() in the engine of some transaction, and then a crash
+ later would leave such transaction not recoverable.
+ */
+
+ mysql_mutex_lock(&LOCK_after_binlog_sync);
+ mysql_mutex_lock(&LOCK_commit_ordered);
+ mysql_mutex_unlock(&LOCK_after_binlog_sync);
+ mysql_mutex_unlock(&LOCK_commit_ordered);
+
+ mark_xids_active(current_binlog_id, 1);
+ do_checkpoint_request(current_binlog_id);
+
+ /* Now wait for all checkpoint requests and pending unlog() to complete. */
+ mysql_mutex_lock(&LOCK_xid_list);
+ for (;;)
+ {
+ if (is_xidlist_idle_nolock())
+ break;
+ /*
+ Wait until signalled that one more binlog dropped to zero, then check
+ again.
+ */
+ mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
+ }
+
+ /*
+ Now all XIDs are fully flushed to disk, and we are holding LOCK_log so
+ no new ones will be written. So we can proceed to delete the logs.
+ */
+ mysql_mutex_unlock(&LOCK_xid_list);
+ }
+
+ /* Save variables so that we can reopen the log */
+ save_name=name;
+ name=0; // Protect against free
+ close(LOG_CLOSE_TO_BE_OPENED);
+
+ last_used_log_number= 0; // Reset log number cache
+
+ /*
+ First delete all old log files and then update the index file.
+ As we first delete the log files and do not use sort of logging,
+ a crash may lead to an inconsistent state where the index has
+ references to non-existent files.
+
+ We need to invert the steps and use the purge_index_file methods
+ in order to make the operation safe.
+ */
+
+ if ((err= find_log_pos(&linfo, NullS, 0)) != 0)
+ {
+ uint errcode= purge_log_get_error_code(err);
+ sql_print_error("Failed to locate old binlog or relay log files");
+ my_message(errcode, ER_THD_OR_DEFAULT(thd, errcode), MYF(0));
+ error= 1;
+ goto err;
+ }
+
+ for (;;)
+ {
+ if (unlikely((error= my_delete(linfo.log_file_name, MYF(0)))))
+ {
+ if (my_errno == ENOENT)
+ {
+ if (thd)
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_LOG_PURGE_NO_FILE,
+ ER_THD(thd, ER_LOG_PURGE_NO_FILE),
+ linfo.log_file_name);
+
+ sql_print_information("Failed to delete file '%s'",
+ linfo.log_file_name);
+ my_errno= 0;
+ error= 0;
+ }
+ else
+ {
+ if (thd)
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_BINLOG_PURGE_FATAL_ERR,
+ "a problem with deleting %s; "
+ "consider examining correspondence "
+ "of your binlog index file "
+ "to the actual binlog files",
+ linfo.log_file_name);
+ error= 1;
+ goto err;
+ }
+ }
+ if (find_next_log(&linfo, 0))
+ break;
+ }
+
+ if (!is_relay_log)
+ {
+ if (init_state)
+ rpl_global_gtid_binlog_state.load(init_state, init_state_len);
+ else
+ rpl_global_gtid_binlog_state.reset();
+ }
+
+ /* Start logging with a new file */
+ close(LOG_CLOSE_INDEX | LOG_CLOSE_TO_BE_OPENED);
+ // Reset (open will update)
+ if (unlikely((error= my_delete(index_file_name, MYF(0)))))
+ {
+ if (my_errno == ENOENT)
+ {
+ if (thd)
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_LOG_PURGE_NO_FILE,
+ ER_THD(thd, ER_LOG_PURGE_NO_FILE),
+ index_file_name);
+ sql_print_information("Failed to delete file '%s'",
+ index_file_name);
+ my_errno= 0;
+ error= 0;
+ }
+ else
+ {
+ if (thd)
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_BINLOG_PURGE_FATAL_ERR,
+ "a problem with deleting %s; "
+ "consider examining correspondence "
+ "of your binlog index file "
+ "to the actual binlog files",
+ index_file_name);
+ error= 1;
+ goto err;
+ }
+ }
+ if (create_new_log && !open_index_file(index_file_name, 0, FALSE))
+ if (unlikely((error= open(save_name, 0, next_log_number,
+ io_cache_type, max_size, 0, FALSE))))
+ goto err;
+ my_free((void *) save_name);
+
+err:
+ if (error == 1)
+ name= const_cast<char*>(save_name);
+
+ if (!is_relay_log)
+ {
+ xid_count_per_binlog *b;
+ /*
+ Remove all entries in the xid_count list except the last.
+ Normally we will just be deleting all the entries that we waited for to
+ drop to zero above. But if we fail during RESET MASTER for some reason
+ then we will not have created any new log file, and we may keep the last
+ of the old entries.
+ */
+ mysql_mutex_lock(&LOCK_xid_list);
+ for (;;)
+ {
+ b= binlog_xid_count_list.head();
+ DBUG_ASSERT(b /* List can never become empty. */);
+ if (b->binlog_id == current_binlog_id)
+ break;
+ DBUG_ASSERT(b->xid_count == 0);
+ WSREP_XID_LIST_ENTRY("MYSQL_BIN_LOG::reset_logs(): Removing "
+ "xid_list_entry for %s (%lu)", b);
+ delete binlog_xid_count_list.get();
+ }
+ mysql_cond_broadcast(&COND_xid_list);
+ reset_master_pending--;
+ reset_master_count++;
+ mysql_mutex_unlock(&LOCK_xid_list);
+ }
+
+ mysql_mutex_unlock(&LOCK_index);
+ mysql_mutex_unlock(&LOCK_log);
+ DBUG_RETURN(error);
+}
+
+
+void MYSQL_BIN_LOG::wait_for_last_checkpoint_event()
+{
+  /*
+    Sleep on COND_xid_list until the head of binlog_xid_count_list is also
+    its last element, i.e. until only a single entry remains in the list.
+  */
+  mysql_mutex_lock(&LOCK_xid_list);
+  while (!binlog_xid_count_list.is_last(binlog_xid_count_list.head()))
+    mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
+  mysql_mutex_unlock(&LOCK_xid_list);
+
+  /*
+    LOCK_xid_list and LOCK_log are chained, so taking (and immediately
+    releasing) LOCK_log only succeeds after mark_xid_done() has written the
+    last checkpoint event.
+  */
+  mysql_mutex_lock(&LOCK_log);
+  mysql_mutex_unlock(&LOCK_log);
+}
+
+
+/**
+ Delete relay log files prior to rli->group_relay_log_name
+ (i.e. all logs which are not involved in a non-finished group
+ (transaction)), remove them from the index file and start on next
+ relay log.
+
+ IMPLEMENTATION
+
+ - You must hold rli->data_lock before calling this function, since
+ it writes group_relay_log_pos and similar fields of
+ Relay_log_info.
+ - Protects index file with LOCK_index
+ - Delete relevant relay log files
+ - Copy all file names after these ones to the front of the index file
+ - If the OS has truncate, truncate the file, else fill it with '\n'
+ - Read the next file name from the index file and store in rli->linfo
+
+ @param rli Relay log information
+ @param included If false, all relay logs that are strictly before
+ rli->group_relay_log_name are deleted ; if true, the
+ latter is deleted too (i.e. all relay logs
+ read by the SQL slave thread are deleted).
+
+ @note
+ - This is only called from the slave SQL thread when it has read
+ all commands from a relay log and wants to switch to a new relay log.
+ - When this happens, we can be in an active transaction as
+ a transaction can span over two relay logs
+ (although it is always written as a single block to the master's binary
+ log, hence cannot span over two master's binary logs).
+
+ @retval
+ 0 ok
+ @retval
+ LOG_INFO_EOF End of log-index-file found
+ @retval
+ LOG_INFO_SEEK Could not allocate IO cache
+ @retval
+ LOG_INFO_IO Got IO error while reading file
+*/
+
+#ifdef HAVE_REPLICATION
+
+int MYSQL_BIN_LOG::purge_first_log(Relay_log_info* rli, bool included)
+{
+  int error, errcode;
+  char *to_purge_if_included= NULL;
+  inuse_relaylog *ir;
+  ulonglong log_space_reclaimed= 0;
+  DBUG_ENTER("purge_first_log");
+
+  DBUG_ASSERT(is_open());
+  DBUG_ASSERT(rli->slave_running == MYSQL_SLAVE_RUN_NOT_CONNECT);
+  DBUG_ASSERT(!strcmp(rli->linfo.log_file_name,rli->event_relay_log_name));
+
+  mysql_mutex_assert_owner(&rli->data_lock);
+
+  mysql_mutex_lock(&LOCK_index);
+
+  /*
+    Walk the list of in-use relay logs from the front and free every entry
+    that is completed and fully dequeued. Stop at the first entry that is
+    still in use (which also forces included= false), or at the entry named
+    rli->group_relay_log_name when that log must be kept.
+  */
+  ir= rli->inuse_relaylog_list;
+  while (ir)
+  {
+    inuse_relaylog *next= ir->next;
+    if (!ir->completed || ir->dequeued_count < ir->queued_count)
+    {
+      included= false;
+      break;
+    }
+    if (!included && !strcmp(ir->name, rli->group_relay_log_name))
+      break;
+    if (!next)
+    {
+      /* Last entry is about to be freed: remember its name for purge_logs(). */
+      rli->last_inuse_relaylog= NULL;
+      included= 1;
+      to_purge_if_included= my_strdup(key_memory_Relay_log_info_group_relay_log_name,
+                                      ir->name, MYF(0));
+    }
+    rli->free_inuse_relaylog(ir);
+    ir= next;
+  }
+  rli->inuse_relaylog_list= ir;
+  if (ir)
+    to_purge_if_included= my_strdup(key_memory_Relay_log_info_group_relay_log_name,
+                                    ir->name, MYF(0));
+
+  /*
+    Read the next log file name from the index file and pass it back to
+    the caller.
+  */
+  if (unlikely((error=find_log_pos(&rli->linfo, rli->event_relay_log_name,
+                                   0))) ||
+      unlikely((error=find_next_log(&rli->linfo, 0))))
+  {
+    sql_print_error("next log error: %d offset: %llu log: %s included: %d",
+                    error, rli->linfo.index_file_offset,
+                    rli->event_relay_log_name, included);
+    goto err;
+  }
+
+  /*
+    Reset rli's coordinates to the current log.
+  */
+  rli->event_relay_log_pos= BIN_LOG_HEADER_SIZE;
+  strmake_buf(rli->event_relay_log_name,rli->linfo.log_file_name);
+
+  /*
+    If we removed the rli->group_relay_log_name file,
+    we must update the rli->group* coordinates, otherwise do not touch it as the
+    group's execution is not finished (e.g. COMMIT not executed)
+  */
+  if (included)
+  {
+    rli->group_relay_log_pos = BIN_LOG_HEADER_SIZE;
+    strmake_buf(rli->group_relay_log_name,rli->linfo.log_file_name);
+    rli->notify_group_relay_log_name_update();
+  }
+
+  /* Store where we are in the new file for the execution thread */
+  if (rli->flush())
+    error= LOG_INFO_IO;
+
+  DBUG_EXECUTE_IF("crash_before_purge_logs", DBUG_SUICIDE(););
+
+  rli->relay_log.purge_logs(to_purge_if_included, included,
+                            0, 0, &log_space_reclaimed);
+
+  /* Account for the freed space and wake up anyone waiting for it. */
+  mysql_mutex_lock(&rli->log_space_lock);
+  rli->log_space_total-= log_space_reclaimed;
+  mysql_cond_broadcast(&rli->log_space_cond);
+  mysql_mutex_unlock(&rli->log_space_lock);
+
+  /*
+   * Need to update the log pos because purge logs has been called
+   * after fetching initially the log pos at the beginning of the method.
+   */
+  if ((errcode= find_log_pos(&rli->linfo, rli->event_relay_log_name, 0)))
+  {
+    sql_print_error("next log error: %d offset: %llu log: %s included: %d",
+                    errcode, rli->linfo.index_file_offset,
+                    rli->event_relay_log_name, included);
+    /*
+      Propagate the lookup failure to the caller. A separate variable is used
+      for the lookup so that an earlier LOG_INFO_IO from rli->flush() is not
+      lost; that earlier error keeps precedence.
+    */
+    if (!error)
+      error= errcode;
+    goto err;
+  }
+
+  /* If included was passed, rli->linfo should be the first entry. */
+  DBUG_ASSERT(!included || rli->linfo.index_file_start_offset == 0);
+
+err:
+  my_free(to_purge_if_included);
+  mysql_mutex_unlock(&LOCK_index);
+  DBUG_RETURN(error);
+}
+
+/**
+  Update the log index file after purging.
+
+  Calls copy_up_file_and_fill() on the index file with the start offset
+  recorded in log_info and, when requested, passes the same offset to
+  adjust_linfo_offsets() so that running threads get their index offsets
+  corrected.
+
+  @param log_info             Entry whose index_file_start_offset is used
+  @param need_update_threads  If true, also call adjust_linfo_offsets()
+
+  @retval 0            ok
+  @retval LOG_INFO_IO  copy_up_file_and_fill() reported a failure
+*/
+
+int MYSQL_BIN_LOG::update_log_index(LOG_INFO* log_info, bool need_update_threads)
+{
+  int result= 0;
+
+  if (copy_up_file_and_fill(&index_file, log_info->index_file_start_offset))
+    result= LOG_INFO_IO;
+  else if (need_update_threads)
+  {
+    /* The index was rewritten: fix the offsets held by running threads. */
+    adjust_linfo_offsets(log_info->index_file_start_offset);
+  }
+  return result;
+}
+
+/**
+  Remove all logs before the given log from disk and from the index file.
+
+  @param to_log               Delete all log files before this file.
+  @param included             If true, to_log is deleted too.
+  @param need_mutex           If true, LOCK_index is taken for the duration
+                              of the call; otherwise the caller is expected
+                              to hold it already.
+  @param need_update_threads  If we want to update the log coordinates of
+                              all threads. False for relay logs, true otherwise.
+  @param reclaimed_space      If not null, increment this variable by
+                              the amount of log space freed
+
+  @note
+    If any of the logs before the deleted one is in use,
+    only purge logs up to this one.
+
+  @note
+    The first error encountered is the one returned. A failure while
+    processing the purge register file is only reported when no earlier
+    step has failed.
+
+  @retval
+    0                  ok
+  @retval
+    LOG_INFO_EOF       to_log not found
+    LOG_INFO_EMFILE    too many files opened
+    LOG_INFO_FATAL     if any other than ENOENT error from
+                       mysql_file_stat() or mysql_file_delete()
+*/
+
+int MYSQL_BIN_LOG::purge_logs(const char *to_log,
+                              bool included,
+                              bool need_mutex,
+                              bool need_update_threads,
+                              ulonglong *reclaimed_space)
+{
+  int error= 0;
+  bool exit_loop= 0;
+  LOG_INFO log_info;
+  THD *thd= current_thd;
+  DBUG_ENTER("purge_logs");
+  DBUG_PRINT("info",("to_log= %s",to_log));
+
+  if (need_mutex)
+    mysql_mutex_lock(&LOCK_index);
+  if (unlikely((error=find_log_pos(&log_info, to_log, 0 /*no mutex*/))) )
+  {
+    sql_print_error("MYSQL_BIN_LOG::purge_logs was called with file %s not "
+                    "listed in the index.", to_log);
+    goto err;
+  }
+
+  if (unlikely((error= open_purge_index_file(TRUE))))
+  {
+    sql_print_error("MYSQL_BIN_LOG::purge_logs failed to sync the index file.");
+    goto err;
+  }
+
+  /*
+    File name exists in index file; delete until we find this file
+    or a file that is used.
+  */
+  if (unlikely((error=find_log_pos(&log_info, NullS, 0 /*no mutex*/))))
+    goto err;
+  while ((strcmp(to_log,log_info.log_file_name) || (exit_loop=included)) &&
+         can_purge_log(log_info.log_file_name))
+  {
+    if (unlikely((error= register_purge_index_entry(log_info.log_file_name))))
+    {
+      sql_print_error("MYSQL_BIN_LOG::purge_logs failed to copy %s to register file.",
+                      log_info.log_file_name);
+      goto err;
+    }
+
+    if (find_next_log(&log_info, 0) || exit_loop)
+      break;
+  }
+
+  DBUG_EXECUTE_IF("crash_purge_before_update_index", DBUG_SUICIDE(););
+
+  if (unlikely((error= sync_purge_index_file())))
+  {
+    sql_print_error("MYSQL_BIN_LOG::purge_logs failed to flush register file.");
+    goto err;
+  }
+
+  /* We know how many files to delete. Update index file. */
+  if (unlikely((error=update_log_index(&log_info, need_update_threads))))
+  {
+    sql_print_error("MYSQL_BIN_LOG::purge_logs failed to update the index file");
+    goto err;
+  }
+
+  DBUG_EXECUTE_IF("crash_purge_critical_after_update_index", DBUG_SUICIDE(););
+
+err:
+  /*
+    Read each entry from purge_index_file and delete the file.
+    This also runs after an earlier failure, so its result must not
+    overwrite an error that has already been recorded: otherwise a failed
+    purge would be reported as success whenever this pass succeeds.
+  */
+  if (is_inited_purge_index_file())
+  {
+    int purge_error= purge_index_entry(thd, reclaimed_space, FALSE);
+    if (purge_error)
+    {
+      sql_print_error("MYSQL_BIN_LOG::purge_logs failed to process registered files"
+                      " that would be purged.");
+      if (!error)
+        error= purge_error;
+    }
+  }
+  close_purge_index_file();
+
+  DBUG_EXECUTE_IF("crash_purge_non_critical_after_update_index", DBUG_SUICIDE(););
+
+  if (need_mutex)
+    mysql_mutex_unlock(&LOCK_index);
+  DBUG_RETURN(error);
+}
+
+/*
+  Build purge_index_file_name from base_file_name, placed relative to
+  mysql_data_home and with the extension replaced by ".~rec~".
+
+  @retval 0  name was set
+  @retval 1  fn_format() failed (an error is written to the error log)
+*/
+int MYSQL_BIN_LOG::set_purge_index_file_name(const char *base_file_name)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::set_purge_index_file_name");
+
+  if (fn_format(purge_index_file_name, base_file_name, mysql_data_home,
+                ".~rec~", MYF(MY_UNPACK_FILENAME | MY_SAFE_PATH |
+                              MY_REPLACE_EXT)) != NULL)
+    DBUG_RETURN(0);
+
+  sql_print_error("MYSQL_BIN_LOG::set_purge_index_file_name failed to set "
+                  "file name.");
+  DBUG_RETURN(1);
+}
+
+/*
+  Open (creating it if needed) the purge register file and attach an
+  IO_CACHE to it.
+
+  @param destroy  If true, any existing register file is closed and deleted
+                  first and the cache is opened for writing; otherwise the
+                  cache is opened for reading.
+
+  Nothing is done if the cache is already initialised (and destroy is false).
+
+  @retval 0  ok
+  @retval 1  the file could not be opened or the cache could not be set up
+*/
+int MYSQL_BIN_LOG::open_purge_index_file(bool destroy)
+{
+  int error= 0;
+  File file= -1;
+
+  DBUG_ENTER("MYSQL_BIN_LOG::open_purge_index_file");
+
+  if (destroy)
+    close_purge_index_file();
+
+  if (!my_b_inited(&purge_index_file))
+  {
+    if ((file= my_open(purge_index_file_name, O_RDWR | O_CREAT | O_BINARY,
+                       MYF(MY_WME))) < 0 ||
+        init_io_cache(&purge_index_file, file, IO_SIZE,
+                      (destroy ? WRITE_CACHE : READ_CACHE),
+                      0, 0, MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+    {
+      /*
+        If the file was opened but the cache could not be initialised, the
+        descriptor is owned by nobody: close_purge_index_file() only closes
+        it when the cache is initialised. Close it here to avoid a leak.
+      */
+      if (file >= 0)
+        my_close(file, MYF(0));
+      error= 1;
+      sql_print_error("MYSQL_BIN_LOG::open_purge_index_file failed to open register "
+                      " file.");
+    }
+  }
+  DBUG_RETURN(error);
+}
+
+/*
+  Close the purge register file (if its cache is initialised), delete it
+  from disk and reset the cache structure to all zeroes.
+
+  @return result of my_close(), or 0 if the cache was not initialised
+*/
+int MYSQL_BIN_LOG::close_purge_index_file()
+{
+  int close_result= 0;
+
+  DBUG_ENTER("MYSQL_BIN_LOG::close_purge_index_file");
+
+  const bool cache_is_open= my_b_inited(&purge_index_file) ? true : false;
+  if (cache_is_open)
+  {
+    end_io_cache(&purge_index_file);
+    close_result= my_close(purge_index_file.file, MYF(0));
+  }
+
+  /* The file is removed and the cache wiped whether or not it was open. */
+  my_delete(purge_index_file_name, MYF(0));
+  bzero((char*) &purge_index_file, sizeof(purge_index_file));
+
+  DBUG_RETURN(close_result);
+}
+
+/* Tell whether the IO_CACHE of the purge register file is initialised. */
+bool MYSQL_BIN_LOG::is_inited_purge_index_file()
+{
+  if (my_b_inited(&purge_index_file))
+    return true;
+  return false;
+}
+
+/*
+  Flush the purge register file cache and sync the file to disk.
+
+  @return 0 on success, otherwise the non-zero result of the first step
+          (flush_io_cache() or my_sync()) that failed
+*/
+int MYSQL_BIN_LOG::sync_purge_index_file()
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::sync_purge_index_file");
+
+  int result= flush_io_cache(&purge_index_file);
+  if (likely(!result))
+    result= my_sync(purge_index_file.file, MYF(MY_WME));
+
+  DBUG_RETURN(result);
+}
+
+/*
+  Append one entry, followed by a newline, to the purge register file.
+
+  @param entry  NUL-terminated name to record
+
+  @return 0 on success, otherwise the non-zero result of the failing
+          my_b_write()
+*/
+int MYSQL_BIN_LOG::register_purge_index_entry(const char *entry)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::register_purge_index_entry");
+
+  int result= my_b_write(&purge_index_file, (const uchar*)entry,
+                         strlen(entry));
+  if (likely(!result))
+    result= my_b_write(&purge_index_file, (const uchar*)"\n", 1);
+
+  DBUG_RETURN(result);
+}
+
+/*
+  Record an entry in the register file; this simply forwards to
+  register_purge_index_entry() and returns its result.
+*/
+int MYSQL_BIN_LOG::register_create_index_entry(const char *entry)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::register_create_index_entry");
+  int result= register_purge_index_entry(entry);
+  DBUG_RETURN(result);
+}
+
+/**
+ Process the purge register file: delete every file listed in it that is
+ no longer referenced by the log index.
+
+ The register file is re-read from the start, one file name per line.
+ For each name:
+ - if the file cannot be stat'ed because it does not exist (ENOENT), a
+ warning is issued and the entry is skipped;
+ - if it is still found in the index by find_log_pos(), it is left alone;
+ - if find_log_pos() reports LOG_INFO_EOF (not in the index), the file
+ is deleted and its size is added to *reclaimed_space.
+
+ @param thd If not NULL, warnings are pushed to this thread
+ @param reclaimed_space If not NULL, incremented by the size of each
+ deleted file
+ @param need_mutex Passed on to find_log_pos()
+
+ @retval 0 ok
+ @retval LOG_INFO_EMFILE my_delete() failed with EMFILE
+ @retval LOG_INFO_FATAL stat or delete failed with an error other than
+ ENOENT
+ @retval other error from reinit_io_cache(), from reading the
+ register file, or from find_log_pos()
+*/
+int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *reclaimed_space,
+ bool need_mutex)
+{
+ DBUG_ENTER("MYSQL_BIN_LOG:purge_index_entry");
+ MY_STAT s;
+ int error= 0;
+ LOG_INFO log_info;
+ LOG_INFO check_log_info;
+
+ DBUG_ASSERT(my_b_inited(&purge_index_file));
+
+ /* Switch the register file cache to read mode, positioned at offset 0. */
+ if (unlikely((error= reinit_io_cache(&purge_index_file, READ_CACHE, 0, 0,
+ 0))))
+ {
+ sql_print_error("MYSQL_BIN_LOG::purge_index_entry failed to reinit register file "
+ "for read");
+ goto err;
+ }
+
+ for (;;)
+ {
+ size_t length;
+
+ /* A result of 0 or 1 means nothing usable was read: error or EOF. */
+ if ((length=my_b_gets(&purge_index_file, log_info.log_file_name,
+ FN_REFLEN)) <= 1)
+ {
+ if (purge_index_file.error)
+ {
+ error= purge_index_file.error;
+ sql_print_error("MYSQL_BIN_LOG::purge_index_entry error %d reading from "
+ "register file.", error);
+ goto err;
+ }
+
+ /* Reached EOF */
+ break;
+ }
+
+ /* Get rid of the trailing '\n' */
+ log_info.log_file_name[length-1]= 0;
+
+ if (unlikely(!mysql_file_stat(m_key_file_log, log_info.log_file_name, &s,
+ MYF(0))))
+ {
+ if (my_errno == ENOENT)
+ {
+ /*
+ It's not fatal if we can't stat a log file that does not exist;
+ If we could not stat, we won't delete.
+ */
+ if (thd)
+ {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_LOG_PURGE_NO_FILE, ER_THD(thd, ER_LOG_PURGE_NO_FILE),
+ log_info.log_file_name);
+ }
+ sql_print_information("Failed to execute mysql_file_stat on file '%s'",
+ log_info.log_file_name);
+ my_errno= 0;
+ }
+ else
+ {
+ /*
+ Other than ENOENT are fatal
+ */
+ if (thd)
+ {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_BINLOG_PURGE_FATAL_ERR,
+ "a problem with getting info on being purged %s; "
+ "consider examining correspondence "
+ "of your binlog index file "
+ "to the actual binlog files",
+ log_info.log_file_name);
+ }
+ else
+ {
+ sql_print_information("Failed to delete log file '%s'; "
+ "consider examining correspondence "
+ "of your binlog index file "
+ "to the actual binlog files",
+ log_info.log_file_name);
+ }
+ error= LOG_INFO_FATAL;
+ goto err;
+ }
+ }
+ else
+ {
+ /*
+ The file exists. It is deleted only when the index lookup below
+ fails with LOG_INFO_EOF, i.e. the name is not listed in the index.
+ A successful lookup leaves the file untouched.
+ */
+ if (unlikely((error= find_log_pos(&check_log_info,
+ log_info.log_file_name, need_mutex))))
+ {
+ if (error != LOG_INFO_EOF)
+ {
+ if (thd)
+ {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_BINLOG_PURGE_FATAL_ERR,
+ "a problem with deleting %s and "
+ "reading the binlog index file",
+ log_info.log_file_name);
+ }
+ else
+ {
+ sql_print_information("Failed to delete file '%s' and "
+ "read the binlog index file",
+ log_info.log_file_name);
+ }
+ goto err;
+ }
+
+ /* LOG_INFO_EOF is the expected outcome here, not a failure. */
+ error= 0;
+
+ DBUG_PRINT("info",("purging %s",log_info.log_file_name));
+ if (!my_delete(log_info.log_file_name, MYF(0)))
+ {
+ /* Deleted: account for the size obtained by the stat above. */
+ if (reclaimed_space)
+ *reclaimed_space+= s.st_size;
+ }
+ else
+ {
+ if (my_errno == ENOENT)
+ {
+ /* The file vanished between stat and delete: warn and go on. */
+ if (thd)
+ {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_LOG_PURGE_NO_FILE, ER_THD(thd, ER_LOG_PURGE_NO_FILE),
+ log_info.log_file_name);
+ }
+ sql_print_information("Failed to delete file '%s'",
+ log_info.log_file_name);
+ my_errno= 0;
+ }
+ else
+ {
+ if (thd)
+ {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_BINLOG_PURGE_FATAL_ERR,
+ "a problem with deleting %s; "
+ "consider examining correspondence "
+ "of your binlog index file "
+ "to the actual binlog files",
+ log_info.log_file_name);
+ }
+ else
+ {
+ sql_print_information("Failed to delete file '%s'; "
+ "consider examining correspondence "
+ "of your binlog index file "
+ "to the actual binlog files",
+ log_info.log_file_name);
+ }
+ /*
+ NOTE(review): my_errno is inspected after the reporting calls
+ above; this assumes they do not modify it - confirm.
+ */
+ if (my_errno == EMFILE)
+ {
+ DBUG_PRINT("info",
+ ("my_errno: %d, set ret = LOG_INFO_EMFILE", my_errno));
+ error= LOG_INFO_EMFILE;
+ goto err;
+ }
+ error= LOG_INFO_FATAL;
+ goto err;
+ }
+ }
+ }
+ }
+ }
+
+err:
+ DBUG_RETURN(error);
+}
+
+/**
+  Remove all logs last modified before the given date from disk and from
+  the index file.
+
+  The index is scanned from its first entry. The scan stops at the current
+  log file, at the first log that cannot be purged, or at the first log
+  whose modification time is not before purge_time. Everything up to and
+  including the last log that qualified is then handed to purge_logs().
+
+  @param purge_time   Delete all log files before given date.
+
+  @note
+    If any of the logs before the deleted one is in use,
+    only purge logs up to this one.
+
+  @retval
+    0                           ok
+  @retval
+    LOG_INFO_PURGE_NO_ROTATE    Binary file that can't be rotated
+    LOG_INFO_FATAL              if any other than ENOENT error from
+                                mysql_file_stat() or mysql_file_delete()
+*/
+
+int MYSQL_BIN_LOG::purge_logs_before_date(time_t purge_time)
+{
+  int error;
+  char to_log[FN_REFLEN];
+  LOG_INFO log_info;
+  MY_STAT stat_area;
+  THD *thd= current_thd;
+  DBUG_ENTER("purge_logs_before_date");
+
+  mysql_mutex_lock(&LOCK_index);
+  to_log[0]= 0;
+
+  if (unlikely((error=find_log_pos(&log_info, NullS, 0 /*no mutex*/))))
+    goto err;
+
+  while (strcmp(log_file_name, log_info.log_file_name) &&
+         can_purge_log(log_info.log_file_name))
+  {
+    if (mysql_file_stat(m_key_file_log,
+                        log_info.log_file_name, &stat_area, MYF(0)))
+    {
+      /* First log that is recent enough ends the scan. */
+      if (!(stat_area.st_mtime < purge_time))
+        break;
+      strmake_buf(to_log, log_info.log_file_name);
+    }
+    else if (my_errno == ENOENT)
+    {
+      /*
+        It's not fatal if we can't stat a log file that does not exist.
+      */
+      my_errno= 0;
+    }
+    else
+    {
+      /*
+        Other than ENOENT are fatal
+      */
+      if (thd)
+      {
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            ER_BINLOG_PURGE_FATAL_ERR,
+                            "a problem with getting info on being purged %s; "
+                            "consider examining correspondence "
+                            "of your binlog index file "
+                            "to the actual binlog files",
+                            log_info.log_file_name);
+      }
+      else
+      {
+        sql_print_information("Failed to delete log file '%s'",
+                              log_info.log_file_name);
+      }
+      error= LOG_INFO_FATAL;
+      goto err;
+    }
+
+    if (find_next_log(&log_info, 0))
+      break;
+  }
+
+  /* Nothing qualified: report success without touching anything. */
+  if (to_log[0])
+    error= purge_logs(to_log, 1, 0, 1, (ulonglong *) 0);
+  else
+    error= 0;
+
+err:
+  mysql_mutex_unlock(&LOCK_index);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Decide whether a log file may be purged.
+
+  A log cannot be purged if it is the active log, if its base name matches
+  an entry of binlog_xid_count_list, or if log_in_use() reports it as in use.
+*/
+bool
+MYSQL_BIN_LOG::can_purge_log(const char *log_file_name_arg)
+{
+  xid_count_per_binlog *entry;
+
+  if (is_active(log_file_name_arg))
+    return false;
+
+  mysql_mutex_lock(&LOCK_xid_list);
+  {
+    /* Entries are compared against the name with its directory part removed. */
+    const char *base_name=
+      log_file_name_arg + dirname_length(log_file_name_arg);
+    I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
+    while ((entry= it++))
+    {
+      if (strncmp(base_name, entry->binlog_name, entry->binlog_name_len) == 0)
+        break;
+    }
+  }
+  mysql_mutex_unlock(&LOCK_xid_list);
+
+  /* A non-NULL entry means the loop stopped on a match. */
+  if (entry)
+    return false;
+  return !log_in_use(log_file_name_arg);
+}
+#endif /* HAVE_REPLICATION */
+
+
+/* Locking wrapper: runs is_xidlist_idle_nolock() while holding LOCK_xid_list. */
+bool
+MYSQL_BIN_LOG::is_xidlist_idle()
+{
+  mysql_mutex_lock(&LOCK_xid_list);
+  const bool idle= is_xidlist_idle_nolock();
+  mysql_mutex_unlock(&LOCK_xid_list);
+  return idle;
+}
+
+
+/*
+  Return true when no entry of binlog_xid_count_list has a positive
+  xid_count. No lock is taken here.
+*/
+bool
+MYSQL_BIN_LOG::is_xidlist_idle_nolock()
+{
+  I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
+
+  for (xid_count_per_binlog *entry= it++; entry; entry= it++)
+  {
+    if (entry->xid_count > 0)
+      return false;
+  }
+  return true;
+}
+
+/**
+  Create a new log file name.
+
+  The result is the directory part of the current log_file_name followed
+  by log_ident.
+
+  @param buf        buf of at least FN_REFLEN where new name is stored
+  @param log_ident  file name to append after the directory part
+
+  @note
+    If the file name would be longer than FN_REFLEN it will be truncated
+*/
+
+void MYSQL_BIN_LOG::make_log_name(char* buf, const char* log_ident)
+{
+  size_t prefix_len= dirname_length(log_file_name);
+
+  /* Leave room for the terminating NUL. */
+  if (prefix_len > FN_REFLEN - 1)
+    prefix_len= FN_REFLEN - 1;
+
+  char *tail= buf + prefix_len;
+  strnmov(buf, log_file_name, prefix_len);
+  strmake(tail, log_ident, FN_REFLEN - prefix_len - 1);
+}
+
+
+/**
+  Check if we are writing/reading to the given log file.
+
+  @return true if log_file_name_arg equals the current log_file_name
+*/
+
+bool MYSQL_BIN_LOG::is_active(const char *log_file_name_arg)
+{
+  /**
+   * There should be a mysql_mutex_assert_owner(&LOCK_log) here, but
+   * existing callers do not hold the lock.
+   *
+   * Example stack trace:
+   * #8  MYSQL_BIN_LOG::is_active
+   * #9  MYSQL_BIN_LOG::can_purge_log
+   * #10 MYSQL_BIN_LOG::purge_logs
+   * #11 MYSQL_BIN_LOG::purge_first_log
+   * #12 next_event
+   * #13 exec_relay_log_event
+   *
+   * It has not been investigated whether this is legitimate (i.e. whether
+   * this comment is wrong).
+   */
+  return strcmp(log_file_name, log_file_name_arg) == 0;
+}
+
+
+/*
+  Wrappers around new_file_impl to avoid using an argument
+  to control locking. Such an argument 1) is less readable 2) breaks
+  encapsulation 3) allows external access to the class without
+  a lock (which is not possible with the private new_file_without_locking
+  method).
+
+  This variant takes LOCK_log around the call.
+
+  @retval
+    nonzero - error
+*/
+
+int MYSQL_BIN_LOG::new_file()
+{
+  mysql_mutex_lock(&LOCK_log);
+  const int rotate_result= new_file_impl();
+  mysql_mutex_unlock(&LOCK_log);
+  return rotate_result;
+}
+
+/*
+  Same as new_file(), but LOCK_log is not taken here.
+
+  @retval
+    nonzero - error
+ */
+int MYSQL_BIN_LOG::new_file_without_locking()
+{
+  const int rotate_result= new_file_impl();
+  return rotate_result;
+}
+
+
+/**
+ Start writing to a new log file or reopen the old file.
+
+ The caller must hold LOCK_log (this is asserted); LOCK_index is taken
+ and released here. Does nothing and returns 0 if the log is not open.
+
+ @retval
+ nonzero - error
+
+ @note
+ The new file name is stored last in the index file
+*/
+
+int MYSQL_BIN_LOG::new_file_impl()
+{
+ int error= 0, close_on_error= FALSE;
+ char new_name[FN_REFLEN], *new_name_ptr, *old_name, *file_to_open;
+ uint close_flag;
+ bool delay_close= false;
+ File UNINIT_VAR(old_file);
+ DBUG_ENTER("MYSQL_BIN_LOG::new_file_impl");
+
+ DBUG_ASSERT(log_type == LOG_BIN);
+ mysql_mutex_assert_owner(&LOCK_log);
+
+ if (!is_open())
+ {
+ DBUG_PRINT("info",("log is closed"));
+ DBUG_RETURN(error);
+ }
+
+ mysql_mutex_lock(&LOCK_index);
+
+ /* Reuse old name if not binlog and not update log */
+ new_name_ptr= name;
+
+ /*
+ If user hasn't specified an extension, generate a new log name
+ We have to do this here and not in open as we want to store the
+ new file name in the current binary log file.
+ */
+ if (unlikely((error= generate_new_name(new_name, name, 0))))
+ {
+#ifdef ENABLE_AND_FIX_HANG
+ close_on_error= TRUE;
+#endif
+ /* Skip the 'end' label: it decrements last_used_log_number. */
+ goto end2;
+ }
+ new_name_ptr=new_name;
+
+ {
+ /*
+ We log the whole file name for log file as the user may decide
+ to change base names at some point.
+ */
+ Rotate_log_event r(new_name + dirname_length(new_name), 0, LOG_EVENT_OFFSET,
+ is_relay_log ? Rotate_log_event::RELAY_LOG : 0);
+ /*
+ The current relay-log's closing Rotate event must have checksum
+ value computed with an algorithm of the last relay-logged FD event.
+ */
+ if (is_relay_log)
+ r.checksum_alg= relay_log_checksum_alg;
+ DBUG_ASSERT(!is_relay_log ||
+ relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
+ /*
+ The first operand is debug-only fault injection; when it fires, the
+ Rotate event is not written and the error path below is taken.
+ */
+ if ((DBUG_IF("fault_injection_new_file_rotate_event") &&
+ (error= close_on_error= TRUE)) ||
+ (error= write_event(&r)))
+ {
+ DBUG_EXECUTE_IF("fault_injection_new_file_rotate_event", errno= 2;);
+ close_on_error= TRUE;
+ my_printf_error(ER_ERROR_ON_WRITE,
+ ER_THD_OR_DEFAULT(current_thd, ER_CANT_OPEN_FILE),
+ MYF(ME_FATAL), name, errno);
+ goto end;
+ }
+ bytes_written+= r.data_written;
+ }
+
+ /*
+ Update needs to be signalled even if there is no rotate event
+ log rotation should give the waiting thread a signal to
+ discover EOF and move on to the next log.
+ */
+ if (unlikely((error= flush_io_cache(&log_file))))
+ {
+ close_on_error= TRUE;
+ goto end;
+ }
+ update_binlog_end_pos();
+
+ /* Keep the old name in old_name; it is freed once the new log is open. */
+ old_name=name;
+ name=0; // Don't free name
+ close_flag= LOG_CLOSE_TO_BE_OPENED | LOG_CLOSE_INDEX;
+ if (!is_relay_log)
+ {
+ /*
+ We need to keep the old binlog file open (and marked as in-use) until
+ the new one is fully created and synced to disk and index. Otherwise we
+ leave a window where if we crash, there is no binlog file marked as
+ crashed for server restart to detect the need for recovery.
+ */
+ old_file= log_file.file;
+ close_flag|= LOG_CLOSE_DELAYED_CLOSE;
+ delay_close= true;
+ }
+ close(close_flag);
+ /* Apply a pending change of the binlog checksum algorithm, if any. */
+ if (checksum_alg_reset != BINLOG_CHECKSUM_ALG_UNDEF)
+ {
+ DBUG_ASSERT(!is_relay_log);
+ DBUG_ASSERT(binlog_checksum_options != checksum_alg_reset);
+ binlog_checksum_options= checksum_alg_reset;
+ }
+ /*
+ Note that at this point, log_state != LOG_CLOSED
+ (important for is_open()).
+ */
+
+ /*
+ new_file() is only used for rotation (in FLUSH LOGS or because size >
+ max_binlog_size or max_relay_log_size).
+ If this is a binary log, the Format_description_log_event at the
+ beginning of the new file should have created=0 (to distinguish with the
+ Format_description_log_event written at server startup, which should
+ trigger temp tables deletion on slaves.
+ */
+
+ /* reopen index binlog file, BUG#34582 */
+ file_to_open= index_file_name;
+ error= open_index_file(index_file_name, 0, FALSE);
+ if (likely(!error))
+ {
+ /* reopen the binary log file. */
+ file_to_open= new_name_ptr;
+ error= open(old_name, new_name_ptr, 0, io_cache_type, max_size, 1, FALSE);
+ }
+
+ /* handle reopening errors */
+ if (unlikely(error))
+ {
+ /* file_to_open names whichever of the two opens failed. */
+ my_error(ER_CANT_OPEN_FILE, MYF(ME_FATAL), file_to_open, error);
+ close_on_error= TRUE;
+ }
+
+ my_free(old_name);
+
+end:
+ /* In case of errors, reuse the last generated log file name */
+ if (unlikely(error))
+ {
+ DBUG_ASSERT(last_used_log_number > 0);
+ last_used_log_number--;
+ }
+
+end2:
+ /* Now the old binlog file, kept open above, can finally be closed. */
+ if (delay_close)
+ {
+ clear_inuse_flag_when_closing(old_file);
+ mysql_file_close(old_file, MYF(MY_WME));
+ }
+
+ if (unlikely(error && close_on_error)) /* rotate or reopen failed */
+ {
+ /*
+ Close whatever was left opened.
+
+ We are keeping the behavior as it exists today, ie,
+ we disable logging and move on (see: BUG#51014).
+
+ TODO: as part of WL#1790 consider other approaches:
+ - kill mysql (safety);
+ - try multiple locations for opening a log file;
+ - switch server to protected/readonly mode
+ - ...
+ */
+ close(LOG_CLOSE_INDEX);
+ sql_print_error(fatal_log_error, new_name_ptr, errno);
+ }
+
+ mysql_mutex_unlock(&LOCK_index);
+
+ DBUG_RETURN(error);
+}
+
+/*
+  Write one event to the given IO_CACHE through a Log_event_writer.
+
+  When a crypto scheme is set and the target is log_file itself, the writer
+  is switched to encrypted mode. If cache_data is given, the event's
+  logged status is recorded in it.
+
+  @return result of Log_event_writer::write()
+*/
+bool MYSQL_BIN_LOG::write_event(Log_event *ev, binlog_cache_data *cache_data,
+                                IO_CACHE *file)
+{
+  Log_event_writer writer(file, 0, &crypto);
+
+  const bool encrypt_output= crypto.scheme && file == &log_file;
+  if (encrypt_output)
+  {
+    /* The context is allocated with alloca(), so it must stay in this frame. */
+    writer.ctx= alloca(crypto.ctx_size);
+    writer.set_encrypted_writer();
+  }
+
+  if (cache_data != NULL)
+    cache_data->add_status(ev->logged_status());
+
+  return writer.write(ev);
+}
+
+/* Locking wrapper: runs append_no_lock() while holding LOCK_log. */
+bool MYSQL_BIN_LOG::append(Log_event *ev)
+{
+  mysql_mutex_lock(&LOCK_log);
+  const bool failed= append_no_lock(ev);
+  mysql_mutex_unlock(&LOCK_log);
+  return failed;
+}
+
+
+/*
+  Append an event to the log, flush it, and rotate to a new file when the
+  log has grown beyond max_size.
+
+  The caller must hold LOCK_log (asserted). update_binlog_end_pos() is
+  called on every path, including the error ones.
+
+  @param ev  Event to write
+
+  @retval 0        ok
+  @retval nonzero  the event could not be written or flushed, or the
+                   rotation failed
+*/
+bool MYSQL_BIN_LOG::append_no_lock(Log_event* ev)
+{
+  bool error = 0;
+  DBUG_ENTER("MYSQL_BIN_LOG::append");
+
+  mysql_mutex_assert_owner(&LOCK_log);
+  DBUG_ASSERT(log_file.type == SEQ_READ_APPEND);
+
+  if (write_event(ev))
+  {
+    error=1;
+    goto err;
+  }
+  bytes_written+= ev->data_written;
+  DBUG_PRINT("info",("max_size: %lu",max_size));
+  if (flush_and_sync(0))
+  {
+    /* A failed flush must be reported, not returned as success. */
+    error=1;
+    goto err;
+  }
+  if (my_b_append_tell(&log_file) > max_size)
+    error= new_file_without_locking();
+err:
+  update_binlog_end_pos();
+  DBUG_RETURN(error);
+}
+
+/*
+  Append an already serialized event to the log, encrypting it first when
+  a crypto scheme is configured, then flush and rotate if the log has
+  grown beyond max_size.
+
+  The caller must hold LOCK_log (asserted).
+
+  @param buf  Serialized event. NOTE: when encryption is active, the 4 bytes
+              at buf + EVENT_LEN_OFFSET are overwritten with the first
+              4 bytes of buf before encrypting.
+  @param len  Length of the event in bytes
+
+  @retval 0        ok; update_binlog_end_pos() has been called
+  @retval nonzero  allocation, encryption, append, flush or rotation failed
+*/
+bool MYSQL_BIN_LOG::write_event_buffer(uchar* buf, uint len)
+{
+  bool error= 1;
+  uchar *ebuf= 0;
+  DBUG_ENTER("MYSQL_BIN_LOG::write_event_buffer");
+
+  DBUG_ASSERT(log_file.type == SEQ_READ_APPEND);
+
+  mysql_mutex_assert_owner(&LOCK_log);
+
+  if (crypto.scheme != 0)
+  {
+    DBUG_ASSERT(crypto.scheme == 1);
+
+    uint elen;
+    uchar iv[BINLOG_IV_LENGTH];
+
+    ebuf= (uchar*)my_safe_alloca(len);
+    if (!ebuf)
+      goto err;
+
+    crypto.set_iv(iv, (uint32)my_b_append_tell(&log_file));
+
+    /*
+      we want to encrypt everything, excluding the event length:
+      massage the data before the encryption
+    */
+    memcpy(buf + EVENT_LEN_OFFSET, buf, 4);
+
+    if (encryption_crypt(buf + 4, len - 4,
+                         ebuf + 4, &elen,
+                         crypto.key, crypto.key_length, iv, sizeof(iv),
+                         ENCRYPTION_FLAG_ENCRYPT | ENCRYPTION_FLAG_NOPAD,
+                         ENCRYPTION_KEY_SYSTEM_DATA, crypto.key_version))
+      goto err;
+
+    DBUG_ASSERT(elen == len - 4);
+
+    /* massage the data after the encryption */
+    memcpy(ebuf, ebuf + EVENT_LEN_OFFSET, 4);
+    int4store(ebuf + EVENT_LEN_OFFSET, len);
+
+    buf= ebuf;
+  }
+  if (my_b_append(&log_file, buf, len))
+    goto err;
+  bytes_written+= len;
+
+  DBUG_PRINT("info",("max_size: %lu",max_size));
+  /*
+    error is still 1 here, so a failed flush is reported to the caller
+    instead of being returned as success.
+  */
+  if (flush_and_sync(0))
+    goto err;
+  error= 0;
+  if (my_b_append_tell(&log_file) > max_size)
+    error= new_file_without_locking();
+err:
+  my_safe_afree(ebuf, len);
+  if (likely(!error))
+    update_binlog_end_pos();
+  DBUG_RETURN(error);
+}
+
+/*
+  Flush the log file cache and, every get_sync_period() calls, sync the
+  file to disk.
+
+  The caller must hold LOCK_log (asserted).
+
+  @param[out] synced  If not NULL, set to 1 when a sync was performed by
+                      this call and to 0 otherwise
+
+  @retval 0        ok
+  @retval nonzero  the flush or the sync failed
+*/
+bool MYSQL_BIN_LOG::flush_and_sync(bool *synced)
+{
+  int sync_result= 0;
+  const int log_fd= log_file.file;
+
+  if (synced)
+    *synced= 0;
+  mysql_mutex_assert_owner(&LOCK_log);
+
+  if (flush_io_cache(&log_file))
+    return 1;
+
+  /* A period of 0 disables syncing; the counter is not touched then. */
+  const uint period= get_sync_period();
+  if (!period || ++sync_counter < period)
+    return sync_result;
+
+  sync_counter= 0;
+  sync_result= mysql_file_sync(log_fd, MYF(MY_WME));
+  if (synced)
+    *synced= 1;
+#ifndef DBUG_OFF
+  if (opt_binlog_dbug_fsync_sleep > 0)
+    my_sleep(opt_binlog_dbug_fsync_sleep);
+#endif
+  return sync_result;
+}
+
+/*
+  Turn on "union" mode in thd->binlog_evt_union, starting at the given
+  query id. Both unioned_events flags are cleared. Union mode must not
+  already be active (asserted).
+*/
+void MYSQL_BIN_LOG::start_union_events(THD *thd, query_id_t query_id_param)
+{
+  DBUG_ASSERT(!thd->binlog_evt_union.do_union);
+
+  /* Start from a clean state before enabling the mode. */
+  thd->binlog_evt_union.first_query_id= query_id_param;
+  thd->binlog_evt_union.unioned_events_trans= FALSE;
+  thd->binlog_evt_union.unioned_events= FALSE;
+  thd->binlog_evt_union.do_union= TRUE;
+}
+
+/*
+  Turn off "union" mode in thd->binlog_evt_union. The mode must currently
+  be active (asserted).
+*/
+void MYSQL_BIN_LOG::stop_union_events(THD *thd)
+{
+  DBUG_ASSERT(thd->binlog_evt_union.do_union);
+
+  thd->binlog_evt_union.do_union= FALSE;
+}
+
+/*
+  Return true if "union" mode is active for thd and query_id_param is not
+  older than the query id the union was started with.
+*/
+bool MYSQL_BIN_LOG::is_query_in_union(THD *thd, query_id_t query_id_param)
+{
+  if (!thd->binlog_evt_union.do_union)
+    return false;
+  return query_id_param >= thd->binlog_evt_union.first_query_id;
+}
+
+/**
+  This function checks if a transactional table was updated by the
+  current transaction, by looking at whether the thread's binlog
+  trx-cache is non-empty.
+
+  @param thd The client thread that executed the current statement.
+  @return
+    @c true if a transactional table was updated, @c false otherwise
+    (including when the thread has no binlog cache manager).
+*/
+bool
+trans_has_updated_trans_table(const THD* thd)
+{
+  binlog_cache_mngr *const mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+  if (!mngr)
+    return 0;
+  return !mngr->trx_cache.empty();
+}
+
+/**
+  Check whether the current statement updated a transactional table,
+  by walking the engines registered in the statement transaction.
+
+  @param thd The client thread that executed the current statement.
+  @return
+    @c true if some registered engine is read-write for this statement
+    and does not carry the HTON_NO_ROLLBACK flag, @c false otherwise.
+*/
+bool
+stmt_has_updated_trans_table(const THD *thd)
+{
+  for (Ha_trx_info *engine= thd->transaction->stmt.ha_list;
+       engine;
+       engine= engine->next())
+  {
+    if (!engine->is_trx_read_write())
+      continue;                                 // Engine only read
+    if (engine->ht()->flags & HTON_NO_ROLLBACK)
+      continue;                                 // Engine cannot roll back
+    return (TRUE);
+  }
+  return (FALSE);
+}
+
+/**
+  Decide whether the trx-cache or the non-trx-cache should be used.
+
+  - If @c is_transactional is set, the trx-cache is always used.
+  - Otherwise, if the current statement is logged in row format or
+    @c binlog_direct_non_trans_update is enabled, the non-trx-cache is
+    used.
+  - Otherwise (statement format and the direct option is OFF), the
+    trx-cache is used if and only if it is not empty.
+
+  NOTE(review): in the last case the cache manager is dereferenced
+  without a NULL check, so callers are expected to have set it up
+  already - confirm against callers.
+
+  @param thd               The client thread.
+  @param is_transactional  The changes are related to a trx-table.
+  @return
+    @c true if a trx-cache should be used, @c false otherwise.
+*/
+bool use_trans_cache(const THD* thd, bool is_transactional)
+{
+  if (is_transactional)
+    return 1;
+
+  binlog_cache_mngr *const mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+  if (thd->is_current_stmt_binlog_format_row())
+    return 0;
+  if (thd->variables.binlog_direct_non_trans_update)
+    return 0;
+  return !mngr->trx_cache.empty();
+}
+
+/**
+  Check whether a transaction, either a multi-statement transaction or
+  a single-statement one, is about to end.
+
+  @param thd The client thread that executed the current statement.
+  @param all TRUE when committing a transaction, FALSE when committing
+             a statement.
+  @return
+    @c true if @c all is set or ending_single_stmt_trans() holds,
+    otherwise @c false.
+*/
+bool ending_trans(THD* thd, const bool all)
+{
+  if (all)
+    return true;
+  return ending_single_stmt_trans(thd, all);
+}
+
+/**
+  Check whether a single-statement transaction is about to end.
+
+  @param thd The client thread that executed the current statement.
+  @param all TRUE when committing a transaction, FALSE when committing
+             a statement.
+  @return
+    @c true if @c all is not set and the thread is not in
+    multi-statement transaction mode, otherwise @c false.
+*/
+bool ending_single_stmt_trans(THD* thd, const bool all)
+{
+  if (all)
+    return false;
+  return !thd->in_multi_stmt_transaction_mode();
+}
+
+/**
+  Check whether a non-transactional table was updated by the current
+  transaction, looking at both the transaction-level and the
+  statement-level modified_non_trans_table flags.
+
+  @param thd The client thread that executed the current statement.
+  @return
+    @c true if either flag is set, @c false otherwise.
+*/
+bool trans_has_updated_non_trans_table(const THD* thd)
+{
+  if (thd->transaction->all.modified_non_trans_table)
+    return true;
+  return thd->transaction->stmt.modified_non_trans_table;
+}
+
+/**
+ This function checks if a non-transactional table was updated by the
+ current statement. It simply reports the statement-level
+ modified_non_trans_table flag; the transaction-level flag is not
+ consulted here (see trans_has_updated_non_trans_table() for that).
+
+ @param thd The client thread that executed the current statement.
+ @return
+ @c true if a non-transactional table was updated, @c false otherwise.
+*/
+bool stmt_has_updated_non_trans_table(const THD* thd)
+{
+ return (thd->transaction->stmt.modified_non_trans_table);
+}
+
+/*
+ These functions are placed in this file since they need access to
+ binlog_hton, which has internal linkage.
+*/
+
+/**
+  Return the binlog cache manager of this connection, creating it on
+  first use.
+
+  On creation, zero-filled memory is allocated, the statement cache and
+  the transaction cache temporary files are opened, the pointer is
+  stored as the connection's ha_data for binlog_hton, and finally the
+  binlog_cache_mngr object is constructed in place.
+
+  @return
+    The cache manager, or NULL if the allocation or the opening of
+    either cache file failed. On failure nothing is left allocated:
+    a statement cache that was already opened is closed again before
+    the memory is released.
+*/
+binlog_cache_mngr *THD::binlog_setup_trx_data()
+{
+  DBUG_ENTER("THD::binlog_setup_trx_data");
+  binlog_cache_mngr *cache_mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
+
+  if (cache_mngr)
+    DBUG_RETURN(cache_mngr);                    // Already set up
+
+  cache_mngr= (binlog_cache_mngr*) my_malloc(key_memory_binlog_cache_mngr,
+                                  sizeof(binlog_cache_mngr), MYF(MY_ZEROFILL));
+  if (!cache_mngr)
+    DBUG_RETURN(0);                             // Didn't manage to set it up
+
+  if (open_cached_file(&cache_mngr->stmt_cache.cache_log, mysql_tmpdir,
+                       LOG_PREFIX, (size_t)binlog_stmt_cache_size, MYF(MY_WME)))
+  {
+    my_free(cache_mngr);
+    DBUG_RETURN(0);                             // Didn't manage to set it up
+  }
+
+  if (open_cached_file(&cache_mngr->trx_cache.cache_log, mysql_tmpdir,
+                       LOG_PREFIX, (size_t)binlog_cache_size, MYF(MY_WME)))
+  {
+    /*
+      The statement cache was opened successfully above; release its
+      resources before freeing the memory that holds its IO_CACHE.
+    */
+    close_cached_file(&cache_mngr->stmt_cache.cache_log);
+    my_free(cache_mngr);
+    DBUG_RETURN(0);                             // Didn't manage to set it up
+  }
+  thd_set_ha_data(this, binlog_hton, cache_mngr);
+
+  /* Construct the manager in place, on top of the opened caches. */
+  cache_mngr= new (cache_mngr)
+              binlog_cache_mngr(max_binlog_stmt_cache_size,
+                                max_binlog_cache_size,
+                                &binlog_stmt_cache_use,
+                                &binlog_stmt_cache_disk_use,
+                                &binlog_cache_use,
+                                &binlog_cache_disk_use);
+  DBUG_RETURN(cache_mngr);
+}
+
+
+/*
+ Two phase logged ALTER getter and setter methods.
+*/
+/**
+  Read the gtid_flags3 value kept in the connection's binlog cache
+  manager.
+
+  @return
+    The stored flags when the binary log is open (the cache manager is
+    set up on demand), 0 when the binary log is not open.
+*/
+uchar THD::get_binlog_flags_for_alter()
+{
+  if (!mysql_bin_log.is_open())
+    return 0;
+  return binlog_setup_trx_data()->gtid_flags3;
+}
+
+/**
+  Store a gtid_flags3 value in the connection's binlog cache manager.
+  Does nothing when the binary log is not open.
+
+  @param flags  The flags to store. When this is
+                Gtid_log_event::FL_START_ALTER_E1 the previously stored
+                value must be 0 (asserted in debug builds).
+*/
+void THD::set_binlog_flags_for_alter(uchar flags)
+{
+  if (!mysql_bin_log.is_open())
+    return;
+
+  // SA must find the flag set empty
+  DBUG_ASSERT(flags != Gtid_log_event::FL_START_ALTER_E1 ||
+              binlog_setup_trx_data()->gtid_flags3 == 0);
+
+  binlog_setup_trx_data()->gtid_flags3= flags;
+}
+
+/**
+  Read the sa_seq_no value kept in the connection's binlog cache
+  manager.
+
+  @return
+    The stored sequence number when the binary log is open (the cache
+    manager is set up on demand), 0 when the binary log is not open.
+*/
+uint64 THD::get_binlog_start_alter_seq_no()
+{
+  if (!mysql_bin_log.is_open())
+    return 0;
+  return binlog_setup_trx_data()->sa_seq_no;
+}
+
+/**
+  Store a sequence number as sa_seq_no in the connection's binlog cache
+  manager. Does nothing when the binary log is not open.
+
+  @param s_no  The sequence number to store.
+*/
+void THD::set_binlog_start_alter_seq_no(uint64 s_no)
+{
+  if (!mysql_bin_log.is_open())
+    return;
+  binlog_setup_trx_data()->sa_seq_no= s_no;
+}
+
+
+/*
+ Function to start a statement and optionally a transaction for the
+ binary log.
+
+ SYNOPSIS
+ binlog_start_trans_and_stmt()
+
+ DESCRIPTION
+
+ This function does three things:
+ - Start a transaction if not in autocommit mode or if a BEGIN
+ statement has been seen.
+
+ - Start a statement transaction to allow us to truncate the cache.
+
+ - Save the current binlog position so that we can roll back the
+ statement by truncating the cache.
+
+ We only update the saved position if the old one was undefined,
+ the reason is that there are some cases (e.g., for CREATE-SELECT)
+ where the position is saved twice (e.g., both in
+ select_create::prepare() and binlog_write_table_map()), but
+ we should use the first. This means that calls to this function
+ can be used to start the statement before the first table map
+ event, to include some extra events.
+
+ All work is skipped when a cache manager already exists and its
+ trx-cache has a defined previous position.
+
+ In WSREP builds, the function returns right after saving the
+ position when binlog emulation is active, and otherwise may append
+ a GTID event directly to the wsrep client state before the binlog
+ handlerton is registered.
+ */
+
+void
+THD::binlog_start_trans_and_stmt()
+{
+ binlog_cache_mngr *cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
+ DBUG_ENTER("binlog_start_trans_and_stmt");
+ DBUG_PRINT("enter", ("cache_mngr: %p cache_mngr->trx_cache.get_prev_position(): %lu",
+ cache_mngr,
+ (cache_mngr ? (ulong) cache_mngr->trx_cache.get_prev_position() :
+ (ulong) 0)));
+
+ /* Only act when no statement start position has been recorded yet. */
+ if (cache_mngr == NULL ||
+ cache_mngr->trx_cache.get_prev_position() == MY_OFF_T_UNDEF)
+ {
+ /* Records the current cache position (see binlog_set_stmt_begin()). */
+ this->binlog_set_stmt_begin();
+ bool mstmt_mode= in_multi_stmt_transaction_mode();
+#ifdef WITH_WSREP
+ /*
+ With wsrep binlog emulation we can skip the rest because the
+ binlog cache will not be written into binlog. Note however that
+ because of this the hton callbacks will not get called to clean
+ up the cache, so this must be done explicitly when the transaction
+ terminates.
+ */
+ if (WSREP_EMULATE_BINLOG_NNULL(this))
+ {
+ DBUG_VOID_RETURN;
+ }
+ /* If this event replicates through a master-slave then we need to
+ inject manually GTID so it is preserved in the cluster. We are writing
+ directly to WSREP buffer and not in IO cache because in case of IO cache
+ GTID event will be duplicated in binlog.
+ We have to do this only one time in mysql transaction.
+ Since this function is called multiple times , We will check for
+ ha_info->is_started().
+ */
+ Ha_trx_info *ha_info;
+ /*
+ ha_info[0] is the statement-level entry (it is the one marked
+ read/write at the end of this function); in multi-statement mode the
+ second entry is inspected instead - presumably the entry of the
+ normal transaction, TODO confirm.
+ */
+ ha_info= this->ha_data[binlog_hton->slot].ha_info + (mstmt_mode ? 1 : 0);
+
+ if (!ha_info->is_started() &&
+ (this->variables.gtid_seq_no || this->variables.wsrep_gtid_seq_no) &&
+ wsrep_on(this) &&
+ (this->wsrep_cs().mode() == wsrep::client_state::m_local))
+ {
+ uchar *buf= 0;
+ size_t len= 0;
+ IO_CACHE tmp_io_cache;
+ Log_event_writer writer(&tmp_io_cache, 0);
+ /* open_cached_file() returns zero on success. */
+ if(!open_cached_file(&tmp_io_cache, mysql_tmpdir, TEMP_PREFIX,
+ 128, MYF(MY_WME)))
+ {
+ uint64 seqno= this->variables.gtid_seq_no;
+ uint32 domain_id= this->variables.gtid_domain_id;
+ uint32 server_id= this->variables.server_id;
+ /* Fall back to the wsrep GTID when no explicit gtid_seq_no is set. */
+ if (!this->variables.gtid_seq_no && this->variables.wsrep_gtid_seq_no)
+ {
+ seqno= this->variables.wsrep_gtid_seq_no;
+ domain_id= wsrep_gtid_server.domain_id;
+ server_id= wsrep_gtid_server.server_id;
+ }
+ Gtid_log_event gtid_event(this, seqno, domain_id, true,
+ LOG_EVENT_SUPPRESS_USE_F, true, 0);
+ // Replicated events in writeset doesn't have checksum
+ gtid_event.checksum_alg= BINLOG_CHECKSUM_ALG_OFF;
+ gtid_event.server_id= server_id;
+ /* NOTE(review): the result of writer.write() is not checked here. */
+ writer.write(&gtid_event);
+ wsrep_write_cache_buf(&tmp_io_cache, &buf, &len);
+ if (len > 0) this->wsrep_cs().append_data(wsrep::const_buffer(buf, len));
+ if (buf) my_free(buf);
+ close_cached_file(&tmp_io_cache);
+ }
+ }
+#endif
+ /* Register for the normal transaction only in multi-statement mode. */
+ if (mstmt_mode)
+ trans_register_ha(this, TRUE, binlog_hton, 0);
+ trans_register_ha(this, FALSE, binlog_hton, 0);
+ /*
+ Mark statement transaction as read/write. We never start
+ a binary log transaction and keep it read-only,
+ therefore it's best to mark the transaction read/write just
+ at the same time we start it.
+ Not necessary to mark the normal transaction read/write
+ since the statement-level flag will be propagated automatically
+ inside ha_commit_trans.
+ */
+ ha_data[binlog_hton->slot].ha_info[0].set_trx_read_write();
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+  Record the current binlog cache position as the start of the
+  statement, by storing it as the "previous position" of the trx-cache.
+*/
+void THD::binlog_set_stmt_begin() {
+  /*
+    The call to binlog_trans_log_savepos() might create the cache_mngr
+    structure, if it didn't exist before, so we save the position
+    into an auto variable and only look the cache_mngr up afterwards,
+    to write the position into the transaction data for the binary log.
+  */
+  my_off_t pos= 0;
+  binlog_trans_log_savepos(this, &pos);
+
+  binlog_cache_mngr *const cache_mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
+  cache_mngr->trx_cache.set_prev_position(pos);
+}
+
+/**
+  Binlog handlerton callback for starting a consistent snapshot.
+
+  Copies the binlog's last commit position (file name and offset) into
+  the connection's cache manager and registers the binlog handlerton
+  for the normal transaction.
+
+  @param hton  The handlerton (not used by this function).
+  @param thd   The connection starting the snapshot.
+  @return
+    0 on success, 1 if the connection's binlog cache manager could not
+    be set up.
+*/
+static int
+binlog_start_consistent_snapshot(handlerton *hton, THD *thd)
+{
+  DBUG_ENTER("binlog_start_consistent_snapshot");
+
+  binlog_cache_mngr *const cache_mngr= thd->binlog_setup_trx_data();
+
+  /* Server layer calls us with LOCK_commit_ordered locked, so this is safe. */
+  mysql_mutex_assert_owner(&LOCK_commit_ordered);
+
+  /* binlog_setup_trx_data() returns NULL when it cannot set things up. */
+  if (!cache_mngr)
+    DBUG_RETURN(1);
+
+  strmake_buf(cache_mngr->last_commit_pos_file, mysql_bin_log.last_commit_pos_file);
+  cache_mngr->last_commit_pos_offset= mysql_bin_log.last_commit_pos_offset;
+
+  trans_register_ha(thd, TRUE, binlog_hton, 0);
+
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Prepare for row logging every open table that belongs to the current
+  query and is write-locked.
+
+  Annotate events and table maps are written by binlog_write_table_maps()
+*/
+
+void THD::binlog_prepare_for_row_logging()
+{
+  DBUG_ENTER("THD::binlog_prepare_for_row_logging");
+  TABLE *tbl= open_tables;
+  while (tbl)
+  {
+    const bool written_by_this_query= tbl->query_id == query_id &&
+                                      tbl->current_lock == F_WRLCK;
+    if (written_by_this_query)
+      tbl->file->prepare_for_row_logging();
+    tbl= tbl->next;
+  }
+  DBUG_VOID_RETURN;
+}
+
+/**
+  Write an annotated row event (the query) if needed.
+
+  The event is written only when binlog_annotate_row_events is enabled,
+  the query text is not empty and, in WSREP builds,
+  wsrep_fragments_certified_for_stmt() does not hold for this thread.
+
+  @param writer  The writer the Annotate_rows event is written through.
+  @return
+    0 if nothing had to be written; otherwise the result of
+    writer->write(), converted to bool.
+*/
+
+bool THD::binlog_write_annotated_row(Log_event_writer *writer)
+{
+  DBUG_ENTER("THD::binlog_write_annotated_row");
+
+  const bool annotate_wanted=
+    IF_WSREP(!wsrep_fragments_certified_for_stmt(this), true) &&
+    variables.binlog_annotate_row_events &&
+    query_length();
+  if (!annotate_wanted)
+    DBUG_RETURN(0);
+
+  Annotate_rows_log_event anno(this, 0, false);
+  DBUG_RETURN(writer->write(&anno));
+}
+
+
+/**
+ Write table map events for all tables that are using row logging.
+ This includes all tables used by this statement, including tables
+ used in triggers.
+
+ Also write annotate events and start transactions.
+
+ The tables are found by walking the tables of the thread's extra_lock
+ and lock (when set).
+ NOTE(review): earlier documentation referred to a
+ "tables_with_row_logging" list prepared by
+ THD::binlog_prepare_for_row_logging; no such list is used in this
+ function.
+
+ @return
+ 1 if writing a table map failed, 0 otherwise. binlog_table_maps is
+ set to 1 only on success.
+*/
+
+bool THD::binlog_write_table_maps()
+{
+ bool with_annotate;
+ /* At most two lock sets are visited: extra_lock and lock. */
+ MYSQL_LOCK *locks[2], **locks_end= locks;
+ DBUG_ENTER("THD::binlog_write_table_maps");
+
+ DBUG_ASSERT(!binlog_table_maps);
+ DBUG_ASSERT(is_current_stmt_binlog_format_row());
+
+ /* Initialize cache_mngr once per statement */
+ binlog_start_trans_and_stmt();
+ with_annotate= 1; // Write annotate with first map
+
+ /* Collect the non-NULL lock sets into locks[0 .. locks_end). */
+ if ((*locks_end= extra_lock))
+ locks_end++;
+ if ((*locks_end= lock))
+ locks_end++;
+
+ for (MYSQL_LOCK **cur_lock= locks ; cur_lock < locks_end ; cur_lock++)
+ {
+ TABLE **const end_ptr= (*cur_lock)->table + (*cur_lock)->table_count;
+ for (TABLE **table_ptr= (*cur_lock)->table;
+ table_ptr != end_ptr ;
+ ++table_ptr)
+ {
+ TABLE *table= *table_ptr;
+ /* Set when row logging was switched on here only temporarily. */
+ bool restore= 0;
+ /*
+ We have to also write table maps for tables that have not yet been
+ used, like for tables in after triggers
+ */
+ if (!table->file->row_logging &&
+ table->query_id != query_id && table->current_lock == F_WRLCK)
+ {
+ if (table->file->prepare_for_row_logging())
+ restore= 1;
+ }
+ if (table->file->row_logging)
+ {
+ if (binlog_write_table_map(table, with_annotate))
+ DBUG_RETURN(1);
+ /* Only the first written table map is preceded by an annotate event. */
+ with_annotate= 0;
+ }
+ if (restore)
+ {
+ /*
+ Restore original setting so that it doesn't cause problem for the
+ next statement
+ */
+ table->file->row_logging= table->file->row_logging_init= 0;
+ }
+ }
+ }
+ binlog_table_maps= 1; // Table maps written
+ DBUG_RETURN(0);
+}
+
+
+/**
+ This function writes a table map to the binary log.
+
+ The event goes to the transactional or the non-transactional cache of
+ the connection, depending on table->file->row_logging_has_trans; the
+ transactional cache is forced when OPTION_GTID_BEGIN is set.
+
+ If an error occurs while writing events and rollback is not possible, e.g.
+ due to the statement modifying a non-transactional table, an incident event
+ is logged.
+
+ The connection's binlog cache manager is expected to exist already; it
+ is dereferenced without a NULL check.
+
+ @param table a pointer to the table.
+ @param with_annotate @c true to write an annotate event before writing
+ the table_map event, @c false otherwise.
+ @return
+ nonzero if an error pops up when writing the table map event.
+*/
+
+bool THD::binlog_write_table_map(TABLE *table, bool with_annotate)
+{
+ /* Stays 1 for failures that happen before the table map write itself. */
+ int error= 1;
+ bool is_transactional= table->file->row_logging_has_trans;
+ DBUG_ENTER("THD::binlog_write_table_map");
+ DBUG_PRINT("enter", ("table: %p (%s: #%lu)",
+ table, table->s->table_name.str,
+ table->s->table_map_id));
+
+ /* Pre-conditions */
+ DBUG_ASSERT(table->s->table_map_id != ULONG_MAX);
+
+ /* Ensure that all events in a GTID group are in the same cache */
+ if (variables.option_bits & OPTION_GTID_BEGIN)
+ is_transactional= 1;
+
+ Table_map_log_event
+ the_event(this, table, table->s->table_map_id, is_transactional);
+
+ binlog_cache_mngr *const cache_mngr=
+ (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
+ binlog_cache_data *cache_data= (cache_mngr->
+ get_binlog_cache_data(is_transactional));
+ IO_CACHE *file= &cache_data->cache_log;
+ Log_event_writer writer(file, cache_data);
+
+ if (with_annotate)
+ if (binlog_write_annotated_row(&writer))
+ goto write_err;
+
+ /* Debug-only fault injection: simulate a failed write (EFBIG). */
+ DBUG_EXECUTE_IF("table_map_write_error",
+ {
+ if (is_transactional)
+ {
+ my_errno= EFBIG;
+ goto write_err;
+ }
+ });
+
+ if (unlikely((error= writer.write(&the_event))))
+ goto write_err;
+
+ DBUG_RETURN(0);
+
+write_err:
+ mysql_bin_log.set_write_error(this, is_transactional);
+ /*
+ For non-transactional engine or multi statement transaction with mixed
+ engines, data is written to table but writing to binary log failed. In
+ these scenarios rollback is not possible. Hence report an incident.
+ */
+ if (mysql_bin_log.check_write_error(this) && cache_data &&
+ lex->stmt_accessed_table(LEX::STMT_WRITES_NON_TRANS_TABLE) &&
+ table->current_lock == F_WRLCK)
+ cache_data->set_incident();
+ DBUG_RETURN(error);
+}
+
+
+/**
+  Retrieve the pending row event from one of the connection's binlog
+  caches. The cache is chosen by use_trans_cache(this, is_transactional):
+  the transactional cache when that returns @c true, the
+  non-transactional cache otherwise.
+
+  @param is_transactional  @c true indicates a transactional cache,
+                           otherwise @c false a non-transactional.
+  @return
+    The row event if any; NULL when the connection has no binlog cache
+    manager.
+*/
+Rows_log_event*
+THD::binlog_get_pending_rows_event(bool is_transactional) const
+{
+  binlog_cache_mngr *const mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
+
+  /*
+    Without a cache manager no pending event can have been stored
+    (binlog_set_pending_rows_event() sets the manager up), so there is
+    nothing to return.
+  */
+  if (!mngr)
+    return NULL;
+
+  const bool from_trx_cache= use_trans_cache(this, is_transactional);
+  return mngr->get_binlog_cache_data(from_trx_cache)->pending();
+}
+
+/**
+  Store a pending row event into one of the connection's binlog caches,
+  setting up the cache manager first if necessary. The cache is chosen
+  by use_trans_cache(this, is_transactional): the transactional cache
+  when that returns @c true, the non-transactional cache otherwise.
+
+  @param ev                a pointer to the row event.
+  @param is_transactional  @c true indicates a transactional cache,
+                           otherwise @c false a non-transactional.
+*/
+void
+THD::binlog_set_pending_rows_event(Rows_log_event* ev, bool is_transactional)
+{
+  binlog_cache_mngr *const cache_mngr= binlog_setup_trx_data();
+  DBUG_ASSERT(cache_mngr);
+
+  const bool into_trx_cache= use_trans_cache(this, is_transactional);
+  cache_mngr->get_binlog_cache_data(into_trx_cache)->set_pending(ev);
+}
+
+
+/**
+  Remove the pending rows event, discarding any outstanding rows. If
+  there is no pending rows event, this is effectively a no-op.
+
+  The connection must already have a binlog cache manager (asserted).
+
+  @param thd               a pointer to the user thread.
+  @param is_transactional  @c true indicates a transactional cache,
+                           otherwise @c false a non-transactional.
+  @return
+    Always 0.
+*/
+int
+MYSQL_BIN_LOG::remove_pending_rows_event(THD *thd, bool is_transactional)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::remove_pending_rows_event");
+
+  binlog_cache_mngr *const cache_mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+  DBUG_ASSERT(cache_mngr);
+
+  const bool in_trx_cache= use_trans_cache(thd, is_transactional);
+  binlog_cache_data *const target= cache_mngr->get_binlog_cache_data(in_trx_cache);
+
+  Rows_log_event *const stale= target->pending();
+  if (stale)
+  {
+    delete stale;
+    target->set_pending(NULL);
+  }
+
+  DBUG_RETURN(0);
+}
+
+/*
+ Moves the last bunch of rows from the pending Rows event to a cache (either
+ the transactional cache if is_transactional is @c true, or the
+ non-transactional cache otherwise; the final choice is made by
+ use_trans_cache()). Sets a new pending event.
+
+ The old pending event, if any, is deleted whether or not writing it
+ succeeded. On a write failure the pending slot is cleared and the new
+ event is NOT installed.
+
+ @param thd a pointer to the user thread.
+ @param event a pointer to the row event that becomes the new pending
+ event.
+ @param is_transactional @c true indicates a transactional cache,
+ otherwise @c false a non-transactional.
+ @return
+ 0 on success, 1 if writing the old pending event to the cache failed.
+*/
+int
+MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
+ Rows_log_event* event,
+ bool is_transactional)
+{
+ DBUG_ENTER("MYSQL_BIN_LOG::flush_and_set_pending_rows_event(event)");
+ DBUG_ASSERT(WSREP_EMULATE_BINLOG(thd) || mysql_bin_log.is_open());
+ DBUG_PRINT("enter", ("event: %p", event));
+
+ binlog_cache_mngr *const cache_mngr=
+ (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+ DBUG_ASSERT(cache_mngr);
+
+ binlog_cache_data *cache_data=
+ cache_mngr->get_binlog_cache_data(use_trans_cache(thd, is_transactional));
+
+ DBUG_PRINT("info", ("cache_mngr->pending(): %p", cache_data->pending()));
+
+ if (Rows_log_event* pending= cache_data->pending())
+ {
+ Log_event_writer writer(&cache_data->cache_log, cache_data);
+
+ /*
+ Write pending event to the cache.
+ */
+ /* Debug-only: turn on a simulated file write error for this write. */
+ DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
+ {DBUG_SET("+d,simulate_file_write_error");});
+ if (writer.write(pending))
+ {
+ set_write_error(thd, is_transactional);
+ /*
+ Flag an incident when the statement has already updated a
+ non-transactional table.
+ */
+ if (check_write_error(thd) && cache_data &&
+ stmt_has_updated_non_trans_table(thd))
+ cache_data->set_incident();
+ delete pending;
+ cache_data->set_pending(NULL);
+ /* Debug-only: switch the simulated write error off again. */
+ DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
+ {DBUG_SET("-d,simulate_file_write_error");});
+ DBUG_RETURN(1);
+ }
+
+ delete pending;
+ }
+
+ /* Install the new event (may be NULL) as the pending one. */
+ thd->binlog_set_pending_rows_event(event, is_transactional);
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Generate a new global transaction ID, and write it to the binlog.
+
+ The sequence number is taken from the session variable gtid_seq_no when
+ it is non-zero (the variable is reset to 0 in any case); otherwise the
+ next sequence number is allocated from the global binlog state. The
+ resulting GTID is recorded as the thread's last commit GTID.
+
+ Must be called on mysql_bin_log (asserted).
+
+ Parameters standalone, is_transactional, commit_id, has_xid and
+ is_ro_1pc are passed on unchanged to the Gtid_log_event constructor.
+
+ Returns true on error (updating the binlog state or writing the event
+ failed), false on success.
+*/
+
+bool
+MYSQL_BIN_LOG::write_gtid_event(THD *thd, bool standalone,
+ bool is_transactional, uint64 commit_id,
+ bool has_xid, bool is_ro_1pc)
+{
+ rpl_gtid gtid;
+ uint32 domain_id;
+ uint32 local_server_id;
+ uint64 seq_no;
+ int err;
+ DBUG_ENTER("write_gtid_event");
+ DBUG_PRINT("enter", ("standalone: %d", standalone));
+
+ seq_no= thd->variables.gtid_seq_no;
+ domain_id= thd->variables.gtid_domain_id;
+ local_server_id= thd->variables.server_id;
+
+ DBUG_ASSERT(local_server_id != 0);
+
+ if (thd->variables.option_bits & OPTION_GTID_BEGIN)
+ {
+ DBUG_PRINT("error", ("OPTION_GTID_BEGIN is set. "
+ "Master and slave will have different GTID values"));
+ /* Reset the flag, as we will write out a GTID anyway */
+ thd->variables.option_bits&= ~OPTION_GTID_BEGIN;
+ }
+
+ /*
+ Reset the session variable gtid_seq_no, to reduce the risk of accidentally
+ producing a duplicate GTID.
+ */
+ thd->variables.gtid_seq_no= 0;
+ if (seq_no != 0)
+ {
+ /* Use the specified sequence number. */
+ gtid.domain_id= domain_id;
+ gtid.server_id= local_server_id;
+ gtid.seq_no= seq_no;
+ err= rpl_global_gtid_binlog_state.update(&gtid, opt_gtid_strict_mode);
+ /*
+ NOTE(review): a server error code is stored in the C errno here;
+ presumably a caller inspects errno to recognise the strict-mode
+ failure - confirm against callers.
+ */
+ if (err && thd->get_stmt_da()->sql_errno()==ER_GTID_STRICT_OUT_OF_ORDER)
+ errno= ER_GTID_STRICT_OUT_OF_ORDER;
+ }
+ else
+ {
+ /* Allocate the next sequence number for the GTID. */
+ err= rpl_global_gtid_binlog_state.update_with_next_gtid(domain_id,
+ local_server_id, &gtid);
+ seq_no= gtid.seq_no;
+ }
+ if (err)
+ DBUG_RETURN(true);
+
+ thd->set_last_commit_gtid(gtid);
+ /* For a "start alter" event, remember the sequence number just used. */
+ if (thd->get_binlog_flags_for_alter() & Gtid_log_event::FL_START_ALTER_E1)
+ thd->set_binlog_start_alter_seq_no(gtid.seq_no);
+
+ Gtid_log_event gtid_event(thd, seq_no, domain_id, standalone,
+ LOG_EVENT_SUPPRESS_USE_F, is_transactional,
+ commit_id, has_xid, is_ro_1pc);
+
+ /* Write the event to the binary log. */
+ DBUG_ASSERT(this == &mysql_bin_log);
+
+#ifdef WITH_WSREP
+ /*
+ In wsrep GTID mode the session's domain id and server id are reset to
+ the global values once the event object has been built.
+ */
+ if (wsrep_gtid_mode)
+ {
+ thd->variables.gtid_domain_id= global_system_variables.gtid_domain_id;
+ thd->variables.server_id= global_system_variables.server_id;
+ }
+#endif
+
+ if (write_event(&gtid_event))
+ DBUG_RETURN(true);
+ status_var_add(thd->status_var.binlog_bytes_written, gtid_event.data_written);
+
+ DBUG_RETURN(false);
+}
+
+
+/*
+ Write the global GTID binlog state to the ".state" file.
+
+ The file name is built from opt_bin_logname in mysql_data_home with the
+ extension ".state". The file is created or truncated, the state is
+ written through a write IO_CACHE, and the file is synced before it is
+ closed.
+
+ Returns:
+ 0 for success.
+ non-zero if opening, writing, flushing or syncing failed; an error
+ message is then written to the error log.
+*/
+int
+MYSQL_BIN_LOG::write_state_to_file()
+{
+ File file_no;
+ IO_CACHE cache;
+ char buf[FN_REFLEN];
+ int err;
+ /* Track what has to be released on the way out. */
+ bool opened= false;
+ bool log_inited= false;
+
+ fn_format(buf, opt_bin_logname, mysql_data_home, ".state",
+ MY_UNPACK_FILENAME);
+ if ((file_no= mysql_file_open(key_file_binlog_state, buf,
+ O_RDWR|O_CREAT|O_TRUNC|O_BINARY,
+ MYF(MY_WME))) < 0)
+ {
+ err= 1;
+ goto err;
+ }
+ opened= true;
+ if ((err= init_io_cache(&cache, file_no, IO_SIZE, WRITE_CACHE, 0, 0,
+ MYF(MY_WME|MY_WAIT_IF_FULL))))
+ goto err;
+ log_inited= true;
+ if ((err= rpl_global_gtid_binlog_state.write_to_iocache(&cache)))
+ goto err;
+ /*
+ Cleared before end_io_cache() so that the error path below does not
+ end the cache a second time if this call fails.
+ */
+ log_inited= false;
+ if ((err= end_io_cache(&cache)))
+ goto err;
+ if ((err= mysql_file_sync(file_no, MYF(MY_WME))))
+ goto err;
+ goto end;
+
+err:
+ sql_print_error("Error writing binlog state to file '%s'.", buf);
+ if (log_inited)
+ end_io_cache(&cache);
+end:
+ if (opened)
+ mysql_file_close(file_no, MYF(0));
+
+ return err;
+}
+
+
+/*
+ Initialize the binlog state from the master-bin.state file, at server startup.
+
+ The file name is built from opt_bin_logname in mysql_data_home with the
+ extension ".state". When the file does not exist, the global binlog
+ state is reset to empty.
+
+ Returns:
+ 0 for success.
+ 2 for when .state file did not exist.
+ 1 for other error opening the file; a non-zero value is also
+ returned when the IO_CACHE could not be initialised or the state could
+ not be read. These errors are written to the error log.
+*/
+int
+MYSQL_BIN_LOG::read_state_from_file()
+{
+ File file_no;
+ IO_CACHE cache;
+ char buf[FN_REFLEN];
+ int err;
+ /* Track what has to be released on the way out. */
+ bool opened= false;
+ bool log_inited= false;
+
+ fn_format(buf, opt_bin_logname, mysql_data_home, ".state",
+ MY_UNPACK_FILENAME);
+ /* MYF(0): a missing file is expected and must not produce a message. */
+ if ((file_no= mysql_file_open(key_file_binlog_state, buf,
+ O_RDONLY|O_BINARY, MYF(0))) < 0)
+ {
+ if (my_errno != ENOENT)
+ {
+ err= 1;
+ goto err;
+ }
+ else
+ {
+ /*
+ If the state file does not exist, this is the first server startup
+ with GTID enabled. So initialize to empty state.
+ */
+ rpl_global_gtid_binlog_state.reset();
+ err= 2;
+ goto end;
+ }
+ }
+ opened= true;
+ if ((err= init_io_cache(&cache, file_no, IO_SIZE, READ_CACHE, 0, 0,
+ MYF(MY_WME|MY_WAIT_IF_FULL))))
+ goto err;
+ log_inited= true;
+ if ((err= rpl_global_gtid_binlog_state.read_from_iocache(&cache)))
+ goto err;
+ goto end;
+
+err:
+ sql_print_error("Error reading binlog GTID state from file '%s'.", buf);
+end:
+ if (log_inited)
+ end_io_cache(&cache);
+ if (opened)
+ mysql_file_close(file_no, MYF(0));
+
+ return err;
+}
+
+
+/**
+  Forward to rpl_global_gtid_binlog_state.get_most_recent_gtid_list().
+
+  @param list  Passed through to the binlog state.
+  @param size  Passed through to the binlog state.
+  @return
+    Whatever the binlog state call returns.
+*/
+int
+MYSQL_BIN_LOG::get_most_recent_gtid_list(rpl_gtid **list, uint32 *size)
+{
+  const int rc=
+    rpl_global_gtid_binlog_state.get_most_recent_gtid_list(list, size);
+  return rc;
+}
+
+
+/**
+  Forward to rpl_global_gtid_binlog_state.append_pos().
+
+  @param str  The string the binlog state appends to.
+  @return
+    Whatever the binlog state call returns.
+*/
+bool
+MYSQL_BIN_LOG::append_state_pos(String *str)
+{
+  const bool rc= rpl_global_gtid_binlog_state.append_pos(str);
+  return rc;
+}
+
+
+/**
+  Forward to rpl_global_gtid_binlog_state.append_state().
+
+  @param str  The string the binlog state appends to.
+  @return
+    Whatever the binlog state call returns.
+*/
+bool
+MYSQL_BIN_LOG::append_state(String *str)
+{
+  const bool rc= rpl_global_gtid_binlog_state.append_state(str);
+  return rc;
+}
+
+
+/**
+  Tell whether the global GTID binlog state is empty.
+
+  @return
+    @c true if rpl_global_gtid_binlog_state.count() is zero,
+    @c false otherwise.
+*/
+bool
+MYSQL_BIN_LOG::is_empty_state()
+{
+  if (rpl_global_gtid_binlog_state.count() == 0)
+    return true;
+  return false;
+}
+
+
+/**
+  Look up a GTID in the global binlog state by domain id and server id.
+
+  @param domain_id       The domain to look in.
+  @param server_id_arg   The server id to look for.
+  @param[out] out_gtid   Receives a copy of the GTID when one is found;
+                         left untouched otherwise.
+  @return
+    @c true if a GTID was found, @c false otherwise.
+*/
+bool
+MYSQL_BIN_LOG::find_in_binlog_state(uint32 domain_id, uint32 server_id_arg,
+                                    rpl_gtid *out_gtid)
+{
+  rpl_gtid *const hit=
+    rpl_global_gtid_binlog_state.find(domain_id, server_id_arg);
+  if (!hit)
+    return false;
+  *out_gtid= *hit;
+  return true;
+}
+
+
+/**
+  Look up the most recent GTID of a domain in the global binlog state.
+
+  @param domain_id       The domain to look in.
+  @param[out] out_gtid   Receives a copy of the GTID when one is found;
+                         left untouched otherwise.
+  @return
+    @c true if a GTID was found, @c false otherwise.
+*/
+bool
+MYSQL_BIN_LOG::lookup_domain_in_binlog_state(uint32 domain_id,
+                                             rpl_gtid *out_gtid)
+{
+  rpl_gtid *const newest=
+    rpl_global_gtid_binlog_state.find_most_recent(domain_id);
+
+  if (newest)
+    *out_gtid= *newest;
+  return newest != NULL;
+}
+
+
+/**
+  Forward to rpl_global_gtid_binlog_state.bump_seq_no_if_needed().
+
+  @param domain_id  Passed through to the binlog state.
+  @param seq_no     Passed through to the binlog state.
+  @return
+    Whatever the binlog state call returns.
+*/
+int
+MYSQL_BIN_LOG::bump_seq_no_counter_if_needed(uint32 domain_id, uint64 seq_no)
+{
+  const int rc=
+    rpl_global_gtid_binlog_state.bump_seq_no_if_needed(domain_id, seq_no);
+  return rc;
+}
+
+
+/**
+  Forward to rpl_global_gtid_binlog_state.check_strict_sequence().
+
+  @param domain_id      Passed through to the binlog state.
+  @param server_id_arg  Passed through to the binlog state.
+  @param seq_no         Passed through to the binlog state.
+  @param no_error       Passed through to the binlog state.
+  @return
+    Whatever the binlog state call returns.
+*/
+bool
+MYSQL_BIN_LOG::check_strict_gtid_sequence(uint32 domain_id,
+                                          uint32 server_id_arg,
+                                          uint64 seq_no,
+                                          bool no_error)
+{
+  const bool rc=
+    rpl_global_gtid_binlog_state.check_strict_sequence(domain_id,
+                                                       server_id_arg,
+                                                       seq_no,
+                                                       no_error);
+  return rc;
+}
+
+
+/**
+ Write an event to the binary log. If with_annotate != NULL and
+ *with_annotate = TRUE write also Annotate_rows before the event
+ (this should happen only if the event is a Table_map).
+*/
+
+bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
+{
+ THD *thd= event_info->thd;
+ bool error= 1;
+ binlog_cache_data *cache_data= 0;
+ bool is_trans_cache= FALSE;
+ bool using_trans= event_info->use_trans_cache();
+ bool direct= event_info->use_direct_logging();
+ ulong UNINIT_VAR(prev_binlog_id);
+ DBUG_ENTER("MYSQL_BIN_LOG::write(Log_event *)");
+
+ /*
+ When binary logging is not enabled (--log-bin=0), wsrep-patch partially
+ enables it without opening the binlog file (MYSQL_BIN_LOG::open().
+ So, avoid writing to binlog file.
+ */
+ if (direct &&
+ (wsrep_emulate_bin_log ||
+ (WSREP(thd) && !(thd->variables.option_bits & OPTION_BIN_LOG))))
+ DBUG_RETURN(0);
+
+ if (thd->variables.option_bits &
+ (OPTION_GTID_BEGIN | OPTION_BIN_COMMIT_OFF))
+ {
+ DBUG_PRINT("info", ("OPTION_GTID_BEGIN was set"));
+ /* Wait for commit from binary log before we commit */
+ direct= 0;
+ using_trans= 1;
+ /* Set cache_type to ensure we don't get checksums for this event */
+ event_info->cache_type= Log_event::EVENT_TRANSACTIONAL_CACHE;
+ }
+
+ if (thd->binlog_evt_union.do_union)
+ {
+ /*
+ In Stored function; Remember that function call caused an update.
+ We will log the function call to the binary log on function exit
+ */
+ thd->binlog_evt_union.unioned_events= TRUE;
+ thd->binlog_evt_union.unioned_events_trans |= using_trans;
+ DBUG_RETURN(0);
+ }
+
+ /*
+ We only end the statement if we are in a top-level statement. If
+ we are inside a stored function, we do not end the statement since
+ this will close all tables on the slave. But there can be a special case
+ where we are inside a stored function/trigger and a SAVEPOINT is being
+ set in side the stored function/trigger. This SAVEPOINT execution will
+ force the pending event to be flushed without an STMT_END_F flag. This
+ will result in a case where following DMLs will be considered as part of
+ same statement and result in data loss on slave. Hence in this case we
+ force the end_stmt to be true.
+ */
+ bool const end_stmt= (thd->in_sub_stmt && thd->lex->sql_command ==
+ SQLCOM_SAVEPOINT) ? true :
+ (thd->locked_tables_mode && thd->lex->requires_prelocking());
+ if (thd->binlog_flush_pending_rows_event(end_stmt, using_trans))
+ DBUG_RETURN(error);
+
+ /*
+ In most cases this is only called if 'is_open()' is true; in fact this is
+ mostly called if is_open() *was* true a few instructions before, but it
+ could have changed since.
+ */
+ /* applier and replayer can skip writing binlog events */
+ if ((WSREP_EMULATE_BINLOG(thd) &&
+ IF_WSREP(thd->wsrep_cs().mode() == wsrep::client_state::m_local, 0)) || is_open())
+ {
+ my_off_t UNINIT_VAR(my_org_b_tell);
+#ifdef HAVE_REPLICATION
+ /*
+ In the future we need to add to the following if tests like
+ "do the involved tables match (to be implemented)
+ binlog_[wild_]{do|ignore}_table?" (WL#1049)"
+ */
+ const char *local_db= event_info->get_db();
+
+ bool option_bin_log_flag= (thd->variables.option_bits & OPTION_BIN_LOG);
+
+ /*
+ Log all updates to binlog cache so that they can get replicated to other
+ nodes. A check has been added to stop them from getting logged into
+ binary log files.
+ */
+ if (WSREP(thd))
+ option_bin_log_flag= true;
+
+ if ((!(option_bin_log_flag)) ||
+ (thd->lex->sql_command != SQLCOM_ROLLBACK_TO_SAVEPOINT &&
+ thd->lex->sql_command != SQLCOM_SAVEPOINT &&
+ !binlog_filter->db_ok(local_db)))
+ DBUG_RETURN(0);
+#endif /* HAVE_REPLICATION */
+
+ IO_CACHE *file= NULL;
+
+ if (direct)
+ {
+ /* We come here only for incident events */
+ int res;
+ uint64 commit_id= 0;
+ MDL_request mdl_request;
+ DBUG_PRINT("info", ("direct is set"));
+ DBUG_ASSERT(!thd->backup_commit_lock);
+
+ MDL_REQUEST_INIT(&mdl_request, MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT,
+ MDL_EXPLICIT);
+ if (thd->mdl_context.acquire_lock(&mdl_request,
+ thd->variables.lock_wait_timeout))
+ DBUG_RETURN(1);
+ thd->backup_commit_lock= &mdl_request;
+
+ if ((res= thd->wait_for_prior_commit()))
+ {
+ if (mdl_request.ticket)
+ thd->mdl_context.release_lock(mdl_request.ticket);
+ thd->backup_commit_lock= 0;
+ DBUG_RETURN(res);
+ }
+ file= &log_file;
+ my_org_b_tell= my_b_tell(file);
+ mysql_mutex_lock(&LOCK_log);
+ prev_binlog_id= current_binlog_id;
+ DBUG_EXECUTE_IF("binlog_force_commit_id",
+ {
+ const LEX_CSTRING commit_name= { STRING_WITH_LEN("commit_id") };
+ bool null_value;
+ user_var_entry *entry=
+ (user_var_entry*) my_hash_search(&thd->user_vars,
+ (uchar*) commit_name.str,
+ commit_name.length);
+ commit_id= entry->val_int(&null_value);
+ });
+ res= write_gtid_event(thd, true, using_trans, commit_id);
+ if (mdl_request.ticket)
+ thd->mdl_context.release_lock(mdl_request.ticket);
+ thd->backup_commit_lock= 0;
+ if (res)
+ goto err;
+ }
+ else
+ {
+ binlog_cache_mngr *const cache_mngr= thd->binlog_setup_trx_data();
+ if (!cache_mngr)
+ goto err;
+
+ is_trans_cache= use_trans_cache(thd, using_trans);
+ cache_data= cache_mngr->get_binlog_cache_data(is_trans_cache);
+ file= &cache_data->cache_log;
+
+ if (thd->lex->stmt_accessed_non_trans_temp_table() && is_trans_cache)
+ thd->transaction->stmt.mark_modified_non_trans_temp_table();
+ thd->binlog_start_trans_and_stmt();
+ }
+ DBUG_PRINT("info",("event type: %d",event_info->get_type_code()));
+
+ /*
+ No check for auto events flag here - this write method should
+ never be called if auto-events are enabled.
+
+ Write first log events which describe the 'run environment'
+ of the SQL command. If row-based binlogging, Insert_id, Rand
+ and other kind of "setting context" events are not needed.
+ */
+
+ if (with_annotate && *with_annotate)
+ {
+ DBUG_ASSERT(event_info->get_type_code() == TABLE_MAP_EVENT);
+ Annotate_rows_log_event anno(thd, using_trans, direct);
+ /* Annotate event should be written not more than once */
+ *with_annotate= 0;
+ if (write_event(&anno, cache_data, file))
+ goto err;
+ }
+
+ {
+ if (!thd->is_current_stmt_binlog_format_row())
+ {
+ if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
+ {
+ Intvar_log_event e(thd,(uchar) LAST_INSERT_ID_EVENT,
+ thd->first_successful_insert_id_in_prev_stmt_for_binlog,
+ using_trans, direct);
+ if (write_event(&e, cache_data, file))
+ goto err;
+ }
+ if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
+ {
+ DBUG_PRINT("info",("number of auto_inc intervals: %u",
+ thd->auto_inc_intervals_in_cur_stmt_for_binlog.
+ nb_elements()));
+ Intvar_log_event e(thd, (uchar) INSERT_ID_EVENT,
+ thd->auto_inc_intervals_in_cur_stmt_for_binlog.
+ minimum(), using_trans, direct);
+ if (write_event(&e, cache_data, file))
+ goto err;
+ }
+ if (thd->used & THD::RAND_USED)
+ {
+ Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2,
+ using_trans, direct);
+ if (write_event(&e, cache_data, file))
+ goto err;
+ }
+ if (thd->user_var_events.elements)
+ {
+ for (uint i= 0; i < thd->user_var_events.elements; i++)
+ {
+ BINLOG_USER_VAR_EVENT *user_var_event;
+ get_dynamic(&thd->user_var_events,(uchar*) &user_var_event, i);
+
+ /* setting flags for user var log event */
+ uchar flags= User_var_log_event::UNDEF_F;
+ if (user_var_event->unsigned_flag)
+ flags|= User_var_log_event::UNSIGNED_F;
+
+ User_var_log_event e(thd, user_var_event->user_var_event->name.str,
+ user_var_event->user_var_event->name.length,
+ user_var_event->value,
+ user_var_event->length,
+ user_var_event->type,
+ user_var_event->charset_number,
+ flags,
+ using_trans,
+ direct);
+ if (write_event(&e, cache_data, file))
+ goto err;
+ }
+ }
+ }
+ }
+
+ /*
+ Write the event.
+ */
+ if (write_event(event_info, cache_data, file) ||
+ DBUG_IF("injecting_fault_writing"))
+ goto err;
+
+ error= 0;
+err:
+ if (direct)
+ {
+ my_off_t offset= my_b_tell(file);
+ bool check_purge= false;
+ DBUG_ASSERT(!is_relay_log);
+
+ if (likely(!error))
+ {
+ bool synced;
+
+ if ((error= flush_and_sync(&synced)))
+ {
+ }
+ else
+ {
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ mysql_mutex_assert_owner(&LOCK_log);
+ mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+#ifdef HAVE_REPLICATION
+ if (repl_semisync_master.report_binlog_update(thd, log_file_name,
+ file->pos_in_file))
+ {
+ sql_print_error("Failed to run 'after_flush' hooks");
+ error= 1;
+ }
+ else
+#endif
+ {
+ /*
+ update binlog_end_pos so it can be read by dump thread
+ note: must be _after_ the RUN_HOOK(after_flush) or else
+ semi-sync might not have put the transaction into
+ its list before dump-thread tries to send it
+ */
+ update_binlog_end_pos(offset);
+ if (unlikely((error= rotate(false, &check_purge))))
+ check_purge= false;
+ }
+ }
+ }
+
+ status_var_add(thd->status_var.binlog_bytes_written,
+ offset - my_org_b_tell);
+
+ mysql_mutex_lock(&LOCK_after_binlog_sync);
+ mysql_mutex_unlock(&LOCK_log);
+
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ mysql_mutex_assert_not_owner(&LOCK_log);
+ mysql_mutex_assert_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+#ifdef HAVE_REPLICATION
+ if (repl_semisync_master.wait_after_sync(log_file_name,
+ file->pos_in_file))
+ {
+ error=1;
+ /* error is already printed inside hook */
+ }
+#endif
+
+ /*
+ Take mutex to protect against a reader seeing partial writes of 64-bit
+ offset on 32-bit CPUs.
+ */
+ mysql_mutex_lock(&LOCK_commit_ordered);
+ mysql_mutex_unlock(&LOCK_after_binlog_sync);
+ last_commit_pos_offset= offset;
+ mysql_mutex_unlock(&LOCK_commit_ordered);
+
+ if (check_purge)
+ checkpoint_and_purge(prev_binlog_id);
+ }
+
+ if (unlikely(error))
+ {
+ set_write_error(thd, is_trans_cache);
+ if (check_write_error(thd) && cache_data &&
+ stmt_has_updated_non_trans_table(thd))
+ cache_data->set_incident();
+ }
+ }
+
+ DBUG_RETURN(error);
+}
+
+
+/**
+  Hand a message for the error log over to the global logger object.
+
+  @param level   log level, forwarded unchanged
+  @param format  format string, forwarded unchanged
+  @param args    argument list belonging to @c format
+
+  @return the result of logger.error_log_print(), converted to int
+*/
+int error_log_print(enum loglevel level, const char *format, va_list args)
+{
+  const int rc= logger.error_log_print(level, format, args);
+  return rc;
+}
+
+
+/**
+  Hand a slow-log entry over to the global logger object.
+
+  @param thd            connection the query belongs to
+  @param query          query text
+  @param query_length   length of @c query
+  @param current_utime  time value forwarded unchanged to the logger
+
+  @return the result of logger.slow_log_print()
+*/
+bool slow_log_print(THD *thd, const char *query, uint query_length,
+                    ulonglong current_utime)
+{
+  const bool rc= logger.slow_log_print(thd, query, query_length,
+                                       current_utime);
+  return rc;
+}
+
+
+/**
+  Tell whether a command should be written to the general log.
+
+  @param thd      connection whose option_bits are consulted
+  @param command  server command under consideration
+
+  @retval FALSE  No logging
+  @retval TRUE   Ok to log
+*/
+
+bool LOGGER::log_command(THD *thd, enum enum_server_command command)
+{
+  /* Without at least one enabled general log handler nothing is logged */
+  if (!*general_log_handler_list)
+    return FALSE;
+
+  /* The bit for this kind of command must be set in what_to_log */
+  if (!(what_to_log & (1L << (uint) command)))
+    return FALSE;
+
+  /*
+    OPTION_LOG_OFF switches logging off for this connection. It is set
+    when a slave thread starts with LOG_SLOW_DISABLE_SLAVE, and only the
+    super user can set this bit.
+  */
+  if (thd->variables.option_bits & OPTION_LOG_OFF)
+    return FALSE;
+
+  return TRUE;
+}
+
+
+/**
+  Format a message and send it to the general log, provided this kind of
+  command is being logged.
+
+  @param thd      connection issuing the command
+  @param command  server command the message belongs to
+  @param format   format string, followed by its arguments
+
+  @return FALSE when logger.log_command() declines the command, otherwise
+          the result of logger.general_log_print()
+*/
+bool general_log_print(THD *thd, enum enum_server_command command,
+                       const char *format, ...)
+{
+  /* Skip all formatting work if this kind of command is not logged */
+  if (!logger.log_command(thd, command))
+    return FALSE;
+
+  va_list ap;
+  va_start(ap, format);
+  uint rc= logger.general_log_print(thd, command, format, ap);
+  va_end(ap);
+
+  return rc;
+}
+
+/**
+  Write a query to the general log.
+
+  The write happens when either logger.log_command() accepts the command
+  or mysql_audit_general_enabled() is true.
+
+  @param thd           connection issuing the command
+  @param command       server command being logged
+  @param query         query text
+  @param query_length  length of @c query
+
+  @return FALSE when nothing had to be written, otherwise the result of
+          logger.general_log_write()
+*/
+bool general_log_write(THD *thd, enum enum_server_command command,
+                       const char *query, size_t query_length)
+{
+  const bool wanted= logger.log_command(thd, command) ||
+                     mysql_audit_general_enabled();
+  if (!wanted)
+    return FALSE;
+
+  return logger.general_log_write(thd, command, query, query_length);
+}
+
+
+/**
+  Callback handed to ha_commit_checkpoint_request().
+
+  @param cookie  the MYSQL_BIN_LOG::xid_count_per_binlog entry that was
+                 passed to ha_commit_checkpoint_request()
+*/
+static void
+binlog_checkpoint_callback(void *cookie)
+{
+  typedef MYSQL_BIN_LOG::xid_count_per_binlog binlog_entry;
+  binlog_entry *const e= (binlog_entry *) cookie;
+
+  /*
+    The xid_count is bumped once per supporting engine, each of which then
+    gets a commit_checkpoint_request(). Counting the matching
+    commit_checkpoint_notify() callbacks tells us when all of them are
+    done, so that a new binlog checkpoint event can be logged.
+  */
+  mysql_bin_log.mark_xids_active(e->binlog_id, 1);
+}
+
+
+/*
+  Ask every supporting engine for a commit checkpoint.
+
+  Has to be called after each binlog rotate, once LOCK_log has been
+  released. The rotation incremented xid_count in the xid_count_per_binlog
+  entry by 1, and this function decrements it again; that keeps the entry
+  alive in the meantime even though LOCK_log is not held.
+*/
+void
+MYSQL_BIN_LOG::do_checkpoint_request(ulong binlog_id)
+{
+  xid_count_per_binlog *found;
+
+  /* Locate the list entry that belongs to binlog_id */
+  mysql_mutex_lock(&LOCK_xid_list);
+  I_List_iterator<xid_count_per_binlog> iter(binlog_xid_count_list);
+  for (;;)
+  {
+    found= iter++;
+    DBUG_ASSERT(found /* binlog_id is always somewhere in the list. */);
+    if (found->binlog_id == binlog_id)
+      break;
+  }
+  mysql_mutex_unlock(&LOCK_xid_list);
+
+  /* Issue commit_checkpoint_request() in each supporting storage engine */
+  ha_commit_checkpoint_request(found, binlog_checkpoint_callback);
+
+  /*
+    All commit_checkpoint_request() calls have been made, so the extra
+    count taken at rotation time can (and must) be given back. Once the
+    count drops to zero, every pending unlog() and every pending
+    commit_checkpoint_notify() is done and a new binlog checkpoint can be
+    logged.
+  */
+  mark_xid_done(binlog_id, true);
+}
+
+
+/**
+  Rotate the binary log; LOCK_log must already be held by the caller.
+
+  @param force_rotate  rotate even if the size limit is not reached
+  @param check_purge   set to true if rotation took place, false otherwise
+
+  @note
+    The caller _must_ look at check_purge. When it is set the binlog was
+    rotated and the caller _must_ make sure do_checkpoint_request() is
+    called later with the binlog_id of the rotated file. That call has to
+    happen after LOCK_log is released, which is why it cannot be done
+    here. checkpoint_and_purge() is usually the right thing to call, as it
+    does the checkpointing and any needed purging of old logs.
+
+  @note
+    When rotation fails (for instance because no new log file could be
+    created) an incident event is still attempted on the current log.
+
+  @retval
+    nonzero - error in rotating routine.
+*/
+int MYSQL_BIN_LOG::rotate(bool force_rotate, bool* check_purge)
+{
+  int error= 0;
+  DBUG_ENTER("MYSQL_BIN_LOG::rotate");
+
+#ifdef WITH_WSREP
+  if (WSREP_ON && wsrep_to_isolation)
+  {
+    *check_purge= false;
+    WSREP_DEBUG("avoiding binlog rotate due to TO isolation: %d",
+                wsrep_to_isolation);
+    DBUG_RETURN(0);
+  }
+#endif /* WITH_WSREP */
+
+  //todo: fix the macro def and restore safe_mutex_assert_owner(&LOCK_log);
+  *check_purge= false;
+
+  /* Nothing to do unless forced or the size limit has been reached */
+  if (!force_rotate && my_b_tell(&log_file) < (my_off_t) max_size)
+    DBUG_RETURN(error);
+
+  const ulong rotated_id= current_binlog_id;
+
+  /*
+    A rotation requires a commit checkpoint in all supporting engines;
+    when that finishes a new binlog checkpoint event can be logged.
+
+    The checkpoint cannot be started here: a group commit that must be
+    part of it may still be in progress, and the (possibly expensive)
+    checkpoint should not run while LOCK_log is held.
+
+    The xid_count entry of the previous log must nevertheless survive
+    until the checkpoint starts, although it is no longer the most recent
+    one. Incrementing xid_count (to count the pending checkpoint request)
+    pins the entry until do_checkpoint_request() decrements it again.
+  */
+  mark_xids_active(rotated_id, 1);
+
+  error= new_file_without_locking();
+  if (likely(!error))
+  {
+    *check_purge= true;
+    DBUG_RETURN(error);
+  }
+
+  /*
+    Be conservative: events may have been lost (e.g. failing to log the
+    Execute_load_query_log_event on a LOAD DATA while using a
+    non-transactional table), so try to get an incident event into the
+    current log anyway.
+  */
+  if (!write_incident_already_locked(current_thd))
+    flush_and_sync(0);
+
+  /* The rotate failed, so undo the xid_count increment made above */
+  mark_xid_done(rotated_id, false);
+
+  DBUG_RETURN(error);
+}
+
+/**
+  Run the log purging routine.
+
+  When built with HAVE_REPLICATION and binlog_expire_logs_seconds is
+  non-zero, calls purge_logs_before_date() with the current time minus
+  binlog_expire_logs_seconds. Must be called without LOCK_log held
+  (asserted). Returns nothing.
+*/
+void MYSQL_BIN_LOG::purge()
+{
+  mysql_mutex_assert_not_owner(&LOCK_log);
+#ifdef HAVE_REPLICATION
+  if (!binlog_expire_logs_seconds)
+    return;
+
+  DEBUG_SYNC(current_thd, "at_purge_logs_before_date");
+  time_t cutoff= my_time(0) - binlog_expire_logs_seconds;
+  DBUG_EXECUTE_IF("expire_logs_always", { cutoff = my_time(0); });
+  /* A negative cutoff is skipped */
+  if (cutoff >= 0)
+    purge_logs_before_date(cutoff);
+  DEBUG_SYNC(current_thd, "after_purge_logs_before_date");
+#endif
+}
+
+/**
+ Do the follow-up work for one binlog: first do_checkpoint_request()
+ for @c binlog_id, then purge().
+
+ purge() asserts that LOCK_log is not held, so this must be called
+ without LOCK_log.
+
+ @param binlog_id passed unchanged to do_checkpoint_request()
+*/
+void MYSQL_BIN_LOG::checkpoint_and_purge(ulong binlog_id)
+{
+ do_checkpoint_request(binlog_id);
+ purge();
+}
+
+
+/**
+  Look up the first (oldest) binlog file name in the binlog index.
+
+  The index is locked for the duration of the lookup.
+
+  @param[in,out] buf_arg  buffer that receives the normalized name of the
+                          first binary log file
+  @return NULL on success, otherwise error message
+*/
+static const char* get_first_binlog(char* buf_arg)
+{
+  char first_name[FN_REFLEN];
+  const char *errmsg= NULL;
+
+  DBUG_ENTER("get_first_binlog");
+
+  DBUG_ASSERT(mysql_bin_log.is_open());
+
+  mysql_bin_log.lock_index();
+
+  IO_CACHE *const index= mysql_bin_log.get_index_file();
+  if (reinit_io_cache(index, READ_CACHE, (my_off_t) 0, 0, 0))
+    errmsg= "failed to create a cache on binlog index";
+  else
+  {
+    /* The file ends with EOF or empty line */
+    const size_t len= my_b_gets(index, first_name, sizeof(first_name));
+    if (len <= 1)
+      errmsg= "empty binlog index";
+    else
+    {
+      first_name[len - 1]= 0;                   // Remove end \n
+      if (normalize_binlog_name(buf_arg, first_name, false))
+        errmsg= "could not normalize the first file name in the binlog index";
+    }
+  }
+
+  mysql_bin_log.unlock_index();
+
+  DBUG_RETURN(errmsg);
+}
+
+/**
+  Check whether the gtid binlog state can safely drop the gtid domains
+  passed as the argument, and drop them if so.
+
+  The drop is safe when no event from the domains being deleted exists in
+  the currently present binlog files. On a successful check the domains
+  are removed from @@gtid_binlog_state. The caller is expected to rotate
+  the binlog afterwards so that the active latest file does not list the
+  deleted domains in its Gtid_list header.
+
+  @param domain_drop_lex  gtid domain id sequence from lex, as a pointer
+                          to a dynamic array that must not be empty
+                          unless the pointer itself is NULL
+  @retval zero  on success
+  @retval > 0   ineffective call: nothing from the *non* empty gtid
+                domain sequence was deleted
+  @retval < 0   on error
+*/
+static int do_delete_gtid_domain(DYNAMIC_ARRAY *domain_drop_lex)
+{
+  int rc= 0;
+  Gtid_list_log_event *glev= NULL;
+  char first_binlog[FN_REFLEN];
+  IO_CACHE cache;
+  const char *errmsg= NULL;
+  char errbuf[MYSQL_ERRMSG_SIZE]= {0};
+
+  if (!domain_drop_lex)
+    return 0; // still "effective" having empty domain sequence to delete
+
+  DBUG_ASSERT(domain_drop_lex->elements > 0);
+  mysql_mutex_assert_owner(mysql_bin_log.get_log_lock());
+
+  errmsg= get_first_binlog(first_binlog);
+  if (errmsg == NULL)
+  {
+    bzero((char*) &cache, sizeof(cache));
+    const File fd= open_binlog(&cache, first_binlog, &errmsg);
+    if (fd != (File) -1)
+    {
+      /* Read the Gtid_list of the oldest binlog, then release the file */
+      errmsg= get_gtid_list_event(&cache, &glev);
+      end_io_cache(&cache);
+      mysql_file_close(fd, MYF(MY_WME));
+
+      DBUG_EXECUTE_IF("inject_binlog_delete_domain_init_error",
+                      errmsg= "injected error";);
+      if (!errmsg)
+        errmsg= rpl_global_gtid_binlog_state.drop_domain(domain_drop_lex,
+                                                         glev, errbuf);
+    }
+  }
+
+  if (errmsg)
+  {
+    if (errmsg[0] == '\0')
+      rc= 1;                      /* empty message: ineffective call */
+    else
+    {
+      my_error(ER_BINLOG_CANT_DELETE_GTID_DOMAIN, MYF(0), errmsg);
+      rc= -1;
+    }
+  }
+  delete glev;
+
+  return rc;
+}
+
+/**
+  Shortcut for @c rotate() followed by @c purge().
+  LOCK_log is acquired before the rotation and released after it; the
+  checkpoint request and the purge run without LOCK_log.
+
+  @param force_rotate     caller can request the log rotation
+  @param domain_drop_lex  gtid domains handed to do_delete_gtid_domain()
+                          before rotating; may be NULL
+
+  @retval
+    nonzero - error in the gtid domain deletion or in the rotating routine.
+*/
+int MYSQL_BIN_LOG::rotate_and_purge(bool force_rotate,
+                                    DYNAMIC_ARRAY *domain_drop_lex)
+{
+  int error= 0;
+  DBUG_ENTER("MYSQL_BIN_LOG::rotate_and_purge");
+  bool check_purge= false;
+
+  mysql_mutex_lock(&LOCK_log);
+
+  DEBUG_SYNC(current_thd, "rotate_after_acquire_LOCK_log");
+
+  const ulong prev_binlog_id= current_binlog_id;
+  const int err_gtid= do_delete_gtid_domain(domain_drop_lex);
+
+  if (err_gtid == 0)
+  {
+    error= rotate(force_rotate, &check_purge);
+    if (unlikely(error))
+      check_purge= false;
+  }
+  else if (err_gtid < 0)
+  {
+    /* A real error is propagated to the user */
+    error= 1;
+  }
+  /* else: an ineffective delete attempt merely skips rotate and purge */
+
+  DEBUG_SYNC(current_thd, "rotate_after_rotate");
+
+  /*
+    NOTE: purge_logs runs without LOCK_log because it does not need the
+    mutex; holding it causes various deadlocks.
+    An explicit binlog rotation has to be synchronized with a concurrent
+    binlog ordered commit; in particular the binlog checkpoint
+    notification must not be requested until concurrent commits that were
+    binlogged earlier have completed. Hence the hand-over through
+    LOCK_after_binlog_sync and LOCK_commit_ordered below.
+  */
+  mysql_mutex_lock(&LOCK_after_binlog_sync);
+  mysql_mutex_unlock(&LOCK_log);
+  mysql_mutex_lock(&LOCK_commit_ordered);
+  mysql_mutex_unlock(&LOCK_after_binlog_sync);
+  mysql_mutex_unlock(&LOCK_commit_ordered);
+
+  if (check_purge)
+    checkpoint_and_purge(prev_binlog_id);
+
+  DBUG_RETURN(error);
+}
+
+/**
+  Return the current file_id and post-increment it, under LOCK_log.
+
+  @return the value file_id had before the increment
+*/
+uint MYSQL_BIN_LOG::next_file_id()
+{
+  mysql_mutex_lock(&LOCK_log);
+  const uint allocated= file_id++;
+  mysql_mutex_unlock(&LOCK_log);
+  return allocated;
+}
+
+/**
+  Log_event_writer that copies already serialized events, piece by piece,
+  into a target IO_CACHE.
+
+  The caller sets @c remains to the length of the event it is about to
+  feed in and then calls write() one or more times. The first piece of an
+  event goes through write_header(), later pieces through write_data(),
+  and write_footer() is called once @c remains reaches zero.
+
+  The destructor adds bytes_written to thd->status_var.binlog_bytes_written.
+*/
+class CacheWriter: public Log_event_writer
+{
+public:
+  /** Bytes of the current event that have not been passed to write() yet */
+  size_t remains;
+
+  /**
+    @param thd_arg      thread whose status counter is updated on destruction
+    @param file_arg     cache the events are written to
+    @param do_checksum  non-zero reserves BINLOG_CHECKSUM_LEN per event
+    @param cr           crypto data handed to Log_event_writer
+  */
+  CacheWriter(THD *thd_arg, IO_CACHE *file_arg, bool do_checksum,
+              Binlog_crypt_data *cr)
+    : Log_event_writer(file_arg, 0, cr), remains(0), thd(thd_arg),
+      first(true)
+  { checksum_len= do_checksum ? BINLOG_CHECKSUM_LEN : 0; }
+
+  ~CacheWriter()
+  { status_var_add(thd->status_var.binlog_bytes_written, bytes_written); }
+
+  /**
+    Write the next piece of the current event.
+
+    @param pos  start of the piece
+    @param len  length of the piece; must not exceed @c remains
+
+    @retval 0  piece written
+    @retval 1  write_header(), write_data() or write_footer() reported an
+               error. Previously these results were discarded and 0 was
+               always returned, which made the error checks in the caller
+               dead code.
+               NOTE(review): relies on the Log_event_writer methods
+               returning non-zero on failure - confirm against log_event.h.
+  */
+  int write(uchar* pos, size_t len)
+  {
+    DBUG_ENTER("CacheWriter::write");
+    if (first ? write_header(pos, len) : write_data(pos, len))
+      DBUG_RETURN(1);
+
+    remains -= len;
+    /* The event is complete once nothing remains: finish it off */
+    if ((first= !remains) && write_footer())
+      DBUG_RETURN(1);
+    DBUG_RETURN(0);
+  }
+private:
+  THD *thd;
+  /** true when the next write() starts a new event */
+  bool first;
+};
+
+/*
+ Write the contents of a cache to the binary log.
+
+ SYNOPSIS
+ write_cache()
+ thd Current_thread
+ cache Cache to write to the binary log
+
+ DESCRIPTION
+ Write the contents of the cache to the binary log. The cache will
+ be reset as a READ_CACHE to be able to read the contents from it.
+
+ The events are read from the cache segment by segment. For each event
+ the checksum length (per @c binlog_checksum_options) is added to the
+ event length, and end_log_pos is made absolute, before the event is
+ handed to the CacheWriter that writes into log_file.
+
+ NOTES
+ LOCK_log must be held by the caller (asserted on entry).
+
+ RETURN
+ 0 All events were copied
+ ER_ERROR_ON_WRITE reinit_io_cache() failed or writer.write() returned
+ non-zero
+*/
+
+int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
+{
+ DBUG_ENTER("MYSQL_BIN_LOG::write_cache");
+
+ mysql_mutex_assert_owner(&LOCK_log);
+ if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+ size_t length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
+ size_t val; // scratch for the recomputed end_log_pos
+ size_t end_log_pos_inc= 0; // each event processed adds BINLOG_CHECKSUM_LEN to it
+ uchar header[LOG_EVENT_HEADER_LEN]; // reassembly buffer for a split header
+ CacheWriter writer(thd, &log_file, binlog_checksum_options, &crypto);
+
+ if (crypto.scheme)
+ {
+ writer.ctx= alloca(crypto.ctx_size);
+ writer.set_encrypted_writer();
+ }
+ // while there is just one alg the following must hold:
+ DBUG_ASSERT(binlog_checksum_options == BINLOG_CHECKSUM_ALG_OFF ||
+ binlog_checksum_options == BINLOG_CHECKSUM_ALG_CRC32);
+
+ /*
+ The events in the buffer have incorrect end_log_pos data
+ (relative to beginning of group rather than absolute),
+ so we'll recalculate them in situ so the binlog is always
+ correct, even in the middle of a group. This is possible
+ because we now know the start position of the group (the
+ offset of this cache in the log, if you will); all we need
+ to do is to find all event-headers, and add the position of
+ the group to the end_log_pos of each event. This is pretty
+ straight forward, except that we read the cache in segments,
+ so an event-header might end up on the cache-border and get
+ split.
+ */
+
+ group= (size_t)my_b_tell(&log_file); // current position in log_file
+ hdr_offs= carry= 0;
+
+ do
+ {
+ /*
+ if we only got a partial header in the last iteration,
+ get the other half now and process a full header.
+ */
+ if (unlikely(carry > 0))
+ {
+ DBUG_ASSERT(carry < LOG_EVENT_HEADER_LEN);
+ size_t tail= LOG_EVENT_HEADER_LEN - carry;
+
+ /* assemble both halves */
+ memcpy(&header[carry], (char *)cache->read_pos, tail);
+
+ uint32 len= uint4korr(header + EVENT_LEN_OFFSET);
+ writer.remains= len;
+
+ /* fix end_log_pos */
+ end_log_pos_inc += writer.checksum_len;
+ val= uint4korr(header + LOG_POS_OFFSET) + group + end_log_pos_inc;
+ int4store(header + LOG_POS_OFFSET, val);
+
+ /* fix len */
+ len+= writer.checksum_len;
+ int4store(header + EVENT_LEN_OFFSET, len);
+
+ if (writer.write(header, LOG_EVENT_HEADER_LEN))
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+
+ cache->read_pos+= tail;
+ length-= tail;
+ carry= 0;
+
+ /* next event header at ... */
+ hdr_offs= len - LOG_EVENT_HEADER_LEN - writer.checksum_len;
+ }
+
+ /* if there is anything to write, process it. */
+
+ if (likely(length > 0))
+ {
+ DBUG_EXECUTE_IF("fail_binlog_write_1",
+ errno= 28; DBUG_RETURN(ER_ERROR_ON_WRITE););
+ /*
+ process all event-headers in this (partial) cache.
+ if next header is beyond current read-buffer,
+ we'll get it later (though not necessarily in the
+ very next iteration, just "eventually").
+ */
+
+ if (hdr_offs >= length)
+ {
+ if (writer.write(cache->read_pos, length))
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+ }
+
+ while (hdr_offs < length)
+ {
+ /*
+ finish off with remains of the last event that crawls
+ from previous into the current buffer
+ */
+ if (writer.remains != 0)
+ {
+ if (writer.write(cache->read_pos, hdr_offs))
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+ }
+
+ /*
+ partial header only? save what we can get, process once
+ we get the rest.
+ Setting length to hdr_offs also ends the enclosing while loop.
+ */
+ if (hdr_offs + LOG_EVENT_HEADER_LEN > length)
+ {
+ carry= length - hdr_offs;
+ memcpy(header, (char *)cache->read_pos + hdr_offs, carry);
+ length= hdr_offs;
+ }
+ else
+ {
+ /* we've got a full event-header, and it came in one piece */
+ uchar *ev= (uchar *)cache->read_pos + hdr_offs;
+ uint ev_len= uint4korr(ev + EVENT_LEN_OFFSET); // netto len
+ uchar *log_pos= ev + LOG_POS_OFFSET;
+
+ end_log_pos_inc += writer.checksum_len;
+ /* fix end_log_pos */
+ val= uint4korr(log_pos) + group + end_log_pos_inc;
+ int4store(log_pos, val);
+
+ /* fix length */
+ int4store(ev + EVENT_LEN_OFFSET, ev_len + writer.checksum_len);
+
+ /* write as much of the event as this segment holds */
+ writer.remains= ev_len;
+ if (writer.write(ev, MY_MIN(ev_len, length - hdr_offs)))
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+
+ /* next event header at ... */
+ hdr_offs += ev_len; // incr by the netto len
+
+ DBUG_ASSERT(!writer.checksum_len || writer.remains == 0 || hdr_offs >= length);
+ }
+ }
+
+ /*
+ Adjust hdr_offs. Note that it may still point beyond the segment
+ read in the next iteration; if the current event is very long,
+ it may take a couple of read-iterations (and subsequent adjustments
+ of hdr_offs) for it to point into the then-current segment.
+ If we have a split header (carry != 0), hdr_offs will be set at the
+ beginning of the next iteration, overwriting the value we set here:
+ */
+ hdr_offs -= length;
+ }
+ } while ((length= my_b_fill(cache)));
+
+ DBUG_ASSERT(carry == 0);
+ DBUG_ASSERT(!writer.checksum_len || writer.remains == 0);
+
+ DBUG_RETURN(0); // All OK
+}
+
+/*
+  Helper function to get the error code of the query to be binlogged.
+
+  When the caller passes not_killed, or the hard kill state is
+  KILL_BAD_DATA, the statement's own error number is used, with the
+  kill-related codes filtered out. Otherwise thd->killed_errno() is
+  returned.
+ */
+int query_error_code(THD *thd, bool not_killed)
+{
+  if (!not_killed && killed_mask_hard(thd->killed) != KILL_BAD_DATA)
+  {
+    /* killed status for DELAYED INSERT thread should never be used */
+    DBUG_ASSERT(!(thd->system_thread & SYSTEM_THREAD_DELAYED_INSERT));
+    return thd->killed_errno();
+  }
+
+  const int code= thd->is_error() ? thd->get_stmt_da()->sql_errno() : 0;
+
+  /*
+    thd->get_stmt_da()->sql_errno() might be ER_SERVER_SHUTDOWN or
+    ER_QUERY_INTERRUPTED (or one of the connection-abort codes), so make
+    sure such an error is not reported when the caller specified
+    not_killed.
+  */
+  const bool kill_related= code == ER_SERVER_SHUTDOWN ||
+                           code == ER_QUERY_INTERRUPTED ||
+                           code == ER_NEW_ABORTING_CONNECTION ||
+                           code == ER_CONNECTION_KILLED;
+
+  return kill_related ? 0 : code;
+}
+
+
+/**
+  Write an INCIDENT_LOST_EVENTS incident event to the binary log.
+
+  No mutex is taken here; going by the name, the caller is expected to
+  hold LOCK_log already.
+
+  @param thd  thread for the event and for the bytes-written counter
+
+  @return result of write_event(); 0 as well when the log is not open
+*/
+bool MYSQL_BIN_LOG::write_incident_already_locked(THD *thd)
+{
+  uint error= 0;
+  DBUG_ENTER("MYSQL_BIN_LOG::write_incident_already_locked");
+  Incident incident= INCIDENT_LOST_EVENTS;
+  Incident_log_event ev(thd, incident, &write_error_msg);
+
+  if (unlikely(!is_open()))
+    DBUG_RETURN(error);
+
+  error= write_event(&ev);
+  status_var_add(thd->status_var.binlog_bytes_written, ev.data_written);
+
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Write an incident event to the binary log, taking LOCK_log itself.
+
+  If the log is open the event is written and synced, the binlog is
+  rotated if needed, and the binlog end position and
+  last_commit_pos_offset are updated. A successful rotation is followed
+  by checkpoint_and_purge() after LOCK_log has been released. Problems
+  are reported in the error log.
+
+  @param thd  thread on whose behalf the incident is written
+
+  @return nonzero if writing, syncing or rotating failed
+*/
+bool MYSQL_BIN_LOG::write_incident(THD *thd)
+{
+  uint error= 0;
+  my_off_t offset;
+  bool check_purge= false;
+  ulong prev_binlog_id;
+  DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
+
+  mysql_mutex_lock(&LOCK_log);
+  if (unlikely(!is_open()))
+  {
+    mysql_mutex_unlock(&LOCK_log);
+  }
+  else
+  {
+    prev_binlog_id= current_binlog_id;
+
+    /* Each step only runs when the previous one succeeded */
+    error= DBUG_IF("incident_event_write_error")
+             ? 1
+             : write_incident_already_locked(thd);
+    if (likely(!error))
+      error= flush_and_sync(0);
+    if (likely(!error))
+    {
+      update_binlog_end_pos();
+      error= rotate(false, &check_purge);
+      if (unlikely(error))
+        check_purge= false;
+    }
+
+    offset= my_b_tell(&log_file);
+
+    update_binlog_end_pos(offset);
+
+    /*
+      The mutex protects against a reader seeing a partial write of the
+      64-bit offset on 32-bit CPUs.
+    */
+    mysql_mutex_lock(&LOCK_commit_ordered);
+    last_commit_pos_offset= offset;
+    mysql_mutex_unlock(&LOCK_commit_ordered);
+    mysql_mutex_unlock(&LOCK_log);
+
+    if (check_purge)
+      checkpoint_and_purge(prev_binlog_id);
+  }
+
+  /*
+    After writing the incident event, look at thd->is_error() and put the
+    relevant error message into the error log.
+  */
+  if (thd->is_error())
+  {
+    sql_print_error("Write to binary log failed: "
+                    "%s. An incident event is written to binary log "
+                    "and slave will be stopped.\n",
+                    thd->get_stmt_da()->message());
+  }
+  if (error)
+  {
+    sql_print_error("Incident event write to the binary log file failed.");
+  }
+
+  DBUG_RETURN(error);
+}
+
+/**
+  Write a binlog checkpoint event and sync it to disk.
+
+  No mutex other than LOCK_commit_ordered is taken here; going by the
+  name, the caller is expected to hold LOCK_log already.
+
+  @param name_arg  name passed to the Binlog_checkpoint_log_event
+  @param len       length passed along with @c name_arg
+*/
+void
+MYSQL_BIN_LOG::
+write_binlog_checkpoint_event_already_locked(const char *name_arg, uint len)
+{
+  Binlog_checkpoint_log_event ev(name_arg, len);
+
+  /*
+    The binlog checkpoint has to be synced to disk. Otherwise a later log
+    purge could delete binlogs that XA recovery thinks are needed (even
+    though they are not really).
+  */
+  if (write_event(&ev) || flush_and_sync(0))
+  {
+    /*
+      Failing to write the checkpoint event means something is probably
+      really bad with the binlog, so complain in the error log.
+
+      This does not compromise the ability to do crash recovery - crash
+      recovery will just have to scan a bit more of the binlog than
+      strictly necessary.
+    */
+    sql_print_error("Failed to write binlog checkpoint event to binary log");
+  }
+  else
+  {
+    update_binlog_end_pos();
+  }
+
+  const my_off_t offset= my_b_tell(&log_file);
+
+  update_binlog_end_pos(offset);
+
+  /*
+    The mutex protects against a reader seeing a partial write of the
+    64-bit offset on 32-bit CPUs.
+  */
+  mysql_mutex_lock(&LOCK_commit_ordered);
+  last_commit_pos_offset= offset;
+  mysql_mutex_unlock(&LOCK_commit_ordered);
+}
+
+
+/**
+  Write a cached log entry to the binary log.
+
+  To support transactions over replication, the transaction is wrapped
+  with BEGIN/COMMIT or BEGIN/ROLLBACK in the binary log. A BEGIN/ROLLBACK
+  block is wanted when a non-transactional table was updated in a
+  transaction that was rolled back, so that the same updates run on the
+  slave.
+
+  This function fills in a group_commit_entry and passes it to
+  write_transaction_to_binlog_events().
+
+  @param thd               thread committing
+  @param cache_mngr        cache manager holding the stmt and trx caches
+  @param end_ev            end event, stored in the entry; an XID_EVENT
+                           triggers the need_unlog scan of the engines
+  @param all               selects thd->transaction->all (true) or
+                           thd->transaction->stmt (false) as engine list
+  @param using_stmt_cache  stored in the entry unchanged
+  @param using_trx_cache   stored in the entry unchanged
+  @param is_ro_1pc         stored in the entry as ro_1pc
+
+  @return 0 when wsrep_emulate_bin_log is set or OPTION_BIN_LOG is off,
+          otherwise the result of write_transaction_to_binlog_events()
+
+  @note
+    We only come here if there is something in the cache.
+  @note
+    The thing in the cache is always a complete transaction.
+  @note
+    'cache' needs to be reinitialized after this function returns.
+*/
+
+bool
+MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd,
+                                           binlog_cache_mngr *cache_mngr,
+                                           Log_event *end_ev, bool all,
+                                           bool using_stmt_cache,
+                                           bool using_trx_cache,
+                                           bool is_ro_1pc)
+{
+  group_commit_entry entry;
+  Ha_trx_info *ha_info;
+  DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
+
+  /*
+    Control must not get past this point in wsrep_emulate_bin_log mode.
+  */
+  if (wsrep_emulate_bin_log)
+    DBUG_RETURN(0);
+
+  /*
+    Likewise the cached updates are not written to the binlog when binary
+    logging is disabled (log-bin/sql_log_bin).
+  */
+  if (!(thd->variables.option_bits & OPTION_BIN_LOG))
+  {
+    cache_mngr->need_unlog= false;
+    DBUG_RETURN(0);
+  }
+
+  entry.thd= thd;
+  entry.cache_mngr= cache_mngr;
+  entry.error= 0;
+  entry.all= all;
+  entry.using_stmt_cache= using_stmt_cache;
+  entry.using_trx_cache= using_trx_cache;
+  entry.need_unlog= is_preparing_xa(thd);
+  ha_info= all ? thd->transaction->all.ha_list : thd->transaction->stmt.ha_list;
+  entry.ro_1pc= is_ro_1pc;
+  entry.end_event= end_ev;
+  const bool has_xid= entry.end_event->get_type_code() == XID_EVENT;
+
+  /*
+    With an XID event, unlog is needed as soon as one started engine other
+    than the binlog itself has no commit_checkpoint_request.
+  */
+  if (has_xid)
+  {
+    while (!entry.need_unlog && ha_info)
+    {
+      if (ha_info->is_started() && ha_info->ht() != binlog_hton &&
+          !ha_info->ht()->commit_checkpoint_request)
+        entry.need_unlog= true;
+      ha_info= ha_info->next();
+    }
+  }
+
+  const bool lost_events= cache_mngr->stmt_cache.has_incident() ||
+                          cache_mngr->trx_cache.has_incident();
+  if (!lost_events)
+  {
+    entry.incident_event= NULL;
+    DBUG_RETURN(write_transaction_to_binlog_events(&entry));
+  }
+
+  /* inc_ev lives on this stack frame until the call below has returned */
+  Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, &write_error_msg);
+  entry.incident_event= &inc_ev;
+  DBUG_RETURN(write_transaction_to_binlog_events(&entry));
+}
+
+
+/*
+ Put a transaction that is ready to commit in the group commit queue.
+ The transaction is identified by the ENTRY object passed into this function.
+
+ To facilitate group commit for the binlog, we first queue up ourselves in
+ this function. Then later the first thread to enter the queue waits for
+ the LOCK_log mutex, and commits for everyone in the queue once it gets the
+ lock. Any other threads in the queue just wait for the first one to finish
+ the commit and wake them up. This way, all transactions in the queue get
+ committed in a single disk operation.
+
+ The main work in this function is when the commit in one transaction has
+ been marked to wait for the commit of another transaction to happen
+ first. This is used to support in-order parallel replication, where
+ transactions can execute out-of-order but need to be committed in-order with
+ how they happened on the master. The waiting of one commit on another needs
+ to be integrated with the group commit queue, to ensure that the waiting
+ transaction can participate in the same group commit as the waited-for
+ transaction.
+
+ So when we put a transaction in the queue, we check if there were other
+ transactions already prepared to commit but just waiting for the first one
+ to commit. If so, we add those to the queue as well, transitively for all
+ waiters.
+
+ And if a transaction is marked to wait for a prior transaction, but that
+ prior transaction is already queued for group commit, then we can queue the
+ new transaction directly to participate in the group commit.
+
+ @retval < 0 Error
+ @retval -2 WSREP error with commit ordering
+ @retval -3 WSREP return code to mark the leader
+ @retval > 0 If queued as the first entry in the queue (meaning this
+ is the leader)
+ @retval 0 Otherwise (queued as participant, leader handles the commit)
+*/
+
+int
+MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
+{
+ group_commit_entry *entry, *orig_queue, *last;
+ wait_for_commit *cur;
+ wait_for_commit *wfc;
+ bool backup_lock_released= 0;
+ int result= 0;
+ THD *thd= orig_entry->thd;
+ DBUG_ENTER("MYSQL_BIN_LOG::queue_for_group_commit");
+ DBUG_ASSERT(thd == current_thd);
+
+ /*
+ Check if we need to wait for another transaction to commit before us.
+
+ It is safe to do a quick check without lock first in the case where we do
+ not have to wait. But if the quick check shows we need to wait, we must do
+ another safe check under lock, to avoid the race where the other
+ transaction wakes us up between the check and the wait.
+ */
+ wfc= orig_entry->thd->wait_for_commit_ptr;
+ orig_entry->queued_by_other= false;
+ if (wfc && wfc->waitee.load(std::memory_order_acquire))
+ {
+ wait_for_commit *loc_waitee;
+
+ mysql_mutex_lock(&wfc->LOCK_wait_commit);
+ /*
+ Do an extra check here, this time safely under lock.
+
+ If waitee->commit_started is set, it means that the transaction we need
+ to wait for has already queued up for group commit. In this case it is
+ safe for us to queue up immediately as well, increasing the opportunities
+ for group commit. Because waitee has taken the LOCK_prepare_ordered
+ before setting the flag, so there is no risk that we can queue ahead of
+ it.
+ */
+ if ((loc_waitee= wfc->waitee.load(std::memory_order_relaxed)) &&
+ !loc_waitee->commit_started)
+ {
+ PSI_stage_info old_stage;
+
+ /*
+ Release MDL_BACKUP_COMMIT LOCK while waiting for other threads to
+ commit.
+ This is needed to avoid deadlock between the other threads (which not
+ yet have the MDL_BACKUP_COMMIT_LOCK) and any threads using
+ BACKUP LOCK BLOCK_COMMIT.
+ */
+ if (thd->backup_commit_lock && thd->backup_commit_lock->ticket &&
+ !backup_lock_released)
+ {
+ backup_lock_released= 1;
+ thd->mdl_context.release_lock(thd->backup_commit_lock->ticket);
+ thd->backup_commit_lock->ticket= 0;
+ }
+
+ /*
+ By setting wfc->opaque_pointer to our own entry, we mark that we are
+ ready to commit, but waiting for another transaction to commit before
+ us.
+
+ This other transaction may then take over the commit process for us to
+ get us included in its own group commit. If this happens, the
+ queued_by_other flag is set.
+
+ Setting this flag may or may not be seen by the other thread, but we
+ are safe in any case: The other thread will set queued_by_other under
+ its LOCK_wait_commit, and we will not check queued_by_other until after
+ we have been woken up.
+ */
+ wfc->opaque_pointer= orig_entry;
+ DEBUG_SYNC(orig_entry->thd, "group_commit_waiting_for_prior");
+ orig_entry->thd->ENTER_COND(&wfc->COND_wait_commit,
+ &wfc->LOCK_wait_commit,
+ &stage_waiting_for_prior_transaction_to_commit,
+ &old_stage);
+ while ((loc_waitee= wfc->waitee.load(std::memory_order_relaxed)) &&
+ !orig_entry->thd->check_killed(1))
+ mysql_cond_wait(&wfc->COND_wait_commit, &wfc->LOCK_wait_commit);
+ wfc->opaque_pointer= NULL;
+ DBUG_PRINT("info", ("After waiting for prior commit, queued_by_other=%d",
+ orig_entry->queued_by_other));
+
+ if (loc_waitee)
+ {
+ /* Wait terminated due to kill. */
+ mysql_mutex_lock(&loc_waitee->LOCK_wait_commit);
+ if (loc_waitee->wakeup_subsequent_commits_running ||
+ orig_entry->queued_by_other)
+ {
+ /* Our waitee is already waking us up, so ignore the kill. */
+ mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit);
+ do
+ {
+ mysql_cond_wait(&wfc->COND_wait_commit, &wfc->LOCK_wait_commit);
+ } while (wfc->waitee.load(std::memory_order_relaxed));
+ }
+ else
+ {
+ /* We were killed, so remove us from the list of waitee. */
+ wfc->remove_from_list(&loc_waitee->subsequent_commits_list);
+ mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit);
+ /*
+ This is the thread clearing its own status, it is no longer on
+ the list of waiters. So no memory barriers are needed here.
+ */
+ wfc->waitee.store(NULL, std::memory_order_relaxed);
+
+ orig_entry->thd->EXIT_COND(&old_stage);
+ /* Interrupted by kill. */
+ DEBUG_SYNC(orig_entry->thd, "group_commit_waiting_for_prior_killed");
+ wfc->wakeup_error= orig_entry->thd->killed_errno();
+ if (!wfc->wakeup_error)
+ wfc->wakeup_error= ER_QUERY_INTERRUPTED;
+ my_message(wfc->wakeup_error,
+ ER_THD(orig_entry->thd, wfc->wakeup_error), MYF(0));
+ result= -1;
+ goto end;
+ }
+ }
+ orig_entry->thd->EXIT_COND(&old_stage);
+ }
+ else
+ mysql_mutex_unlock(&wfc->LOCK_wait_commit);
+ }
+ /*
+ If the transaction we were waiting for has already put us into the group
+ commit queue (and possibly already done the entire binlog commit for us),
+ then there is nothing else to do.
+ */
+ if (orig_entry->queued_by_other)
+ goto end;
+
+ if (wfc && wfc->wakeup_error)
+ {
+ my_error(ER_PRIOR_COMMIT_FAILED, MYF(0));
+ result= -1;
+ goto end;
+ }
+
+ /* Now enqueue ourselves in the group commit queue. */
+ DEBUG_SYNC(orig_entry->thd, "commit_before_enqueue");
+ orig_entry->thd->clear_wakeup_ready();
+ mysql_mutex_lock(&LOCK_prepare_ordered);
+ orig_queue= group_commit_queue;
+
+ /*
+ Iteratively process everything added to the queue, looking for waiters,
+ and their waiters, and so on. If a waiter is ready to commit, we
+ immediately add it to the queue, and mark it as queued_by_other.
+
+ This would be natural to do with recursion, but we want to avoid
+ potentially unbounded recursion blowing the C stack, so we use the list
+ approach instead.
+
+ We keep a list of the group_commit_entry of all the waiters that need to
+ be processed. Initially this list contains only the entry passed into this
+ function.
+
+ We process entries in the list one by one. The element currently being
+ processed is pointed to by `entry`, and the element at the end of the list
+ is pointed to by `last` (we do not use NULL to terminate the list).
+
+ As we process an entry, any waiters for that entry are added at the end of
+ the list, to be processed in subsequent iterations. Then the entry is added
+ to the group_commit_queue. This continues until the list is exhausted,
+ with all entries ever added eventually processed.
+
+ The end result is a breadth-first traversal of the tree of waiters,
+ re-using the `next' pointers of the group_commit_entry objects in place of
+ extra stack space in a recursive traversal.
+
+ The temporary list linked through these `next' pointers is not used by the
+ caller or any other function; it only exists while doing the iterative
+ tree traversal. After, all the processed entries are linked into the
+ group_commit_queue.
+ */
+
+ cur= wfc;
+ last= orig_entry;
+ entry= orig_entry;
+ for (;;)
+ {
+ group_commit_entry *next_entry;
+
+ if (entry->cache_mngr->using_xa)
+ {
+ DEBUG_SYNC(entry->thd, "commit_before_prepare_ordered");
+ run_prepare_ordered(entry->thd, entry->all);
+ DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered");
+ }
+
+ if (cur)
+ {
+ /*
+ Now that we have taken LOCK_prepare_ordered and will queue up in the
+ group commit queue, it is safe for following transactions to queue
+ themselves. We will grab here any transaction that is now ready to
+ queue up, but after that, more transactions may become ready while the
+ leader is waiting to start the group commit. So set the flag
+ `commit_started', so that later transactions can still participate in
+ the group commit.
+ */
+ cur->commit_started= true;
+
+ /*
+ Check if this transaction has other transaction waiting for it to
+ commit.
+
+ If so, process the waiting transactions, and their waiters and so on,
+ transitively.
+ */
+ if (cur->subsequent_commits_list)
+ {
+ wait_for_commit *waiter, **waiter_ptr;
+
+ mysql_mutex_lock(&cur->LOCK_wait_commit);
+ /*
+ Grab the list, now safely under lock, and process it if still
+ non-empty.
+ */
+ waiter= cur->subsequent_commits_list;
+ waiter_ptr= &cur->subsequent_commits_list;
+ while (waiter)
+ {
+ wait_for_commit *next_waiter= waiter->next_subsequent_commit;
+ group_commit_entry *entry2=
+ (group_commit_entry *)waiter->opaque_pointer;
+ if (entry2)
+ {
+ /*
+ This is another transaction ready to be written to the binary
+ log. We can put it into the queue directly, without needing a
+ separate context switch to the other thread. We just set a flag
+ so that the other thread will know when it wakes up that it was
+ already processed.
+
+ So remove it from the list of our waiters, and instead put it at
+ the end of the list to be processed in a subsequent iteration of
+ the outer loop.
+ */
+ *waiter_ptr= next_waiter;
+ entry2->queued_by_other= true;
+ last->next= entry2;
+ last= entry2;
+ /*
+ As a small optimisation, we do not actually need to set
+ entry2->next to NULL, as we can use the pointer `last' to check
+ for end-of-list.
+ */
+ }
+ else
+ {
+ /*
+ This transaction is not ready to participate in the group commit
+ yet, so leave it in the waiter list. It might join the group
+ commit later, if it completes soon enough to do so (it will see
+ our wfc->commit_started flag set), or it might commit later in a
+ later group commit.
+ */
+ waiter_ptr= &waiter->next_subsequent_commit;
+ }
+ waiter= next_waiter;
+ }
+ mysql_mutex_unlock(&cur->LOCK_wait_commit);
+ }
+ }
+
+ /*
+ Handle the heuristics that if another transaction is waiting for this
+ transaction (or if it does so later), then we want to trigger group
+ commit immediately, without waiting for the binlog_commit_wait_usec
+ timeout to expire.
+ */
+ entry->thd->waiting_on_group_commit= true;
+
+ /* Add the entry to the group commit queue. */
+ next_entry= entry->next;
+ entry->next= group_commit_queue;
+ group_commit_queue= entry;
+ if (entry == last)
+ break;
+ /*
+ Move to the next entry in the flattened list of waiting transactions
+ that still need to be processed transitively.
+ */
+ entry= next_entry;
+ DBUG_ASSERT(entry != NULL);
+ cur= entry->thd->wait_for_commit_ptr;
+ }
+
+ result= orig_queue == NULL;
+
+#ifdef WITH_WSREP
+ if (wsrep_is_active(entry->thd) &&
+ wsrep_run_commit_hook(entry->thd, entry->all))
+ {
+ /* Release commit order here */
+ if (wsrep_ordered_commit(entry->thd, entry->all))
+ result= -2;
+
+ /* return -3, if this is leader */
+ if (orig_queue == NULL)
+ result= -3;
+ }
+#endif /* WITH_WSREP */
+
+ if (opt_binlog_commit_wait_count > 0 && orig_queue != NULL)
+ mysql_cond_signal(&COND_prepare_ordered);
+ mysql_mutex_unlock(&LOCK_prepare_ordered);
+ DEBUG_SYNC(orig_entry->thd, "commit_after_release_LOCK_prepare_ordered");
+
+ DBUG_PRINT("info", ("Queued for group commit as %s",
+ (orig_queue == NULL) ? "leader" : "participant"));
+
+end:
+ if (backup_lock_released)
+ thd->mdl_context.acquire_lock(thd->backup_commit_lock,
+ thd->variables.lock_wait_timeout);
+ DBUG_RETURN(result);
+}
+
+/*
+ Queue a transaction for binlog group commit and wait until it is done.
+
+ The entry is handed to queue_for_group_commit(). A positive result means
+ this thread is the group commit leader and runs trx_group_commit_leader();
+ zero means it is a participant, which waits in wait_for_wakeup_ready()
+ unless entry->queued_by_other is set; a negative result is an error.
+
+ When opt_optimize_thread_scheduling is off, each thread in turn runs
+ run_commit_ordered() for its own entry under LOCK_commit_ordered and then
+ wakes up the next entry in the queue. The last entry clears
+ group_commit_queue_busy and runs checkpoint_and_purge() if check_purge is
+ set.
+
+ If entry->error is set, the error is reported from here with my_error() /
+ my_printf_error().
+
+ @param entry Group commit entry describing the transaction to write.
+
+ @return The result of wait_for_prior_commit() when entry->error is unset;
+ true (non-zero) when queueing failed or entry->error is set.
+*/
+bool
+MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
+{
+ int is_leader= queue_for_group_commit(entry);
+#ifdef WITH_WSREP
+ /* commit order was released in queue_for_group_commit() call,
+ here we check if wsrep_commit_ordered() failed or if we are leader */
+ switch (is_leader)
+ {
+ case -2: /* wsrep_ordered_commit() has failed */
+ DBUG_ASSERT(wsrep_is_active(entry->thd));
+ DBUG_ASSERT(wsrep_run_commit_hook(entry->thd, entry->all));
+ entry->thd->wakeup_subsequent_commits(1);
+ return true;
+ case -3: /* this is leader, wait for prior commit to
+ complete. This establishes total order for group leaders
+ */
+ DBUG_ASSERT(wsrep_is_active(entry->thd));
+ DBUG_ASSERT(wsrep_run_commit_hook(entry->thd, entry->all));
+ if (entry->thd->wait_for_prior_commit())
+ return true;
+
+ /* retain the correct is_leader value */
+ is_leader= 1;
+ break;
+
+ default: /* native MariaDB cases */
+ break;
+ }
+#endif /* WITH_WSREP */
+
+ /*
+ The first in the queue handles group commit for all; the others just wait
+ to be signalled when group commit is done.
+ */
+ if (is_leader < 0)
+ return true; /* Error */
+ else if (is_leader)
+ trx_group_commit_leader(entry);
+ else if (!entry->queued_by_other)
+ {
+ DEBUG_SYNC(entry->thd, "after_semisync_queue");
+
+ entry->thd->wait_for_wakeup_ready();
+ }
+ else
+ {
+ /*
+ If we were queued by another prior commit, then we are woken up
+ only when the leader has already completed the commit for us.
+ So nothing to do here then.
+ */
+ }
+
+ if (!opt_optimize_thread_scheduling)
+ {
+ /* For the leader, trx_group_commit_leader() already took the lock. */
+ if (!is_leader)
+ mysql_mutex_lock(&LOCK_commit_ordered);
+
+ DEBUG_SYNC(entry->thd, "commit_loop_entry_commit_ordered");
+ ++num_commits;
+ if (entry->cache_mngr->using_xa && !entry->error)
+ run_commit_ordered(entry->thd, entry->all);
+
+ /*
+ Read entry->next while still holding LOCK_commit_ordered. A NULL next
+ means this is the last entry of the group, which marks the queue as no
+ longer busy.
+ */
+ group_commit_entry *next= entry->next;
+ if (!next)
+ {
+ group_commit_queue_busy= FALSE;
+ mysql_cond_signal(&COND_queue_busy);
+ DEBUG_SYNC(entry->thd, "commit_after_group_run_commit_ordered");
+ }
+ mysql_mutex_unlock(&LOCK_commit_ordered);
+ entry->thd->wakeup_subsequent_commits(entry->error);
+
+ if (next)
+ {
+ /*
+ Wake up the next thread in the group commit.
+
+ The next thread can be waiting in two different ways, depending on
+ whether it put itself in the queue, or if it was put in queue by us
+ because it had to wait for us to commit first.
+
+ So execute the appropriate wakeup, identified by the queued_by_other
+ field.
+ */
+ if (next->queued_by_other)
+ next->thd->wait_for_commit_ptr->wakeup(entry->error);
+ else
+ next->thd->signal_wakeup_ready();
+ }
+ else
+ {
+ /*
+ If we rotated the binlog, and if we are using the unoptimized thread
+ scheduling where every thread runs its own commit_ordered(), then we
+ must do the commit checkpoint and log purge here, after all
+ commit_ordered() calls have finished, and locks have been released.
+ */
+ if (entry->check_purge)
+ checkpoint_and_purge(entry->binlog_id);
+ }
+
+ }
+
+ if (likely(!entry->error))
+ return entry->thd->wait_for_prior_commit();
+
+ /* Report the error code that was recorded in the entry. */
+ switch (entry->error)
+ {
+ case ER_ERROR_ON_WRITE:
+ my_error(ER_ERROR_ON_WRITE, MYF(ME_ERROR_LOG), name, entry->commit_errno);
+ break;
+ case ER_ERROR_ON_READ:
+ my_error(ER_ERROR_ON_READ, MYF(ME_ERROR_LOG),
+ entry->error_cache->file_name, entry->commit_errno);
+ break;
+ default:
+ /*
+ There are not (and should not be) any errors thrown not covered above.
+ But just in case one is added later without updating the above switch
+ statement, include a catch-all.
+ */
+ my_printf_error(entry->error,
+ "Error writing transaction to binary log: %d",
+ MYF(ME_ERROR_LOG), entry->error);
+ }
+
+ /*
+ Since we return error, this transaction XID will not be committed, so
+ we need to mark it as not needed for recovery (unlog() is not called
+ for a transaction if log_xid() fails).
+ */
+ if (entry->cache_mngr->using_xa && entry->cache_mngr->xa_xid &&
+ entry->cache_mngr->need_unlog)
+ mark_xid_done(entry->cache_mngr->binlog_id, true);
+
+ return 1;
+}
+
+/*
+ Do binlog group commit as the lead thread.
+
+ This must be called when this statement/transaction is queued at the start of
+ the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
+ commit all the transactions in the queue (more may have entered while waiting
+ for LOCK_log). After commit is done, all other threads in the queue will be
+ signalled.
+
+ The mutexes are handed over in the order LOCK_log, LOCK_after_binlog_sync,
+ LOCK_commit_ordered; each one is acquired before the previous one is
+ released, so that a following group commit cannot overtake this one between
+ stages.
+
+ When opt_optimize_thread_scheduling is off, this function returns with
+ LOCK_commit_ordered still locked and group_commit_queue_busy set, after
+ storing check_purge and binlog_id in the last entry of the queue. Otherwise
+ it runs run_commit_ordered() and the wakeups for every entry itself, and
+ calls checkpoint_and_purge() if rotate() requested it.
+
+ @param leader Group commit entry of the calling thread; asserted to be the
+ first entry of the (reversed) queue.
+ */
+void
+MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
+{
+ uint xid_count= 0;
+ my_off_t UNINIT_VAR(commit_offset);
+ group_commit_entry *current, *last_in_queue;
+ group_commit_entry *queue= NULL;
+ bool check_purge= false;
+ ulong UNINIT_VAR(binlog_id);
+ uint64 commit_id;
+ DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
+
+ {
+#ifdef ENABLED_DEBUG_SYNC
+ DBUG_EXECUTE_IF("inject_binlog_commit_before_get_LOCK_log",
+ DBUG_ASSERT(!debug_sync_set_action(leader->thd, STRING_WITH_LEN
+ ("commit_before_get_LOCK_log SIGNAL waiting WAIT_FOR cont TIMEOUT 1")));
+ );
+#endif
+ /*
+ Lock the LOCK_log(), and once we get it, collect any additional writes
+ that queued up while we were waiting.
+ */
+ DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_log");
+ mysql_mutex_lock(&LOCK_log);
+ DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");
+
+ mysql_mutex_lock(&LOCK_prepare_ordered);
+ if (opt_binlog_commit_wait_count)
+ wait_for_sufficient_commits();
+ /*
+ Note that wait_for_sufficient_commits() may have released and
+ re-acquired the LOCK_log and LOCK_prepare_ordered if it needed to wait.
+ */
+ /* Detach the whole queue; entries arriving later form a new group. */
+ current= group_commit_queue;
+ group_commit_queue= NULL;
+ mysql_mutex_unlock(&LOCK_prepare_ordered);
+ binlog_id= current_binlog_id;
+
+ /* As the queue is in reverse order of entering, reverse it. */
+ last_in_queue= current;
+ while (current)
+ {
+ group_commit_entry *next= current->next;
+ /*
+ Now that group commit is started, we can clear the flag; there is no
+ longer any use in waiters on this commit trying to trigger it early.
+ */
+ current->thd->waiting_on_group_commit= false;
+ current->next= queue;
+ queue= current;
+ current= next;
+ }
+ DBUG_ASSERT(leader == queue /* the leader should be first in queue */);
+
+ /* Now we have in queue the list of transactions to be committed in order. */
+ }
+
+ DBUG_ASSERT(is_open());
+ if (likely(is_open())) // Should always be true
+ {
+ /*
+ commit_id is 0 when the leader is the only entry in the group, and the
+ leader's query_id otherwise.
+ */
+ commit_id= (last_in_queue == leader ? 0 : (uint64)leader->thd->query_id);
+ DBUG_EXECUTE_IF("binlog_force_commit_id",
+ {
+ const LEX_CSTRING commit_name= { STRING_WITH_LEN("commit_id") };
+ bool null_value;
+ user_var_entry *entry=
+ (user_var_entry*) my_hash_search(&leader->thd->user_vars,
+ (uchar*) commit_name.str,
+ commit_name.length);
+ commit_id= entry->val_int(&null_value);
+ });
+ /*
+ Commit every transaction in the queue.
+
+ Note that we are doing this in a different thread than the one running
+ the transaction! So we are limited in the operations we can do. In
+ particular, we cannot call my_error() on behalf of a transaction, as
+ that obtains the THD from thread local storage. Instead, we must set
+ current->error and let the thread do the error reporting itself once
+ we wake it up.
+ */
+ for (current= queue; current != NULL; current= current->next)
+ {
+ set_current_thd(current->thd);
+ binlog_cache_mngr *cache_mngr= current->cache_mngr;
+
+ /*
+ We already checked before that at least one cache is non-empty; if both
+ are empty we would have skipped calling into here.
+ */
+ DBUG_ASSERT(!cache_mngr->stmt_cache.empty() ||
+ !cache_mngr->trx_cache.empty() ||
+ current->thd->transaction->xid_state.is_explicit_XA());
+
+ /* Save errno right after a failed write, for the owner to report. */
+ if (unlikely((current->error= write_transaction_or_stmt(current,
+ commit_id))))
+ current->commit_errno= errno;
+
+ /* Record the binlog file name and end offset of this commit. */
+ strmake_buf(cache_mngr->last_commit_pos_file, log_file_name);
+ commit_offset= my_b_write_tell(&log_file);
+ cache_mngr->last_commit_pos_offset= commit_offset;
+ if ((cache_mngr->using_xa && cache_mngr->xa_xid) || current->need_unlog)
+ {
+ /*
+ If all storage engines support commit_checkpoint_request(), then we
+ do not need to keep track of when this XID is durably committed.
+ Instead we will just ask the storage engine to durably commit all its
+ XIDs when we rotate a binlog file.
+ */
+ if (current->need_unlog)
+ {
+ xid_count++;
+ cache_mngr->need_unlog= true;
+ cache_mngr->binlog_id= binlog_id;
+ }
+ else
+ cache_mngr->need_unlog= false;
+
+ cache_mngr->delayed_error= false;
+ }
+ }
+ /* Switch the current THD back to the leader's own. */
+ set_current_thd(leader->thd);
+
+ bool synced= 0;
+ if (unlikely(flush_and_sync(&synced)))
+ {
+ /* Flush/sync failed: flag every entry that has no error yet. */
+ for (current= queue; current != NULL; current= current->next)
+ {
+ if (!current->error)
+ {
+ current->error= ER_ERROR_ON_WRITE;
+ current->commit_errno= errno;
+ current->error_cache= NULL;
+ }
+ }
+ }
+ else
+ {
+ DEBUG_SYNC(leader->thd, "commit_before_update_binlog_end_pos");
+ bool any_error= false;
+
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ mysql_mutex_assert_owner(&LOCK_log);
+ mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+
+ for (current= queue; current != NULL; current= current->next)
+ {
+#ifdef HAVE_REPLICATION
+ if (likely(!current->error) &&
+ unlikely(repl_semisync_master.
+ report_binlog_update(current->thd,
+ current->cache_mngr->
+ last_commit_pos_file,
+ current->cache_mngr->
+ last_commit_pos_offset)))
+ {
+ current->error= ER_ERROR_ON_WRITE;
+ current->commit_errno= -1;
+ current->error_cache= NULL;
+ any_error= true;
+ }
+#endif
+ }
+
+ /*
+ update binlog_end_pos so it can be read by dump thread
+ Note: must be _after_ the RUN_HOOK(after_flush) or else
+ semi-sync might not have put the transaction into
+ its list before dump-thread tries to send it
+ */
+ update_binlog_end_pos(commit_offset);
+
+ if (unlikely(any_error))
+ sql_print_error("Failed to run 'after_flush' hooks");
+ }
+
+ /*
+ If any commit_events are Xid_log_event, increase the number of pending
+ XIDs in current binlog (it's decreased in ::unlog()). When the count in
+ a (not active) binlog file reaches zero, we know that it is no longer
+ needed in XA recovery, and we can log a new binlog checkpoint event.
+ */
+ if (xid_count > 0)
+ {
+ mark_xids_active(binlog_id, xid_count);
+ }
+
+ if (rotate(false, &check_purge))
+ {
+ /*
+ If we fail to rotate, which thread should get the error?
+ We give the error to the leader, as any my_error() thrown inside
+ rotate() will have been registered for the leader THD.
+
+ However we must not return error from here - that would cause
+ ha_commit_trans() to abort and rollback the transaction, which would
+ leave an inconsistent state with the transaction committed in the
+ binlog but rolled back in the engine.
+
+ Instead set a flag so that we can return error later, from unlog(),
+ when the transaction has been safely committed in the engine.
+ */
+ leader->cache_mngr->delayed_error= true;
+ my_error(ER_ERROR_ON_WRITE, MYF(ME_ERROR_LOG), name, errno);
+ check_purge= false;
+ }
+ /* In case of binlog rotate, update the correct current binlog offset. */
+ commit_offset= my_b_write_tell(&log_file);
+ }
+
+ DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_after_binlog_sync");
+ mysql_mutex_lock(&LOCK_after_binlog_sync);
+ /*
+ We cannot unlock LOCK_log until we have locked LOCK_after_binlog_sync;
+ otherwise scheduling could allow the next group commit to run ahead of us,
+ messing up the order of commit_ordered() calls. But as soon as
+ LOCK_after_binlog_sync is obtained, we can let the next group commit start.
+ */
+ mysql_mutex_unlock(&LOCK_log);
+
+ DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
+
+ /*
+ Loop through threads and run the binlog_sync hook
+ */
+ {
+ mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
+ mysql_mutex_assert_not_owner(&LOCK_log);
+ mysql_mutex_assert_owner(&LOCK_after_binlog_sync);
+ mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
+
+ /* first/last are maintained but not read here; both are marked unused. */
+ bool first __attribute__((unused))= true;
+ bool last __attribute__((unused));
+ for (current= queue; current != NULL; current= current->next)
+ {
+ last= current->next == NULL;
+#ifdef HAVE_REPLICATION
+ if (likely(!current->error))
+ current->error=
+ repl_semisync_master.wait_after_sync(current->cache_mngr->
+ last_commit_pos_file,
+ current->cache_mngr->
+ last_commit_pos_offset);
+#endif
+ first= false;
+ }
+ }
+
+ DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
+
+ mysql_mutex_lock(&LOCK_commit_ordered);
+ DBUG_EXECUTE_IF("crash_before_engine_commit",
+ {
+ DBUG_SUICIDE();
+ });
+ last_commit_pos_offset= commit_offset;
+
+ /*
+ Unlock LOCK_after_binlog_sync only *after* LOCK_commit_ordered has been
+ acquired so that groups can not reorder for the different stages of
+ the group commit procedure.
+ */
+ mysql_mutex_unlock(&LOCK_after_binlog_sync);
+ DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_after_binlog_sync");
+ ++num_group_commits;
+
+ if (!opt_optimize_thread_scheduling)
+ {
+ /*
+ If we want to run commit_ordered() each in the transaction's own thread
+ context, then we need to mark the queue reserved; we need to finish all
+ threads in one group commit before the next group commit can be allowed
+ to proceed, and we cannot unlock a simple pthreads mutex in a different
+ thread from the one that locked it.
+ */
+
+ while (group_commit_queue_busy)
+ mysql_cond_wait(&COND_queue_busy, &LOCK_commit_ordered);
+ group_commit_queue_busy= TRUE;
+
+ /*
+ Set these so parent can run checkpoint_and_purge() in last thread.
+ (When using optimized thread scheduling, we run checkpoint_and_purge()
+ in this function, so parent does not need to and we need not set these
+ values).
+ */
+ last_in_queue->check_purge= check_purge;
+ last_in_queue->binlog_id= binlog_id;
+
+ /* Note that we return with LOCK_commit_ordered locked! */
+ DBUG_VOID_RETURN;
+ }
+
+ /*
+ Wakeup each participant waiting for our group commit, first calling the
+ commit_ordered() methods for any transactions doing 2-phase commit.
+ */
+ current= queue;
+ while (current != NULL)
+ {
+ group_commit_entry *next;
+
+ DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered");
+ ++num_commits;
+ if (current->cache_mngr->using_xa && likely(!current->error) &&
+ !DBUG_IF("skip_commit_ordered"))
+ {
+ mysql_mutex_lock(&current->thd->LOCK_thd_data);
+ run_commit_ordered(current->thd, current->all);
+ mysql_mutex_unlock(&current->thd->LOCK_thd_data);
+ }
+ current->thd->wakeup_subsequent_commits(current->error);
+
+ /*
+ Careful not to access current->next after waking up the other thread! As
+ it may change immediately after wakeup.
+ */
+ next= current->next;
+ if (current != leader) // Don't wake up ourself
+ {
+ if (current->queued_by_other)
+ current->thd->wait_for_commit_ptr->wakeup(current->error);
+ else
+ current->thd->signal_wakeup_ready();
+ }
+ current= next;
+ }
+ DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
+ mysql_mutex_unlock(&LOCK_commit_ordered);
+ DEBUG_SYNC(leader->thd, "commit_after_group_release_commit_ordered");
+
+ if (check_purge)
+ checkpoint_and_purge(binlog_id);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+  Write a single queued transaction or statement to the binary log.
+
+  The pieces are written in this order: the GTID event, the statement cache
+  (when used and non-empty), the transaction cache (when used and non-empty),
+  the end event, and the incident event (when present). Afterwards the error
+  state of both binlog caches is checked.
+
+  @param entry      Group commit entry describing what to write.
+  @param commit_id  Handed on to write_gtid_event().
+
+  @return 0 on success, ER_ERROR_ON_WRITE on any failure. Except when the
+          GTID event itself could not be written, entry->error_cache then
+          points at the cache involved, or is NULL when writing the end event
+          or the incident event failed.
+*/
+int
+MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
+                                         uint64 commit_id)
+{
+  binlog_cache_mngr *cache_mngr= entry->cache_mngr;
+  bool end_is_xid= (entry->end_event->get_type_code() == XID_EVENT);
+
+  DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_or_stmt");
+
+  if (write_gtid_event(entry->thd, is_prepared_xa(entry->thd),
+                       entry->using_trx_cache, commit_id,
+                       end_is_xid, entry->ro_1pc))
+    DBUG_RETURN(ER_ERROR_ON_WRITE);
+
+  /* The statement cache goes first. */
+  if (entry->using_stmt_cache && !cache_mngr->stmt_cache.empty())
+  {
+    if (write_cache(entry->thd, cache_mngr->get_binlog_cache_log(FALSE)))
+    {
+      entry->error_cache= &cache_mngr->stmt_cache.cache_log;
+      DBUG_RETURN(ER_ERROR_ON_WRITE);
+    }
+  }
+
+  /* Then the transaction cache. */
+  if (entry->using_trx_cache && !cache_mngr->trx_cache.empty())
+  {
+    DBUG_EXECUTE_IF("crash_before_writing_xid",
+                    {
+                      if ((write_cache(entry->thd,
+                                       cache_mngr->get_binlog_cache_log(TRUE))))
+                        DBUG_PRINT("info", ("error writing binlog cache"));
+                      else
+                        flush_and_sync(0);
+
+                      DBUG_PRINT("info", ("crashing before writing xid"));
+                      DBUG_SUICIDE();
+                    });
+
+    if (write_cache(entry->thd, cache_mngr->get_binlog_cache_log(TRUE)))
+    {
+      entry->error_cache= &cache_mngr->trx_cache.cache_log;
+      DBUG_RETURN(ER_ERROR_ON_WRITE);
+    }
+  }
+
+  DBUG_EXECUTE_IF("inject_error_writing_xid",
+                  {
+                    entry->error_cache= NULL;
+                    errno= 28;
+                    DBUG_RETURN(ER_ERROR_ON_WRITE);
+                  });
+
+  /* The end event terminates the group in the binlog. */
+  if (write_event(entry->end_event))
+  {
+    entry->error_cache= NULL;
+    DBUG_RETURN(ER_ERROR_ON_WRITE);
+  }
+  status_var_add(entry->thd->status_var.binlog_bytes_written,
+                 entry->end_event->data_written);
+
+  if (entry->incident_event && write_event(entry->incident_event))
+  {
+    entry->error_cache= NULL;
+    DBUG_RETURN(ER_ERROR_ON_WRITE);
+  }
+
+  /* Finally check whether either cache recorded an error. */
+  if (unlikely(cache_mngr->get_binlog_cache_log(FALSE)->error))
+  {
+    entry->error_cache= &cache_mngr->stmt_cache.cache_log;
+    DBUG_RETURN(ER_ERROR_ON_WRITE);
+  }
+  if (unlikely(cache_mngr->get_binlog_cache_log(TRUE)->error))
+  {
+    /* Error on read */
+    entry->error_cache= &cache_mngr->trx_cache.cache_log;
+    DBUG_RETURN(ER_ERROR_ON_WRITE);
+  }
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Hold back the group commit until enough commits have queued up, as
+  configured by binlog_commit_wait_count and binlog_commit_wait_usec.
+
+  The wait ends early as soon as a queued transaction is found to have
+  has_waiter set.
+
+  Both LOCK_log and LOCK_prepare_ordered must be held on entry and are held
+  again on return; if waiting is needed, both may be released and
+  re-acquired in between.
+*/
+
+void
+MYSQL_BIN_LOG::wait_for_sufficient_commits()
+{
+  size_t queued= 0;
+  group_commit_entry *scan;
+  group_commit_entry *seen_head;
+  struct timespec deadline;
+
+  mysql_mutex_assert_owner(&LOCK_log);
+  mysql_mutex_assert_owner(&LOCK_prepare_ordered);
+
+  /* First pass: maybe there is no need to wait at all. */
+  seen_head= group_commit_queue;
+  for (scan= seen_head; scan != NULL; scan= scan->next)
+  {
+    ++queued;
+    if (queued >= opt_binlog_commit_wait_count)
+    {
+      group_commit_trigger_count++;
+      return;
+    }
+    if (unlikely(scan->thd->has_waiter))
+    {
+      group_commit_trigger_lock_wait++;
+      return;
+    }
+  }
+
+  mysql_mutex_unlock(&LOCK_log);
+  set_timespec_nsec(deadline, (ulonglong)1000*opt_binlog_commit_wait_usec);
+
+  for (;;)
+  {
+    bool waiter_seen= false;
+    group_commit_entry *new_head;
+
+    if (mysql_cond_timedwait(&COND_prepare_ordered, &LOCK_prepare_ordered,
+                             &deadline) == ETIMEDOUT)
+    {
+      group_commit_trigger_timeout++;
+      break;
+    }
+    if (unlikely(seen_head->thd->has_waiter))
+    {
+      group_commit_trigger_lock_wait++;
+      break;
+    }
+
+    /* Only look at the entries that were queued since the previous scan. */
+    new_head= group_commit_queue;
+    for (scan= new_head; scan != NULL && scan != seen_head; scan= scan->next)
+    {
+      ++queued;
+      if (unlikely(scan->thd->has_waiter))
+      {
+        waiter_seen= true;
+        break;
+      }
+    }
+    if (waiter_seen)
+    {
+      group_commit_trigger_lock_wait++;
+      break;
+    }
+    if (queued >= opt_binlog_commit_wait_count)
+    {
+      group_commit_trigger_count++;
+      break;
+    }
+    seen_head= new_head;
+  }
+
+  /*
+    LOCK_log can be held for long periods (eg. I/O is done under it), whereas
+    LOCK_prepare_ordered must only be held briefly, so we must not block on
+    LOCK_log while holding LOCK_prepare_ordered.
+
+    Blocking that way would also break the LOCK_log-before-
+    LOCK_prepare_ordered locking order, which could cause SAFEMUTEX warnings
+    (even if it cannot actually deadlock with current code, as there can be
+    at most one group commit leader thread at a time).
+
+    Hence: try LOCK_log first, and only if that fails drop
+    LOCK_prepare_ordered and take both in the proper order.
+  */
+  if (mysql_mutex_trylock(&LOCK_log))
+  {
+    mysql_mutex_unlock(&LOCK_prepare_ordered);
+    mysql_mutex_lock(&LOCK_log);
+    mysql_mutex_lock(&LOCK_prepare_ordered);
+  }
+}
+
+
+/*
+  Flag the transaction at the head of the group commit queue as having a
+  waiter and signal COND_prepare_ordered. Does nothing when the queue is
+  empty.
+
+  LOCK_prepare_ordered must be held by the caller.
+*/
+void
+MYSQL_BIN_LOG::binlog_trigger_immediate_group_commit()
+{
+  group_commit_entry *queue_head;
+
+  mysql_mutex_assert_owner(&LOCK_prepare_ordered);
+  queue_head= group_commit_queue;
+  if (!queue_head)
+    return;
+
+  queue_head->thd->has_waiter= true;
+  mysql_cond_signal(&COND_prepare_ordered);
+}
+
+
+/*
+  Report that transaction T1 (thd1) is about to wait for transaction T2
+  (thd2).
+
+  The purpose is to cut short a binlog group commit delay caused by
+  --binlog-commit-wait-count when another transaction is stalled on the
+  wait due to conflicting row locks.
+
+  T2 is always flagged with has_waiter. If T2 is already waiting on group
+  commit, any waiting group commit is signalled to proceed immediately;
+  otherwise the flag makes immediate group commit trigger once T2 later
+  becomes ready.
+
+  Nothing is done when binlog_commit_wait_count is 0. thd1 is not used.
+*/
+void
+binlog_report_wait_for(THD *thd1, THD *thd2)
+{
+  if (opt_binlog_commit_wait_count != 0)
+  {
+    mysql_mutex_lock(&LOCK_prepare_ordered);
+    thd2->has_waiter= true;
+    if (thd2->waiting_on_group_commit)
+      mysql_bin_log.binlog_trigger_immediate_group_commit();
+    mysql_mutex_unlock(&LOCK_prepare_ordered);
+  }
+}
+
+
+/**
+  Block until COND_relay_log_updated is signalled, i.e. until the relay log
+  has been updated.
+
+  @param thd  Thread variable
+
+  @note
+    The caller must hold LOCK_log when calling this function.
+    The lock will have been released by the time the function returns!
+    That is required by THD::enter_cond() (see NOTES in sql_class.h).
+*/
+
+void MYSQL_BIN_LOG::wait_for_update_relay_log(THD* thd)
+{
+  PSI_stage_info saved_stage;
+  DBUG_ENTER("wait_for_update_relay_log");
+
+  mysql_mutex_assert_owner(&LOCK_log);
+
+  thd->ENTER_COND(&COND_relay_log_updated, &LOCK_log,
+                  &stage_slave_has_read_all_relay_log, &saved_stage);
+  mysql_cond_wait(&COND_relay_log_updated, &LOCK_log);
+  thd->EXIT_COND(&saved_stage);
+
+  DBUG_VOID_RETURN;
+}
+
+/**
+  Block until COND_bin_log_updated is signalled, i.e. until the binary log
+  has been updated. Applies to master only.
+
+  @param[in] thd      a THD struct
+  @param[in] timeout  a pointer to a timespec;
+                      NULL means to wait w/o timeout.
+
+  @retval 0      if got signalled on update (always when timeout is NULL)
+  @retval non-0  result of the timed wait, e.g. the timeout elapsed
+
+  @note
+    The mutex returned by get_binlog_end_pos_lock() must be held by the
+    caller. It is released while the thread is waiting, and it is up to the
+    caller to unlock it afterwards.
+*/
+
+int MYSQL_BIN_LOG::wait_for_update_binlog_end_pos(THD* thd,
+                                                  struct timespec *timeout)
+{
+  int wait_result;
+  DBUG_ENTER("wait_for_update_binlog_end_pos");
+
+  thd_wait_begin(thd, THD_WAIT_BINLOG);
+  mysql_mutex_assert_owner(get_binlog_end_pos_lock());
+  if (timeout)
+  {
+    wait_result= mysql_cond_timedwait(&COND_bin_log_updated,
+                                      get_binlog_end_pos_lock(), timeout);
+  }
+  else
+  {
+    mysql_cond_wait(&COND_bin_log_updated, get_binlog_end_pos_lock());
+    wait_result= 0;
+  }
+  thd_wait_end(thd);
+
+  DBUG_RETURN(wait_result);
+}
+
+
+/**
+ Close the log file.
+
+ @param exiting Bitmask for one or more of the following bits:
+ - LOG_CLOSE_INDEX : if we should close the index file
+ - LOG_CLOSE_TO_BE_OPENED : if we intend to call open
+ at once after close.
+ - LOG_CLOSE_STOP_EVENT : write a 'stop' event to the log
+ - LOG_CLOSE_DELAYED_CLOSE : do not yet close the file and clear the
+ LOG_EVENT_BINLOG_IN_USE_F flag
+
+ @note
+ One can do an open on the object at once after doing a close.
+ The internal structures are not freed until cleanup() is called
+
+ @note
+ LOCK_log must be held by the caller (asserted). On return, name has been
+ freed and set to NULL, and log_state is LOG_TO_BE_OPENED if
+ LOG_CLOSE_TO_BE_OPENED was given, LOG_CLOSED otherwise.
+*/
+
+void MYSQL_BIN_LOG::close(uint exiting)
+{ // One can't set log_type here!
+ /* Set when write_state_to_file() fails; keeps the in-use flag set. */
+ bool failed_to_save_state= false;
+ DBUG_ENTER("MYSQL_BIN_LOG::close");
+ DBUG_PRINT("enter",("exiting: %d", (int) exiting));
+
+ mysql_mutex_assert_owner(&LOCK_log);
+
+ if (log_state == LOG_OPENED)
+ {
+ DBUG_ASSERT(log_type == LOG_BIN);
+#ifdef HAVE_REPLICATION
+ if (exiting & LOG_CLOSE_STOP_EVENT)
+ {
+ Stop_log_event s;
+ // the checksumming rule for relay-log case is similar to Rotate
+ s.checksum_alg= is_relay_log ? relay_log_checksum_alg
+ : (enum_binlog_checksum_alg)binlog_checksum_options;
+ DBUG_ASSERT(!is_relay_log ||
+ relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
+ /* The results of write_event() and flush_io_cache() are not checked. */
+ write_event(&s);
+ bytes_written+= s.data_written;
+ flush_io_cache(&log_file);
+ update_binlog_end_pos();
+
+ /*
+ When we shut down server, write out the binlog state to a separate
+ file so we do not have to scan an entire binlog file to recover it
+ at next server start.
+
+ Note that this must be written and synced to disk before marking the
+ last binlog file as "not crashed".
+ */
+ if (!is_relay_log && write_state_to_file())
+ {
+ sql_print_error("Failed to save binlog GTID state during shutdown. "
+ "Binlog will be marked as crashed, so that crash "
+ "recovery can recover the state at next server "
+ "startup.");
+ /*
+ Leave binlog file marked as crashed, so we can recover state by
+ scanning it now that we failed to write out the state properly.
+ */
+ failed_to_save_state= true;
+ }
+ }
+#endif /* HAVE_REPLICATION */
+
+ /* don't pwrite in a file opened with O_APPEND - it doesn't work */
+ if (log_file.type == WRITE_CACHE && !(exiting & LOG_CLOSE_DELAYED_CLOSE))
+ {
+ my_off_t org_position= mysql_file_tell(log_file.file, MYF(0));
+ if (!failed_to_save_state)
+ clear_inuse_flag_when_closing(log_file.file);
+ /*
+ Restore position so that anything we have in the IO_cache is written
+ to the correct position.
+ We need the seek here, as mysql_file_pwrite() is not guaranteed to keep the
+ original position on system that doesn't support pwrite().
+ */
+ mysql_file_seek(log_file.file, org_position, MY_SEEK_SET, MYF(0));
+ }
+
+ /* this will cleanup IO_CACHE, sync and close the file */
+ MYSQL_LOG::close(exiting);
+ }
+
+ /*
+ The following test is needed even if is_open() is not set, as we may have
+ called a not complete close earlier and the index file is still open.
+ */
+
+ if ((exiting & LOG_CLOSE_INDEX) && my_b_inited(&index_file))
+ {
+ end_io_cache(&index_file);
+ /* Only the first write error is recorded and reported. */
+ if (unlikely(mysql_file_close(index_file.file, MYF(0)) < 0) &&
+ ! write_error)
+ {
+ write_error= 1;
+ sql_print_error(ER_DEFAULT(ER_ERROR_ON_WRITE), index_file_name, errno);
+ }
+ }
+ log_state= (exiting & LOG_CLOSE_TO_BE_OPENED) ? LOG_TO_BE_OPENED : LOG_CLOSED;
+ my_free(name);
+ name= NULL;
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+  Mark a binlog file as cleanly closed.
+
+  A single zero byte is written at BIN_LOG_HEADER_SIZE + FLAGS_OFFSET, which
+  clears LOG_EVENT_BINLOG_IN_USE_F so that the file is not considered to need
+  crash recovery.  The result of the write is not checked.
+
+  @param file  Descriptor of the binlog file to update
+*/
+void MYSQL_BIN_LOG::clear_inuse_flag_when_closing(File file)
+{
+  uchar cleared_flags= 0;                 // LOG_EVENT_BINLOG_IN_USE_F unset
+  const my_off_t flags_pos= BIN_LOG_HEADER_SIZE + FLAGS_OFFSET;
+  mysql_file_pwrite(file, &cleared_flags, 1, flags_pos, MYF(0));
+}
+
+
+/**
+  Set the maximum size of the binary log.
+
+  The new value is stored only while the log is open.
+
+  @param max_size_arg  New value for max_size
+*/
+void MYSQL_BIN_LOG::set_max_size(ulong max_size_arg)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::set_max_size");
+  /*
+    LOCK_log must be held while updating.  Without it new_file() could call
+    open(old_max_size), this function could then store max_size_arg, and
+    open() would afterwards overwrite it with the old value it was given,
+    as if the SET command had never been run.
+  */
+  mysql_mutex_lock(&LOCK_log);
+  const bool log_is_open= is_open();
+  if (log_is_open)
+    max_size= max_size_arg;
+  mysql_mutex_unlock(&LOCK_log);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Check whether a string consists of a (possibly signed, possibly decimal)
+  number and nothing else.
+
+  Accepted form: leading spaces, an optional '+' or '-', digits, and an
+  optional '.' followed by digits.  At least one digit is required and no
+  characters may follow.
+
+  @param str              String to test
+  @param res              If not NULL, receives atol() of the whole string
+  @param allow_wildcards  When set, the wild_many and wild_one characters
+                          are accepted wherever a digit is accepted
+
+  @note
+    Should be moved to some other file.
+
+  @retval
+    1  String is a number
+  @retval
+    0  String is not a number
+*/
+
+static bool test_if_number(const char *str, ulong *res, bool allow_wildcards)
+{
+  const char *const begin= str;
+  bool seen_digit= false;
+  DBUG_ENTER("test_if_number");
+
+  /* Skip leading spaces and at most one sign character. */
+  while (*str == ' ')
+    str++;
+  if (*str == '-' || *str == '+')
+    str++;
+
+  /* Integer part. */
+  while (my_isdigit(files_charset_info,*str) ||
+         (allow_wildcards && (*str == wild_many || *str == wild_one)))
+  {
+    seen_digit= true;
+    str++;
+  }
+
+  /* Optional fractional part. */
+  if (*str == '.')
+  {
+    str++;
+    while (my_isdigit(files_charset_info,*str) ||
+           (allow_wildcards && (*str == wild_many || *str == wild_one)))
+    {
+      str++;
+      seen_digit= true;
+    }
+  }
+
+  /* Trailing garbage, or no digit at all: not a number. */
+  if (*str != 0 || !seen_digit)
+    DBUG_RETURN(0);
+  if (res)
+    *res=atol(begin);
+  DBUG_RETURN(1);                               /* Number ok */
+} /* test_if_number */
+
+
+/**
+  Write a message to the error log together with a description of the most
+  recent system error.
+
+  On Windows the description comes from FormatMessage() for GetLastError();
+  if that fails only the message is logged.  Elsewhere strerror(errno) is
+  used when available, otherwise perror() is called.
+
+  @param message  Text to print in front of the system error description
+*/
+void sql_perror(const char *message)
+{
+#if defined(_WIN32)
+  char *sys_text;
+  const DWORD last_error= GetLastError();
+  const DWORD format_flags= FORMAT_MESSAGE_ALLOCATE_BUFFER |
+                            FORMAT_MESSAGE_FROM_SYSTEM |
+                            FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD chars_written=
+    FormatMessage(format_flags, NULL, last_error,
+                  MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+                  (LPSTR)&sys_text, 0, NULL );
+  if (chars_written == 0)
+  {
+    /* No system text available; log the bare message. */
+    sql_print_error("%s", message);
+    return;
+  }
+  sql_print_error("%s: %s",message, sys_text);
+  LocalFree((HLOCAL)sys_text);
+#elif defined(HAVE_STRERROR)
+  sql_print_error("%s: %s",message, strerror(errno));
+#else
+  perror(message);
+#endif
+}
+
+
+/*
+  Re-associate up to two output streams with a file, opened for appending.
+  Used to redirect stdout and stderr to a file.
+
+  @param filename   File both streams are reopened on
+  @param outstream  First stream to reopen, or NULL to skip it
+  @param errstream  Second stream to reopen, or NULL to skip it; it is made
+                    unbuffered on success
+
+  @retval FALSE  ok
+  @retval TRUE   a reopen failed; ER_CANT_CREATE_FILE has been raised
+*/
+bool reopen_fstreams(const char *filename, FILE *outstream, FILE *errstream)
+{
+  static constexpr const char *mode= "a" IF_WIN("t", );
+
+  /* The error stream is only attempted if the output stream succeeded. */
+  bool reopen_failed= outstream && !my_freopen(filename, mode, outstream);
+  if (!reopen_failed)
+    reopen_failed= errstream && !my_freopen(filename, mode, errstream);
+
+  if (reopen_failed)
+  {
+    my_error(ER_CANT_CREATE_FILE, MYF(0), filename, errno);
+    return TRUE;
+  }
+
+  /* The error stream must be unbuffered. */
+  if (errstream)
+    setbuf(errstream, NULL);
+
+  return FALSE;
+}
+
+
+/*
+  Redirect stdout and stderr to the given file.
+
+  Unfortunately, there seems to be no good way
+  to restore the original streams upon failure.
+
+  @retval FALSE  ok; stderr is left unbuffered
+  @retval TRUE   reopen_fstreams() failed
+*/
+static bool redirect_std_streams(const char *file)
+{
+  const bool reopen_failed= reopen_fstreams(file, stdout, stderr);
+  if (!reopen_failed)
+    setbuf(stderr, NULL);
+  return reopen_failed;
+}
+
+
+/**
+  Reopen the error log file on stdout/stderr.
+
+  Does nothing when opt_error_log is not set.  The redirect is performed
+  while holding LOCK_error_log.
+
+  @retval 0  ok (or nothing to do)
+  @retval 1  redirecting the standard streams failed
+*/
+bool flush_error_log()
+{
+  if (!opt_error_log)
+    return 0;
+
+  mysql_mutex_lock(&LOCK_error_log);
+  const bool redirect_failed= redirect_std_streams(log_error_file);
+  mysql_mutex_unlock(&LOCK_error_log);
+  return redirect_failed;
+}
+
+#ifdef _WIN32
+/*
+  Owner of the Windows event log source handle.  The source is registered
+  under the name "MariaDB" on construction and deregistered on destruction.
+*/
+struct eventlog_source
+{
+  HANDLE handle;
+
+  eventlog_source()
+  {
+    setup_windows_event_source();
+    handle= RegisterEventSource(NULL, "MariaDB");
+  }
+
+  ~eventlog_source()
+  {
+    if (!handle)
+      return;                                   // registration had failed
+    DeregisterEventSource(handle);
+  }
+};
+
+/* File-scope instance used by print_buffer_to_nt_eventlog(). */
+static eventlog_source eventlog;
+
+/*
+  Send an already formatted message to the Windows event log.
+
+  The message is terminated in place with "\r\n\r\n"; if it does not fit,
+  the last characters of the buffer are overwritten.  Nothing is reported
+  when the event source handle is not set, or for a level other than
+  ERROR_LEVEL, WARNING_LEVEL and INFORMATION_LEVEL.
+
+  @param level    Severity of the message
+  @param buff     Message buffer (modified)
+  @param length   Length of the message in buff
+  @param buffLen  Total size of buff
+*/
+static void print_buffer_to_nt_eventlog(enum loglevel level, char *buff,
+                                        size_t length, size_t buffLen)
+{
+  char *message= buff;
+  HANDLE source= eventlog.handle;
+  WORD event_type= 0;
+  bool level_is_reportable= true;
+  DBUG_ENTER("print_buffer_to_nt_eventlog");
+
+  /* Add ending CR/LF's to string, overwrite last chars if necessary */
+  strmov(message+MY_MIN(length, buffLen-5), "\r\n\r\n");
+
+  /* Translate the server log level into the matching event type. */
+  switch (level) {
+  case ERROR_LEVEL:
+    event_type= EVENTLOG_ERROR_TYPE;
+    break;
+  case WARNING_LEVEL:
+    event_type= EVENTLOG_WARNING_TYPE;
+    break;
+  case INFORMATION_LEVEL:
+    event_type= EVENTLOG_INFORMATION_TYPE;
+    break;
+  default:
+    level_is_reportable= false;
+    break;
+  }
+
+  if (source && level_is_reportable)
+    ReportEvent(source, event_type, 0, MSG_DEFAULT, NULL, 1, 0,
+                (LPCSTR*) &message, NULL);
+
+  DBUG_VOID_RETURN;
+}
+#endif /* _WIN32 */
+
+
+#ifndef EMBEDDED_LIBRARY
+/*
+  Write one formatted line to stderr: local timestamp, thread id, level,
+  optional connection tag, then the message.
+
+  The write is done under LOCK_error_log and stderr is flushed afterwards.
+  With WITH_WSREP, messages of WARNING_LEVEL or more severe are also passed
+  to Wsrep_status::report_log_msg().
+
+  @param level   Severity of the message
+  @param buffer  Message text
+  @param length  Number of bytes of buffer to print
+*/
+static void print_buffer_to_file(enum loglevel level, const char *buffer,
+                                 size_t length)
+{
+  THD *thd= 0;
+  char tag[NAME_LEN];
+  size_t tag_length= 0;
+  time_t now;
+  struct tm local_tm;
+  const char *level_name;
+  DBUG_ENTER("print_buffer_to_file");
+  DBUG_PRINT("enter",("buffer: %s", buffer));
+
+  if (mysqld_server_initialized && (thd= current_thd) &&
+      thd->connection_name.length)
+  {
+    /*
+      Add tag for slaves so that the user can see from which connection
+      the error originates.
+    */
+    tag_length= my_snprintf(tag, sizeof(tag),
+                            ER_THD(thd, ER_MASTER_LOG_PREFIX),
+                            (int) thd->connection_name.length,
+                            thd->connection_name.str);
+  }
+
+  if (level == ERROR_LEVEL)
+    level_name= "ERROR";
+  else if (level == WARNING_LEVEL)
+    level_name= "Warning";
+  else
+    level_name= "Note";
+
+  mysql_mutex_lock(&LOCK_error_log);
+
+  now= my_time(0);
+  localtime_r(&now, &local_tm);
+
+  fprintf(stderr, "%d-%02d-%02d %2d:%02d:%02d %lu [%s] %.*s%.*s\n",
+          local_tm.tm_year + 1900,
+          local_tm.tm_mon+1,
+          local_tm.tm_mday,
+          local_tm.tm_hour,
+          local_tm.tm_min,
+          local_tm.tm_sec,
+          (unsigned long) (thd ? thd->thread_id : 0),
+          level_name,
+          (int) tag_length, tag,
+          (int) length, buffer);
+
+  fflush(stderr);
+
+#ifdef WITH_WSREP
+  if (level <= WARNING_LEVEL)
+  {
+    wsrep::reporter::log_level const report_level = (level <= ERROR_LEVEL ?
+                                                     wsrep::reporter::error :
+                                                     wsrep::reporter::warning);
+    Wsrep_status::report_log_msg(report_level, tag, tag_length, buffer, length,
+                                 now);
+  }
+#endif /* WITH_WSREP */
+
+  mysql_mutex_unlock(&LOCK_error_log);
+  DBUG_VOID_RETURN;
+}
+
+/**
+  Format a printf style message and write it to the error log and, on
+  Windows, to the Windows event log as well.
+
+  The message is formatted into a fixed 1024 byte buffer, which is then
+  handed to the individual output routines.
+
+  @param level   The level of the msg significance
+  @param format  Printf style format of message
+  @param args    va_list list of arguments for the message
+
+  @returns
+    The function always returns 0. The return value is present in the
+    signature to be compatible with other logging routines, which could
+    return an error (e.g. logging to the log tables)
+*/
+int vprint_msg_to_log(enum loglevel level, const char *format, va_list args)
+{
+  char message[1024];
+  DBUG_ENTER("vprint_msg_to_log");
+
+  const size_t message_length= my_vsnprintf(message, sizeof(message),
+                                            format, args);
+  print_buffer_to_file(level, message, message_length);
+
+#ifdef _WIN32
+  print_buffer_to_nt_eventlog(level, message, message_length,
+                              sizeof(message));
+#endif
+
+  DBUG_RETURN(0);
+}
+#endif /* EMBEDDED_LIBRARY */
+
+
+/**
+  Log a printf style message at ERROR_LEVEL through error_log_print().
+
+  @param format  Printf style format, followed by its arguments
+*/
+void sql_print_error(const char *format, ...)
+{
+  va_list arg_list;
+  DBUG_ENTER("sql_print_error");
+
+  va_start(arg_list, format);
+  error_log_print(ERROR_LEVEL, format, arg_list);
+  va_end(arg_list);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Log a printf style message at WARNING_LEVEL through error_log_print().
+
+  @param format  Printf style format, followed by its arguments
+*/
+void sql_print_warning(const char *format, ...)
+{
+  va_list arg_list;
+  DBUG_ENTER("sql_print_warning");
+
+  va_start(arg_list, format);
+  error_log_print(WARNING_LEVEL, format, arg_list);
+  va_end(arg_list);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Log a printf style informational message; variadic front end for
+  sql_print_information_v().
+
+  @param format  Printf style format, followed by its arguments
+*/
+void sql_print_information(const char *format, ...)
+{
+  va_list arg_list;
+  DBUG_ENTER("sql_print_information");
+
+  va_start(arg_list, format);
+  sql_print_information_v(format, arg_list);
+  va_end(arg_list);
+
+  DBUG_VOID_RETURN;
+}
+
+/**
+  Log a printf style message at INFORMATION_LEVEL, unless notes are
+  currently disabled.
+
+  @param format  Printf style format
+  @param ap      Arguments for the format
+*/
+void sql_print_information_v(const char *format, va_list ap)
+{
+  /* Notes are skipped during start/shutdown. */
+  if (!disable_log_notes)
+    error_log_print(INFORMATION_LEVEL, format, ap);
+}
+
+/**
+  Call the prepare_ordered() hook of every engine taking part in the
+  transaction that provides one.
+
+  The caller must hold LOCK_prepare_ordered.
+
+  @param thd  Connection whose engine list is walked
+  @param all  true to use the transaction-level engine list, false to use
+              the statement-level one; also passed on to the hook
+*/
+void
+TC_LOG::run_prepare_ordered(THD *thd, bool all)
+{
+  mysql_mutex_assert_owner(&LOCK_prepare_ordered);
+
+  for (Ha_trx_info *participant= all ? thd->transaction->all.ha_list
+                                     : thd->transaction->stmt.ha_list;
+       participant;
+       participant= participant->next())
+  {
+    handlerton *engine= participant->ht();
+    if (engine->prepare_ordered)
+      engine->prepare_ordered(engine, thd, all);
+  }
+}
+
+
+/**
+  Call the commit_ordered() hook of every engine taking part in the
+  transaction that provides one.
+
+  The caller must hold LOCK_commit_ordered.  After each hook call a debug
+  injection point and a debug sync point are evaluated.
+
+  @param thd  Connection whose engine list is walked
+  @param all  true to use the transaction-level engine list, false to use
+              the statement-level one; also passed on to the hook
+*/
+void
+TC_LOG::run_commit_ordered(THD *thd, bool all)
+{
+  mysql_mutex_assert_owner(&LOCK_commit_ordered);
+
+  for (Ha_trx_info *participant= all ? thd->transaction->all.ha_list
+                                     : thd->transaction->stmt.ha_list;
+       participant;
+       participant= participant->next())
+  {
+    handlerton *engine= participant->ht();
+    if (engine->commit_ordered)
+    {
+      engine->commit_ordered(engine, thd, all);
+      DBUG_EXECUTE_IF("enable_log_write_upto_crash",
+        {
+          DBUG_SET_INITIAL("+d,crash_after_log_write_upto");
+          sleep(1000);
+        });
+      DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
+    }
+  }
+}
+
+
+/**
+ Log one transaction in the mmap based tc log and run the engines'
+ prepare_ordered() / commit_ordered() hooks around it.
+
+ When both hooks are requested, threads are queued while holding
+ LOCK_prepare_ordered so that commit_ordered() later runs in the same
+ sequence as prepare_ordered() did.
+
+ @param thd Connection whose transaction is committed
+ @param xid Transaction id to log; nothing is logged when 0
+ @param all Passed on to the hooks (transaction vs statement)
+ @param need_prepare_ordered Run run_prepare_ordered()
+ @param need_commit_ordered Run run_commit_ordered()
+
+ @return The cookie from log_one_transaction(); 0 if xid is 0, if logging
+ failed, or if thd->wait_for_prior_commit() returned non-zero.
+*/
+int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
+ bool need_prepare_ordered,
+ bool need_commit_ordered)
+{
+ int cookie;
+ struct commit_entry entry;
+ /* Only assigned, and only read, when both need_* flags are set. */
+ bool UNINIT_VAR(is_group_commit_leader);
+
+ if (need_prepare_ordered)
+ {
+ mysql_mutex_lock(&LOCK_prepare_ordered);
+ run_prepare_ordered(thd, all);
+ if (need_commit_ordered)
+ {
+ /*
+ Must put us in queue so we can run_commit_ordered() in same sequence
+ as we did run_prepare_ordered().
+ */
+ thd->clear_wakeup_ready();
+ entry.thd= thd;
+ /* Push onto the head of the list; the first one to push is the leader. */
+ commit_entry *previous_queue= commit_ordered_queue;
+ entry.next= previous_queue;
+ commit_ordered_queue= &entry;
+ is_group_commit_leader= (previous_queue == NULL);
+ }
+ mysql_mutex_unlock(&LOCK_prepare_ordered);
+ }
+
+ /*
+ NOTE(review): when both need_* flags are set, 'entry' (a stack variable)
+ is already linked into commit_ordered_queue at this point, and this early
+ return leaves it there. Confirm that wait_for_prior_commit() cannot fail
+ in that configuration.
+ */
+ if (thd->wait_for_prior_commit())
+ return 0;
+
+ cookie= 0;
+ if (xid)
+ cookie= log_one_transaction(xid);
+
+ if (need_commit_ordered)
+ {
+ if (need_prepare_ordered)
+ {
+ /*
+ We did the run_prepare_ordered() serialised, then ran the log_xid() in
+ parallel. Now we have to do run_commit_ordered() serialised in the
+ same sequence as run_prepare_ordered().
+
+ We do this starting from the head of the queue, each thread doing
+ run_commit_ordered() and signalling the next in queue.
+ */
+ if (is_group_commit_leader)
+ {
+ /* The first in queue starts the ball rolling. */
+ mysql_mutex_lock(&LOCK_prepare_ordered);
+ /* Wait until the previous batch has been fully processed. */
+ while (commit_ordered_queue_busy)
+ mysql_cond_wait(&COND_queue_busy, &LOCK_prepare_ordered);
+ /* Detach the whole list; later arrivals start a new queue. */
+ commit_entry *queue= commit_ordered_queue;
+ commit_ordered_queue= NULL;
+ /*
+ Mark the queue busy while we bounce it from one thread to the
+ next.
+ */
+ commit_ordered_queue_busy= true;
+ mysql_mutex_unlock(&LOCK_prepare_ordered);
+
+ /* Reverse the queue list so we get correct order. */
+ commit_entry *prev= NULL;
+ while (queue)
+ {
+ commit_entry *next= queue->next;
+ queue->next= prev;
+ prev= queue;
+ queue= next;
+ }
+ /* After reversal the leader's own entry is the head of the list. */
+ DBUG_ASSERT(prev == &entry);
+ DBUG_ASSERT(prev->thd == thd);
+ }
+ else
+ {
+ /* Not first in queue; just wait until previous thread wakes us up. */
+ thd->wait_for_wakeup_ready();
+ }
+ }
+
+ /* Only run commit_ordered() if log_xid was successful. */
+ if (cookie)
+ {
+ mysql_mutex_lock(&LOCK_commit_ordered);
+ run_commit_ordered(thd, all);
+ mysql_mutex_unlock(&LOCK_commit_ordered);
+ }
+
+ if (need_prepare_ordered)
+ {
+ /* Hand over to the next queued thread, or release the queue if last. */
+ commit_entry *next= entry.next;
+ if (next)
+ {
+ next->thd->signal_wakeup_ready();
+ }
+ else
+ {
+ mysql_mutex_lock(&LOCK_prepare_ordered);
+ commit_ordered_queue_busy= false;
+ mysql_cond_signal(&COND_queue_busy);
+ mysql_mutex_unlock(&LOCK_prepare_ordered);
+ }
+ }
+ }
+
+ return cookie;
+}
+
+
+/********* transaction coordinator log for 2pc - mmap() based solution *******/
+
+/*
+ the log consists of a file, mapped to memory.
+ file is divided into pages of tc_log_page_size size.
+ (usable size of the first page is smaller because of the log header)
+ there is a PAGE control structure for each page
+ each page (or rather its PAGE control structure) can be in one of
+ the three states - active, syncing, pool.
+ there could be only one page in the active or syncing state,
+ but many in pool - pool is a fifo queue.
+ the usual lifecycle of a page is pool->active->syncing->pool.
+ the "active" page is a page where new xid's are logged.
+ the page stays active as long as the syncing slot is taken.
+ the "syncing" page is being synced to disk. no new xid can be added to it.
+ when the syncing is done the page is moved to a pool and an active page
+ becomes "syncing".
+
+ the result of such an architecture is a natural "commit grouping" -
+ If commits are coming faster than the system can sync, they do not
+ stall. Instead, all commits that came since the last sync are
+ logged to the same "active" page, and they all are synced with the next -
+ one - sync. Thus, though individual commits are delayed, throughput
+ is not decreasing.
+
+ when an xid is added to an active page, the thread of this xid waits
+ for a page's condition until the page is synced. when syncing slot
+ becomes vacant one of these waiters is awakened to take care of syncing.
+ it syncs the page and signals all waiters that the page is synced.
+ PAGE::waiters is used to count these waiters, and a page may never
+ become active again until waiters==0 (that is all waiters from the
+ previous sync have noticed that the sync was completed)
+
+ note, that the page becomes "dirty" and has to be synced only when a
+ new xid is added into it. Removing a xid from a page does not make it
+ dirty - we don't sync xid removals to disk.
+*/
+
+/* Incremented each time TC_LOG_MMAP::overflow() has to wait for a page. */
+ulong tc_log_page_waits= 0;
+
+#ifdef HAVE_MMAP
+
+/*
+ Size of the log header: the magic signature followed by one byte, which
+ TC_LOG_MMAP::open() sets to total_ha_2pc.
+*/
+#define TC_LOG_HEADER_SIZE (sizeof(tc_log_magic)+1)
+
+/* Signature stored at the start of the log; verified by recover(). */
+static const uchar tc_log_magic[]={(uchar) 254, 0x23, 0x05, 0x74};
+
+/* File size used when TC_LOG_MMAP::open() creates a new log file. */
+ulong opt_tc_log_size;
+/*
+ Page usage counters, and the page size (set from my_getpagesize() in
+ TC_LOG_MMAP::open()).
+*/
+ulong tc_log_max_pages_used=0, tc_log_page_size=0, tc_log_cur_pages_used=0;
+
+/**
+  Open (or create) the memory-mapped transaction coordinator log.
+
+  If the file does not exist it is created with a length of
+  opt_tc_log_size.  If it already exists this is treated as a restart
+  after a crash, and recover() is run on its contents before the header is
+  rewritten.
+
+  The 'inited' member is advanced after each resource has been acquired, so
+  that close() - which is called on the error path - releases exactly what
+  has been set up so far.
+
+  @param opt_name  Name of the log file; must be a non-empty string
+
+  @retval 0  ok
+  @retval 1  error, or heuristic recovery was requested (in which case the
+             log is not opened)
+*/
+int TC_LOG_MMAP::open(const char *opt_name)
+{
+  uint i;
+  bool crashed=FALSE;
+  PAGE *pg;
+
+  DBUG_ASSERT(total_ha_2pc > 1);
+  DBUG_ASSERT(opt_name);
+  DBUG_ASSERT(opt_name[0]);
+
+  tc_log_page_size= my_getpagesize();
+
+  fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME);
+  if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR | O_CLOEXEC, MYF(0))) < 0)
+  {
+    /* Only a missing file is acceptable here; it is then created. */
+    if (my_errno != ENOENT)
+      goto err;
+    if (using_heuristic_recover())
+      return 1;
+    if ((fd= mysql_file_create(key_file_tclog, logname, CREATE_MODE,
+                               O_RDWR | O_CLOEXEC, MYF(MY_WME))) < 0)
+      goto err;
+    inited=1;
+    file_length= opt_tc_log_size;
+    if (mysql_file_chsize(fd, file_length, 0, MYF(MY_WME)))
+      goto err;
+  }
+  else
+  {
+    /* An existing log file means the server did not shut down cleanly. */
+    inited= 1;
+    crashed= TRUE;
+    sql_print_information("Recovering after a crash using %s", opt_name);
+    if (tc_heuristic_recover)
+    {
+      sql_print_error("Cannot perform automatic crash recovery when "
+                      "--tc-heuristic-recover is used");
+      goto err;
+    }
+    file_length= mysql_file_seek(fd, 0L, MY_SEEK_END, MYF(MY_WME+MY_FAE));
+    /* The file must consist of a whole number of pages. */
+    if (file_length == MY_FILEPOS_ERROR || file_length % tc_log_page_size)
+      goto err;
+  }
+
+  data= (uchar *)my_mmap(0, (size_t)file_length, PROT_READ|PROT_WRITE,
+                         MAP_NOSYNC|MAP_SHARED, fd, 0);
+  if (data == MAP_FAILED)
+  {
+    my_errno=errno;
+    goto err;
+  }
+  inited=2;
+
+  /*
+    Divide first, then narrow: casting file_length to uint before the
+    division would truncate the length and give a wrong page count for
+    files whose size does not fit in a uint.
+  */
+  npages=(uint)(file_length/tc_log_page_size);
+  if (npages < 3)             // to guarantee non-empty pool
+    goto err;
+  if (!(pages=(PAGE *)my_malloc(key_memory_TC_LOG_MMAP_pages,
+                                npages*sizeof(PAGE), MYF(MY_WME|MY_ZEROFILL))))
+    goto err;
+  inited=3;
+  /* Set up one control structure per page and chain them together. */
+  for (pg=pages, i=0; i < npages; i++, pg++)
+  {
+    pg->next=pg+1;
+    pg->waiters=0;
+    pg->state=PS_POOL;
+    mysql_mutex_init(key_PAGE_lock, &pg->lock, MY_MUTEX_INIT_FAST);
+    mysql_cond_init(key_PAGE_cond, &pg->cond, 0);
+    pg->ptr= pg->start=(my_xid *)(data + i*tc_log_page_size);
+    pg->size=pg->free=tc_log_page_size/sizeof(my_xid);
+    pg->end=pg->start + pg->size;
+  }
+  /* The first page is smaller: its beginning holds the log header. */
+  pages[0].size=pages[0].free=
+    (tc_log_page_size-TC_LOG_HEADER_SIZE)/sizeof(my_xid);
+  pages[0].start=pages[0].end-pages[0].size;
+  pages[npages-1].next=0;
+  inited=4;
+
+  if (crashed && recover())
+    goto err;
+
+  /* Write the header: magic signature plus the number of 2pc engines. */
+  memcpy(data, tc_log_magic, sizeof(tc_log_magic));
+  data[sizeof(tc_log_magic)]= (uchar)total_ha_2pc;
+  my_msync(fd, data, tc_log_page_size, MS_SYNC);
+  inited=5;
+
+  mysql_mutex_init(key_LOCK_sync, &LOCK_sync, MY_MUTEX_INIT_FAST);
+  mysql_mutex_init(key_LOCK_active, &LOCK_active, MY_MUTEX_INIT_FAST);
+  mysql_mutex_init(key_LOCK_pool, &LOCK_pool, MY_MUTEX_INIT_FAST);
+  mysql_mutex_init(key_LOCK_pending_checkpoint, &LOCK_pending_checkpoint,
+                   MY_MUTEX_INIT_FAST);
+  mysql_cond_init(key_COND_active, &COND_active, 0);
+  mysql_cond_init(key_COND_pool, &COND_pool, 0);
+  mysql_cond_init(key_TC_LOG_MMAP_COND_queue_busy, &COND_queue_busy, 0);
+
+  inited=6;
+
+  /* Page 0 starts out active; all remaining pages form the pool. */
+  syncing= 0;
+  active=pages;
+  DBUG_ASSERT(npages >= 2);
+  pool=pages+1;
+  pool_last_ptr= &((pages+npages-1)->next);
+  commit_ordered_queue= NULL;
+  commit_ordered_queue_busy= false;
+
+  return 0;
+
+err:
+  close();
+  return 1;
+}
+
+/**
+ there is no active page, let's get one from the pool.
+
+ Two strategies here:
+ -# take the first from the pool
+ -# if there're waiters - take the one with the most free space.
+
+ The caller must hold LOCK_active (asserted below). On return the chosen
+ page is stored in 'active', is unlinked from the pool, and its lock is
+ held.
+
+ @todo
+ page merging. try to allocate adjacent page first,
+ so that they can be flushed both in one sync
+*/
+
+void TC_LOG_MMAP::get_active_from_pool()
+{
+ PAGE **p, **best_p=0;
+ int best_free;
+
+ mysql_mutex_lock(&LOCK_pool);
+
+ do
+ {
+ best_p= p= &pool;
+ if ((*p)->waiters == 0 && (*p)->free > 0) // can the first page be used ?
+ break; // yes - take it.
+
+ best_free=0; // no - trying second strategy
+ for (p=&(*p)->next; *p; p=&(*p)->next)
+ {
+ if ((*p)->waiters == 0 && (*p)->free > best_free)
+ {
+ best_free=(*p)->free;
+ best_p=p;
+ }
+ }
+ }
+ /* Nothing usable found: overflow() waits on COND_pool, then we rescan. */
+ while ((*best_p == 0 || best_free == 0) && overflow());
+
+ mysql_mutex_assert_owner(&LOCK_active);
+ active=*best_p;
+
+ /* Unlink the page from the pool. */
+ if (!(*best_p)->next)
+ pool_last_ptr= best_p;
+ *best_p=(*best_p)->next;
+ mysql_mutex_unlock(&LOCK_pool);
+
+ /* The page lock is still held when this function returns. */
+ mysql_mutex_lock(&active->lock);
+ if (active->free == active->size) // we've chosen an empty page
+ {
+ tc_log_cur_pages_used++;
+ set_if_bigger(tc_log_max_pages_used, tc_log_cur_pages_used);
+ }
+}
+
+/**
+  Handle the case where no page in the pool can be used: count the event
+  in tc_log_page_waits and wait on COND_pool (with LOCK_pool) until
+  signalled.
+
+  @todo
+    perhaps, increase log size ?
+    let's check the behaviour of tc_log_page_waits first
+
+  @return always 1
+*/
+int TC_LOG_MMAP::overflow()
+{
+  /* simple overflow handling - just wait */
+  ++tc_log_page_waits;
+  mysql_cond_wait(&COND_pool, &LOCK_pool);
+  return 1;
+}
+
+/**
+ Record that transaction XID is committed on the persistent storage.
+
+ This function is called in the middle of two-phase commit:
+ First all resources prepare the transaction, then tc_log->log() is called,
+ then all resources commit the transaction, then tc_log->unlog() is called.
+
+ All access to active page is serialized but it's not a problem, as
+ we're assuming that fsync() will be a main bottleneck.
+ That is, parallelizing writes to log pages we'll decrease number of
+ threads waiting for a page, but then all these threads will be waiting
+ for a fsync() anyway
+
+ If tc_log == MYSQL_LOG then tc_log writes transaction to binlog and
+ records XID in a special Xid_log_event.
+ If tc_log = TC_LOG_MMAP then xid is written in a special memory-mapped
+ log.
+
+ @param xid Transaction id to store in the active page
+
+ @retval
+ 0 - error
+ @retval
+ \# - otherwise, "cookie", a number that will be passed as an argument
+ to unlog() call. tc_log can define it any way it wants,
+ and use for whatever purposes. TC_LOG_MMAP sets it
+ to the offset from the start of the mapped data where the
+ xid was logged.
+*/
+
+int TC_LOG_MMAP::log_one_transaction(my_xid xid)
+{
+ int err;
+ PAGE *p;
+ ulong cookie;
+
+ mysql_mutex_lock(&LOCK_active);
+
+ /*
+ if the active page is full - just wait...
+ frankly speaking, active->free here accessed outside of mutex
+ protection, but it's safe, because it only means we may miss an
+ unlog() for the active page, and we're not waiting for it here -
+ unlog() does not signal COND_active.
+ */
+ while (unlikely(active && active->free == 0))
+ mysql_cond_wait(&COND_active, &LOCK_active);
+
+ /* no active page ? take one from the pool */
+ if (active == 0)
+ get_active_from_pool();
+ else
+ mysql_mutex_lock(&active->lock);
+
+ /* In both branches active->lock is held from here on. */
+ p=active;
+
+ /*
+ p->free is always > 0 here because to decrease it one needs
+ to take p->lock and before it one needs to take LOCK_active.
+ But checked that active->free > 0 under LOCK_active and
+ haven't release it ever since
+ */
+
+ /* searching for an empty slot */
+ while (*p->ptr)
+ {
+ p->ptr++;
+ DBUG_ASSERT(p->ptr < p->end); // because p->free > 0
+ }
+
+ /* found! store xid there and mark the page dirty */
+ cookie= (ulong)((uchar *)p->ptr - data); // can never be zero
+ *p->ptr++= xid;
+ p->free--;
+ p->state= PS_DIRTY;
+ mysql_mutex_unlock(&p->lock);
+
+ mysql_mutex_lock(&LOCK_sync);
+ if (syncing)
+ { // somebody's syncing. let's wait
+ mysql_mutex_unlock(&LOCK_active);
+ mysql_mutex_lock(&p->lock);
+ p->waiters++;
+ /*
+ Wait until either our page has been synced (state is no longer
+ PS_DIRTY) or the syncing slot is free. p->lock is dropped around the
+ wait, which uses LOCK_sync.
+ */
+ while (p->state == PS_DIRTY && syncing)
+ {
+ mysql_mutex_unlock(&p->lock);
+ mysql_cond_wait(&p->cond, &LOCK_sync);
+ mysql_mutex_lock(&p->lock);
+ }
+ p->waiters--;
+ err= p->state == PS_ERROR;
+ if (p->state != PS_DIRTY) // page was synced
+ {
+ mysql_mutex_unlock(&LOCK_sync);
+ if (p->waiters == 0)
+ mysql_cond_signal(&COND_pool); // in case somebody's waiting
+ mysql_mutex_unlock(&p->lock);
+ goto done; // we're done
+ }
+ /* Page is still dirty and the slot is vacant: this thread syncs it. */
+ DBUG_ASSERT(!syncing);
+ mysql_mutex_unlock(&p->lock);
+ syncing = p;
+ mysql_mutex_unlock(&LOCK_sync);
+
+ mysql_mutex_lock(&LOCK_active);
+ active=0; // page is not active anymore
+ mysql_cond_broadcast(&COND_active);
+ mysql_mutex_unlock(&LOCK_active);
+ }
+ else
+ {
+ syncing = p; // place is vacant - take it
+ mysql_mutex_unlock(&LOCK_sync);
+ active = 0; // page is not active anymore
+ mysql_cond_broadcast(&COND_active);
+ mysql_mutex_unlock(&LOCK_active);
+ }
+ err= sync();
+
+done:
+ /*
+ NOTE(review): cookie is a ulong but the return type is int; confirm the
+ offset always fits.
+ */
+ return err ? 0 : cookie;
+}
+
+/**
+ Sync the page in the 'syncing' slot to disk, append it to the pool, and
+ free the syncing slot.
+
+ Waiters on the synced page are woken with a broadcast; if there is an
+ active page, one of its waiters is signalled so it can become the next
+ syncer.
+
+ @return the result of my_msync(); on non-zero the page state is set to
+ PS_ERROR instead of PS_POOL.
+*/
+int TC_LOG_MMAP::sync()
+{
+ int err;
+
+ DBUG_ASSERT(syncing != active);
+
+ /*
+ sit down and relax - this can take a while...
+ note - no locks are held at this point
+ */
+ err= my_msync(fd, syncing->start, syncing->size * sizeof(my_xid), MS_SYNC);
+
+ /* page is synced. let's move it to the pool */
+ mysql_mutex_lock(&LOCK_pool);
+ /* Append at the tail of the pool list. */
+ (*pool_last_ptr)=syncing;
+ pool_last_ptr=&(syncing->next);
+ syncing->next=0;
+ syncing->state= err ? PS_ERROR : PS_POOL;
+ mysql_cond_signal(&COND_pool); // in case somebody's waiting
+ mysql_mutex_unlock(&LOCK_pool);
+
+ /* marking 'syncing' slot free */
+ mysql_mutex_lock(&LOCK_sync);
+ mysql_cond_broadcast(&syncing->cond); // signal "sync done"
+ syncing=0;
+ /*
+ we check the "active" pointer without LOCK_active. Still, it's safe -
+ "active" can change from NULL to not NULL any time, but it
+ will take LOCK_sync before waiting on active->cond. That is, it can never
+ miss a signal.
+ And "active" can change to NULL only by the syncing thread
+ (the thread that will send a signal below)
+ */
+ if (active)
+ mysql_cond_signal(&active->cond); // wake up a new syncer
+ mysql_mutex_unlock(&LOCK_sync);
+ return err;
+}
+
+/*
+  Callback handed to ha_commit_checkpoint_request() by
+  TC_LOG_MMAP::unlog(): adds one to the pending_count of the batch.
+
+  @param data  The TC_LOG_MMAP::pending_cookies batch, passed as void*
+*/
+static void
+mmap_do_checkpoint_callback(void *data)
+{
+  TC_LOG_MMAP::pending_cookies *batch=
+    static_cast<TC_LOG_MMAP::pending_cookies *>(data);
+  batch->pending_count++;
+}
+
+/**
+  Queue a logged xid for removal from the tc log.
+
+  The entry is not deleted immediately, as there may be participating
+  storage engines which implement commit_checkpoint_request(), and thus
+  have not yet flushed the commit durably to disk.  Cookies are instead
+  collected in a batch; once the batch is full a checkpoint is requested
+  from the engines, and the whole batch is deleted when all of them have
+  notified.
+
+  @param cookie  Value returned when the xid was logged
+  @param xid     The xid stored at that position (checked in debug builds)
+
+  @retval 0  ok
+  @retval 1  out of memory while allocating a new batch
+*/
+int TC_LOG_MMAP::unlog(ulong cookie, my_xid xid)
+{
+  uint32 batch_capacity= tc_log_page_size / sizeof(my_xid);
+  pending_cookies *filled_batch= NULL;
+  DBUG_ASSERT(*(my_xid *)(data+cookie) == xid);
+
+  mysql_mutex_lock(&LOCK_pending_checkpoint);
+
+  /* Start a new, zero-filled batch if there is none in progress. */
+  if (!pending_checkpoint)
+  {
+    uint32 alloc_size= sizeof(*pending_checkpoint) +
+                       sizeof(ulong) * (batch_capacity - 1);
+    pending_checkpoint= (pending_cookies *)my_malloc(PSI_INSTRUMENT_ME,
+                                                     alloc_size,
+                                                     MYF(MY_ZEROFILL));
+    if (!pending_checkpoint)
+    {
+      my_error(ER_OUTOFMEMORY, MYF(0), alloc_size);
+      mysql_mutex_unlock(&LOCK_pending_checkpoint);
+      return 1;
+    }
+  }
+
+  pending_checkpoint->cookies[pending_checkpoint->count]= cookie;
+  pending_checkpoint->count++;
+
+  /* A full batch is detached here and processed outside the mutex. */
+  if (pending_checkpoint->count == batch_capacity)
+  {
+    filled_batch= pending_checkpoint;
+    pending_checkpoint= NULL;
+  }
+  mysql_mutex_unlock(&LOCK_pending_checkpoint);
+
+  if (!filled_batch)
+    return 0;
+
+  /*
+    We do an extra increment and notify here - this ensures that
+    things work also if there are no engines at all that support
+    commit_checkpoint_request.
+  */
+  ++filled_batch->pending_count;
+  ha_commit_checkpoint_request(filled_batch, mmap_do_checkpoint_callback);
+  commit_checkpoint_notify(filled_batch);
+  return 0;
+}
+
+
+/**
+  Account for one checkpoint notification on a batch of cookies.
+
+  pending_count is decremented under LOCK_pending_checkpoint.  When it
+  reaches zero, every cookie slot of the batch is passed to delete_entry()
+  and the batch is freed.
+
+  @param cookie  The pending_cookies batch, passed as void*
+*/
+void
+TC_LOG_MMAP::commit_checkpoint_notify(void *cookie)
+{
+  pending_cookies *batch= static_cast<pending_cookies *>(cookie);
+  uint remaining;
+
+  mysql_mutex_lock(&LOCK_pending_checkpoint);
+  DBUG_ASSERT(batch->pending_count > 0);
+  remaining= --batch->pending_count;
+  mysql_mutex_unlock(&LOCK_pending_checkpoint);
+
+  if (remaining != 0)
+    return;                                     // notifications outstanding
+
+  for (uint slot= 0; slot < tc_log_page_size / sizeof(my_xid); ++slot)
+    delete_entry(batch->cookies[slot]);
+  my_free(batch);
+}
+
+
+/**
+  Erase an xid from its page and update the page's free space counter and
+  search pointer.
+
+  @param cookie  Offset from the start of the mapped data to the xid slot
+
+  @return always 0
+*/
+
+int TC_LOG_MMAP::delete_entry(ulong cookie)
+{
+  my_xid *slot=(my_xid *)(data+cookie);
+  PAGE *page=pages+(cookie/tc_log_page_size);
+
+  DBUG_ASSERT(slot >= page->start);
+  DBUG_ASSERT(slot < page->end);
+
+  mysql_mutex_lock(&page->lock);
+  *slot=0;
+  page->free++;
+  DBUG_ASSERT(page->free <= page->size);
+  /* Let the next search for an empty slot start no later than this one. */
+  set_if_smaller(page->ptr, slot);
+
+  const bool page_is_empty= (page->free == page->size);
+  if (page_is_empty)
+    statistic_decrement(tc_log_cur_pages_used, &LOCK_status);
+
+  if (page->waiters == 0)                 // the page is in pool and ready to rock
+    mysql_cond_signal(&COND_pool);        // ping ... for overflow()
+  mysql_mutex_unlock(&page->lock);
+  return 0;
+}
+
+/**
+  Release what open() acquired, in reverse order of acquisition.
+
+  'inited' records how far open() got; each case of the switch undoes one
+  step and falls through to the earlier ones.  If the header had been
+  written (inited >= 5) the log file is deleted as well.  Any partly
+  filled batch of pending cookies is freed.
+
+  The function leaves inited at 0 and pending_checkpoint at NULL, so that
+  calling it again is harmless.
+*/
+void TC_LOG_MMAP::close()
+{
+  uint i;
+  switch (inited) {
+  case 6:
+    mysql_mutex_destroy(&LOCK_sync);
+    mysql_mutex_destroy(&LOCK_active);
+    mysql_mutex_destroy(&LOCK_pool);
+    mysql_mutex_destroy(&LOCK_pending_checkpoint);
+    mysql_cond_destroy(&COND_pool);
+    mysql_cond_destroy(&COND_active);
+    mysql_cond_destroy(&COND_queue_busy);
+    /* fall through */
+  case 5:
+    data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
+    /* fall through */
+  case 4:
+    for (i=0; i < npages; i++)
+    {
+      if (pages[i].ptr == 0)
+        break;
+      mysql_mutex_destroy(&pages[i].lock);
+      mysql_cond_destroy(&pages[i].cond);
+    }
+    /* fall through */
+  case 3:
+    my_free(pages);
+    /* fall through */
+  case 2:
+    my_munmap((char*)data, (size_t)file_length);
+    /* fall through */
+  case 1:
+    mysql_file_close(fd, MYF(0));
+  }
+  if (inited>=5) // cannot do in the switch because of Windows
+    mysql_file_delete(key_file_tclog, logname, MYF(MY_WME));
+  if (pending_checkpoint)
+  {
+    my_free(pending_checkpoint);
+    /*
+      This free is not gated by 'inited', so the pointer must not be left
+      dangling: a later close() would otherwise free it a second time.
+    */
+    pending_checkpoint= NULL;
+  }
+  inited=0;
+}
+
+
+/**
+  Recover from the contents of an existing tc log.
+
+  Verifies the header, collects every non-zero xid found in the pages into
+  a hash, passes the hash to ha_recover(), and finally zero-fills the
+  whole mapped file.
+
+  @retval 0  ok
+  @retval 1  error; a message describing the options has been logged
+*/
+int TC_LOG_MMAP::recover()
+{
+  HASH xids;
+  PAGE *page, *pages_end=pages+npages;
+
+  if (bcmp(data, tc_log_magic, sizeof(tc_log_magic)) != 0)
+  {
+    sql_print_error("Bad magic header in tc log");
+    goto err1;
+  }
+
+  /*
+    the first byte after magic signature is set to current
+    number of storage engines on startup
+  */
+  if (data[sizeof(tc_log_magic)] > total_ha_2pc)
+  {
+    sql_print_error("Recovery failed! You must enable "
+                    "all engines that were enabled at the moment of the crash");
+    goto err1;
+  }
+
+  if (my_hash_init(PSI_INSTRUMENT_ME, &xids, &my_charset_bin,
+                   tc_log_page_size/3, 0, sizeof(my_xid), 0, 0, MYF(0)))
+    goto err1;
+
+  /* Gather every xid still present in the log. */
+  for (page= pages; page < pages_end; page++)
+  {
+    my_xid *slot= page->start;
+    while (slot < page->end)
+    {
+      if (*slot && my_hash_insert(&xids, (uchar *)slot))
+        goto err2; // OOM
+      slot++;
+    }
+  }
+
+  if (ha_recover(&xids))
+    goto err2;
+
+  my_hash_free(&xids);
+  bzero(data, (size_t)file_length);
+  return 0;
+
+err2:
+  my_hash_free(&xids);
+err1:
+  sql_print_error("Crash recovery failed. Either correct the problem "
+                  "(if it's, for example, out of memory error) and restart, "
+                  "or delete tc log and start server with "
+                  "--tc-heuristic-recover={commit|rollback}");
+  return 1;
+}
+#endif
+
+/* Pointer to a transaction coordinator log object; not assigned in this part of the file. */
+TC_LOG *tc_log;
+/* Global instances of two transaction coordinator log classes. */
+TC_LOG_DUMMY tc_log_dummy;
+TC_LOG_MMAP tc_log_mmap;
+
+/**
+  Perform heuristic recovery, if --tc-heuristic-recover was used.
+
+  Recovery is attempted through ha_recover(0); a failure is logged but
+  does not change the return value.
+
+  @note
+    no matter whether heuristic recovery was successful or not
+    mysqld must exit. So, return value is the same in both cases.
+
+  @retval
+    0  no heuristic recovery was requested
+  @retval
+    1  heuristic recovery was performed
+*/
+
+int TC_LOG::using_heuristic_recover()
+{
+  if (tc_heuristic_recover)
+  {
+    sql_print_information("Heuristic crash recovery mode");
+    if (ha_recover(0))
+      sql_print_error("Heuristic crash recovery failed");
+    sql_print_information("Please restart without --tc-heuristic-recover");
+    return 1;
+  }
+  return 0;
+}
+
+/****** transaction coordinator log for 2pc - binlog() based solution ******/
+/* TC_LOG_BINLOG is another name for the MYSQL_BIN_LOG class. */
+#define TC_LOG_BINLOG MYSQL_BIN_LOG
+
+/**
+ Truncates the current binlog to specified position. Removes the rest of binlogs
+ which are present after this binlog file.
+
+ @param file_name Holds the binlog name to be truncated
+ @param pos Position within binlog from where it needs to
+ be truncated.
+ @param ptr_gtid GTID reported in the log message as the first transaction
+ removed by the truncation
+
+ @retval true ok
+ @retval false error
+
+*/
+bool MYSQL_BIN_LOG::truncate_and_remove_binlogs(const char *file_name,
+ my_off_t pos,
+ rpl_gtid *ptr_gtid)
+{
+ int error= 0;
+#ifdef HAVE_REPLICATION
+ LOG_INFO log_info;
+ THD *thd= current_thd;
+ my_off_t index_file_offset= 0;
+ File file= -1;
+ MY_STAT s;
+ my_off_t old_size;
+
+ if ((error= find_log_pos(&log_info, file_name, 1)))
+ {
+ sql_print_error("Failed to locate binary log file:%s."
+ "Error:%d", file_name, error);
+ goto end;
+ }
+
+ while (!(error= find_next_log(&log_info, 1)))
+ {
+ if (!index_file_offset)
+ {
+ index_file_offset= log_info.index_file_start_offset;
+ if ((error= open_purge_index_file(TRUE)))
+ {
+ sql_print_error("Failed to open purge index "
+ "file:%s. Error:%d", purge_index_file_name, error);
+ goto end;
+ }
+ }
+ if ((error= register_purge_index_entry(log_info.log_file_name)))
+ {
+ sql_print_error("Failed to copy %s to purge index"
+ " file. Error:%d", log_info.log_file_name, error);
+ goto end;
+ }
+ }
+
+ if (error != LOG_INFO_EOF)
+ {
+ sql_print_error("Failed to find the next binlog to "
+ "add to purge index register. Error:%d", error);
+ goto end;
+ }
+
+ if (is_inited_purge_index_file())
+ {
+ if (!index_file_offset)
+ index_file_offset= log_info.index_file_start_offset;
+
+ if ((error= sync_purge_index_file()))
+ {
+ sql_print_error("Failed to flush purge index "
+ "file. Error:%d", error);
+ goto end;
+ }
+
+ // Trim index file
+ error= mysql_file_chsize(index_file.file, index_file_offset, '\n',
+ MYF(MY_WME));
+ if (!error)
+ error= mysql_file_sync(index_file.file, MYF(MY_WME));
+ if (error)
+ {
+ sql_print_error("Failed to truncate binlog index "
+ "file:%s to offset:%llu. Error:%d", index_file_name,
+ index_file_offset, error);
+ goto end;
+ }
+
+ /* Reset data in old index cache */
+ if ((error= reinit_io_cache(&index_file, READ_CACHE, (my_off_t) 0, 0, 1)))
+ {
+ sql_print_error("Failed to reinit binlog index "
+ "file. Error:%d", error);
+ goto end;
+ }
+
+ /* Read each entry from purge_index_file and delete the file. */
+ if ((error= purge_index_entry(thd, NULL, TRUE)))
+ {
+ sql_print_error("Failed to process registered "
+ "files that would be purged.");
+ goto end;
+ }
+ }
+
+ DBUG_ASSERT(pos);
+
+ if ((file= mysql_file_open(key_file_binlog, file_name,
+ O_RDWR | O_BINARY, MYF(MY_WME))) < 0)
+ {
+ error= 1;
+ sql_print_error("Failed to open binlog file:%s for "
+ "truncation.", file_name);
+ goto end;
+ }
+ my_stat(file_name, &s, MYF(0));
+ old_size= s.st_size;
+ clear_inuse_flag_when_closing(file);
+ /* Change binlog file size to truncate_pos */
+ error= mysql_file_chsize(file, pos, 0, MYF(MY_WME));
+ if (!error)
+ error= mysql_file_sync(file, MYF(MY_WME));
+ if (error)
+ {
+ sql_print_error("Failed to truncate the "
+ "binlog file:%s to size:%llu. Error:%d",
+ file_name, pos, error);
+ goto end;
+ }
+ else
+ {
+ char buf[21];
+ longlong10_to_str(ptr_gtid->seq_no, buf, 10);
+ sql_print_information("Successfully truncated binlog file:%s "
+ "from previous file size %llu "
+ "to pos:%llu to remove transactions starting from "
+ "GTID %u-%u-%s",
+ file_name, old_size, pos,
+ ptr_gtid->domain_id, ptr_gtid->server_id, buf);
+ }
+
+end:
+ if (file >= 0)
+ mysql_file_close(file, MYF(MY_WME));
+
+ error= error || close_purge_index_file();
+#endif
+ return error > 0;
+}
+/*
+  Opens the binlog as transaction coordinator log.
+
+  Returns 1 when the index file is not usable or when heuristic recovery
+  was requested (the caller is expected to stop in both cases), otherwise
+  the result of do_binlog_recovery().
+*/
+int TC_LOG_BINLOG::open(const char *opt_name)
+{
+  DBUG_ENTER("TC_LOG_BINLOG::open");
+
+  DBUG_ASSERT(total_ha_2pc > 1);
+  DBUG_ASSERT(opt_name);
+  DBUG_ASSERT(opt_name[0]);
+
+  /* Without an opened index file the binlog can't be opened either. */
+  const bool index_usable= my_b_inited(&index_file);
+
+  if (index_usable && !using_heuristic_recover())
+  {
+    const int recovery_result= do_binlog_recovery(opt_name, true);
+    binlog_state_recover_done= true;
+    DBUG_RETURN(recovery_result);
+  }
+
+  if (index_usable)
+  {
+    /* Heuristic recovery was done: generate a new binlog to mask a
+       corrupted one. */
+    mysql_mutex_lock(&LOCK_log);
+    open(opt_name, 0, 0, WRITE_CACHE, max_binlog_size, 0, TRUE);
+    mysql_mutex_unlock(&LOCK_log);
+  }
+
+  cleanup();
+  DBUG_RETURN(1);
+}
+
+/**
+ This is called on shutdown, after ha_panic.
+ The binlog based transaction coordinator has nothing to do at this
+ point, hence the empty body.
+*/
+void TC_LOG_BINLOG::close()
+{
+}
+
+/*
+  Do a binlog log_xid() for a group of transactions, linked through
+  thd->next_commit_ordered.
+
+  Returns 0 on failure (a zero cookie signals error), otherwise a non-zero
+  cookie: a dummy one when there is no xid or nothing needs unlog(), or a
+  cookie made of the binlog id. Both kinds carry the delayed error flag.
+*/
+int
+TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all,
+                             bool need_prepare_ordered __attribute__((unused)),
+                             bool need_commit_ordered __attribute__((unused)))
+{
+  DBUG_ENTER("TC_LOG_BINLOG::log_and_order");
+
+  binlog_cache_mngr *mngr= thd->binlog_setup_trx_data();
+  if (!mngr)
+  {
+    WSREP_DEBUG("Skipping empty log_xid: %s", thd->query());
+    DBUG_RETURN(0);
+  }
+
+  mngr->using_xa= TRUE;
+  mngr->xa_xid= xid;
+  const int flush_err= binlog_commit_flush_xid_caches(thd, mngr, all, xid);
+
+  DEBUG_SYNC(thd, "binlog_after_log_and_order");
+
+  if (flush_err)
+    DBUG_RETURN(0);
+
+  /*
+    Remember the flag and clear it: the transaction won't need it anymore.
+    Todo/fixme: consider to move the statement into cache_mngr->reset()
+                relocated to the current or later point.
+  */
+  const bool had_need_unlog= mngr->need_unlog;
+  mngr->need_unlog= false;
+
+  /*
+    With explicit user XA there is no XID, yet the returned cookie must
+    still be non-zero, so a dummy cookie is handed out in that case.
+  */
+  if (xid && had_need_unlog)
+    DBUG_RETURN(BINLOG_COOKIE_MAKE(mngr->binlog_id, mngr->delayed_error));
+
+  DBUG_RETURN(BINLOG_COOKIE_DUMMY(mngr->delayed_error));
+}
+
+/*
+  Crash recovery only looks at the latest binlog, so a binlog file holding
+  logged XIDs must be kept as the current one until those XIDs are fully
+  committed in the storage engine; no prepared-but-uncommitted transaction
+  may be outstanding when the binlog is rotated.
+
+  A per-binlog count of outstanding XIDs serves that purpose. This function
+  adds xid_count to the count of the binlog identified by binlog_id; it is
+  used when committing one or more transactions to the binary log.
+*/
+void
+TC_LOG_BINLOG::mark_xids_active(ulong binlog_id, uint xid_count)
+{
+  DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
+  DBUG_PRINT("info", ("binlog_id=%lu xid_count=%u", binlog_id, xid_count));
+
+  mysql_mutex_lock(&LOCK_xid_list);
+
+  /* Walk the list up to the entry of the requested binlog. */
+  I_List_iterator<xid_count_per_binlog> iter(binlog_xid_count_list);
+  xid_count_per_binlog *entry= iter++;
+  while (entry && entry->binlog_id != binlog_id)
+    entry= iter++;
+
+  if (entry)
+    entry->xid_count+= xid_count;
+
+  /*
+    Elements are not deleted until their count reaches zero, so the entry
+    is always expected to be found.
+  */
+  DBUG_ASSERT(entry);
+
+  mysql_mutex_unlock(&LOCK_xid_list);
+  DBUG_VOID_RETURN;
+}
+
+/*
+ Once an XID is committed, it can no longer be needed during crash recovery,
+ as it has been durably recorded on disk as "committed".
+
+ This function is called to mark an XID this way. It needs to decrease the
+ count of pending XIDs in the corresponding binlog. When the count reaches
+ zero (for an "old" binlog that is not the active one), that binlog file no
+ longer needs to be scanned during crash recovery, so we can log a new binlog
+ checkpoint.
+
+ Arguments:
+ binlog_id id of the binlog whose pending XID count is decremented
+ write_checkpoint when false, the function returns right after the
+ decrement (or after signalling a pending RESET MASTER)
+ and never logs a binlog checkpoint event
+*/
+void
+TC_LOG_BINLOG::mark_xid_done(ulong binlog_id, bool write_checkpoint)
+{
+ xid_count_per_binlog *b;
+ bool first; // stays true only if the matching entry is the first in the list
+ ulong current; // snapshot of current_binlog_id taken under LOCK_xid_list
+
+ DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
+
+ mysql_mutex_lock(&LOCK_xid_list);
+ current= current_binlog_id;
+ I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
+ first= true;
+ while ((b= it++))
+ {
+ if (b->binlog_id == binlog_id)
+ {
+ --b->xid_count;
+
+ DBUG_ASSERT(b->xid_count >= 0); // catch unmatched (++) decrement
+
+ break;
+ }
+ first= false;
+ }
+ /* Binlog is always found, as we do not remove until count reaches 0 */
+ DBUG_ASSERT(b);
+ /*
+ If a RESET MASTER is pending, we are about to remove all log files, and
+ the RESET MASTER thread is waiting for all pending unlog() calls to
+ complete while holding LOCK_log. In this case we should not log a binlog
+ checkpoint event (it would be deleted immediately anyway and we would
+ deadlock on LOCK_log) but just signal the thread.
+ */
+ if (unlikely(reset_master_pending))
+ {
+ mysql_cond_broadcast(&COND_xid_list);
+ mysql_mutex_unlock(&LOCK_xid_list);
+ DBUG_VOID_RETURN;
+ }
+
+ if (likely(binlog_id == current) || b->xid_count != 0 || !first ||
+ !write_checkpoint)
+ {
+ /*
+ No new binlog checkpoint reached yet.
+ That is: the entry belongs to the current binlog, or it still has
+ pending XIDs, or it is not the first entry of the list, or the caller
+ did not ask for a checkpoint to be written.
+ */
+ mysql_mutex_unlock(&LOCK_xid_list);
+ DBUG_VOID_RETURN;
+ }
+
+ /*
+ Now log a binlog checkpoint for the first binlog file with a non-zero count.
+
+ Note that it is possible (though perhaps unlikely) that when count of
+ binlog (N-2) drops to zero, binlog (N-1) is already at zero. So we may
+ need to skip several entries before we find the one to log in the binlog
+ checkpoint event.
+
+ We chain the locking of LOCK_xid_list and LOCK_log, so that we ensure that
+ Binlog_checkpoint_events are logged in order. This simplifies recovery a
+ bit, as it can just take the last binlog checkpoint in the log, rather
+ than compare all found against each other to find the one pointing to the
+ most recent binlog.
+
+ Note also that we need to first release LOCK_xid_list, then acquire
+ LOCK_log, then re-acquire LOCK_xid_list. If we were to take LOCK_log while
+ holding LOCK_xid_list, we might deadlock with other threads that take the
+ locks in the opposite order.
+ */
+
+ ++mark_xid_done_waiting;
+ mysql_mutex_unlock(&LOCK_xid_list);
+ mysql_mutex_lock(&LOCK_log);
+ mysql_mutex_lock(&LOCK_xid_list);
+ --mark_xid_done_waiting;
+ mysql_cond_broadcast(&COND_xid_list);
+ /* We need to reload current_binlog_id due to release/re-take of lock. */
+ current= current_binlog_id;
+
+ for (;;)
+ {
+ /* Remove initial element(s) with zero count. */
+ b= binlog_xid_count_list.head();
+ /*
+ We must not remove all elements in the list - the entry for the current
+ binlog must be present always.
+ */
+ DBUG_ASSERT(b);
+ if (b->binlog_id == current || b->xid_count > 0)
+ break;
+ WSREP_XID_LIST_ENTRY("TC_LOG_BINLOG::mark_xid_done(): Removing "
+ "xid_list_entry for %s (%lu)", b);
+ delete binlog_xid_count_list.get();
+ }
+
+ /* b is the head entry the loop stopped at; LOCK_log is still held below. */
+ mysql_mutex_unlock(&LOCK_xid_list);
+ write_binlog_checkpoint_event_already_locked(b->binlog_name,
+ b->binlog_name_len);
+ mysql_mutex_unlock(&LOCK_log);
+ DBUG_VOID_RETURN;
+}
+
+/*
+  Marks the xid behind `cookie` as done, unless the cookie is a dummy one.
+  Nothing is done and 0 is returned when xid is zero; otherwise the error
+  flag carried by the cookie is returned.
+*/
+int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
+{
+  DBUG_ENTER("TC_LOG_BINLOG::unlog");
+
+  if (xid)
+  {
+    if (!BINLOG_COOKIE_IS_DUMMY(cookie))
+      mark_xid_done(BINLOG_COOKIE_GET_ID(cookie), true);
+    /*
+      A failure of rotate() is reported with a delay, from here - see the
+      comment in trx_group_commit_leader().
+    */
+    DBUG_RETURN(BINLOG_COOKIE_GET_ERROR_FLAG(cookie));
+  }
+
+  DBUG_RETURN(0);
+}
+
+/*
+  Thin wrapper: forwards to binlog_commit_flush_xa_prepare() passing `true`
+  as its second argument and returns the result converted to bool.
+*/
+static bool write_empty_xa_prepare(THD *thd, binlog_cache_mngr *cache_mngr)
+{
+  const bool flush_result= binlog_commit_flush_xa_prepare(thd, true,
+                                                          cache_mngr);
+  return flush_result;
+}
+
+/*
+  Completes the binlog part of XA PREPARE.
+
+  When the transaction has been binlogged (need_unlog is set) the pending
+  xid is unlogged. Otherwise, when some engine branch claims to be
+  read-write, an empty XA-prepare event group is written first.
+
+  @param thd  the connection that is preparing the XA transaction
+  @param all  not referenced in this function
+
+  @return 0 on success, non-zero on error
+*/
+int TC_LOG_BINLOG::unlog_xa_prepare(THD *thd, bool all)
+{
+  DBUG_ASSERT(is_preparing_xa(thd));
+
+  binlog_cache_mngr *cache_mngr= thd->binlog_setup_trx_data();
+  int cookie= 0;
+
+  /*
+    binlog_setup_trx_data() may return NULL (log_and_order() checks for
+    that as well); report an error rather than dereference it.
+    NOTE(review): presumably NULL means an allocation failure - confirm.
+  */
+  if (!cache_mngr)
+    return 1;
+
+  if (!cache_mngr->need_unlog)
+  {
+    Ha_trx_info *ha_info;
+    uint rw_count= ha_count_rw_all(thd, &ha_info);
+    bool rc= false;
+
+    /*
+      This transaction has not been binlogged as indicated by need_unlog.
+      Such exceptional cases include transactions with no effect to engines,
+      e.g REPLACE that does not change the data but still the Engine
+      transaction branch claims to be rw, and few more.
+      In all such cases an empty XA-prepare group of events is bin-logged.
+    */
+    if (rw_count > 0)
+    {
+      /* an empty XA-prepare event group is logged */
+      rc= write_empty_xa_prepare(thd, cache_mngr); // normally gains need_unlog
+      trans_register_ha(thd, true, binlog_hton, 0); // do it for future commit
+      thd->ha_data[binlog_hton->slot].ha_info[1].set_trx_read_write();
+    }
+    /* Nothing to unlog: no rw branch, or the write did not set need_unlog. */
+    if (rw_count == 0 || !cache_mngr->need_unlog)
+      return rc;
+  }
+
+  cookie= BINLOG_COOKIE_MAKE(cache_mngr->binlog_id, cache_mngr->delayed_error);
+  cache_mngr->need_unlog= false;
+
+  /* unlog() ignores a zero xid, so a non-zero value is passed. */
+  return unlog(cookie, 1);
+}
+
+
+/*
+  Hands a commit checkpoint notification over to the binlog background
+  thread. `cookie` is the xid_count_per_binlog entry the notification is
+  about: it is pushed to the head of the background thread queue, or, when
+  it is queued already, its notify_count is incremented instead.
+*/
+void
+TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
+{
+  xid_count_per_binlog *const notified=
+    static_cast<xid_count_per_binlog *>(cookie);
+
+  mysql_mutex_lock(&LOCK_binlog_background_thread);
+
+  /* Look whether the very same entry has been queued already. */
+  xid_count_per_binlog *cursor= binlog_background_thread_queue;
+  while (cursor && cursor != notified)
+    cursor= cursor->next_in_queue;
+
+  if (cursor)
+  {
+    /* count the same notification kind from different engines */
+    notified->notify_count++;
+  }
+  else
+  {
+    notified->next_in_queue= binlog_background_thread_queue;
+    binlog_background_thread_queue= notified;
+  }
+
+  mysql_cond_signal(&COND_binlog_background_thread);
+  mysql_mutex_unlock(&LOCK_binlog_background_thread);
+}
+
+/*
+ Binlog background thread.
+
+ This thread is used to log binlog checkpoints in the background, rather than
+ in the context of random storage engine threads that happen to call
+ commit_checkpoint_notify_ha() and may not like the delays while syncing
+ binlog to disk or may not be setup with all my_thread_init() and other
+ necessary stuff.
+
+ In the future, this thread could also be used to do log rotation in the
+ background, which could eliminate all stalls around binlog rotations.
+
+ Outline of the thread function:
+ - create a THD for the thread and, with HAVE_REPLICATION, load the slave
+ GTID state;
+ - set binlog_background_thread_started and signal the starter;
+ - loop: wait for queued entries or a stop request, detach the whole
+ queue and call mark_xid_done() for every notification in it;
+ - on stop: delete the THD, reset binlog_background_thread_stop and
+ signal COND_binlog_background_thread_end.
+
+ The argument is unused; the function always returns 0.
+*/
+pthread_handler_t
+binlog_background_thread(void *arg __attribute__((unused)))
+{
+ bool stop;
+ MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next;
+ THD *thd;
+ my_thread_init();
+ DBUG_ENTER("binlog_background_thread");
+
+ thd= new THD(next_thread_id());
+ thd->system_thread= SYSTEM_THREAD_BINLOG_BACKGROUND;
+ thd->thread_stack= (char*) &thd; /* Set approximate stack start */
+ thd->store_globals();
+ thd->security_ctx->skip_grants();
+ thd->set_command(COM_DAEMON);
+ THD_count::count--; // undone by the matching increment right before `delete thd`
+
+ /*
+ Load the slave replication GTID state from the mysql.gtid_slave_pos
+ table.
+
+ This is mostly so that we can start our seq_no counter from the highest
+ seq_no seen by a slave. This way, we have a way to tell if a transaction
+ logged by ourselves as master is newer or older than a replicated
+ transaction.
+ */
+#ifdef HAVE_REPLICATION
+ if (rpl_load_gtid_slave_state(thd))
+ sql_print_warning("Failed to load slave replication state from table "
+ "%s.%s: %u: %s", "mysql",
+ rpl_gtid_slave_state_table_name.str,
+ thd->get_stmt_da()->sql_errno(),
+ thd->get_stmt_da()->message());
+#endif
+
+ /* Tell start_binlog_background_thread() that the startup part is done. */
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ binlog_background_thread_started= true;
+ mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ for (;;)
+ {
+ /*
+ Wait until there is something in the queue to process, or we are asked
+ to shut down.
+ */
+ THD_STAGE_INFO(thd, stage_binlog_waiting_background_tasks);
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ for (;;)
+ {
+ stop= binlog_background_thread_stop;
+ queue= binlog_background_thread_queue;
+ if (stop && !mysql_bin_log.is_xidlist_idle())
+ {
+ /*
+ Delay stop until all pending binlog checkpoints have been processed.
+ */
+ stop= false;
+ }
+ if (stop || queue)
+ break;
+ mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread,
+ &mysql_bin_log.LOCK_binlog_background_thread);
+ }
+ /*
+ Grab the queue, if any.
+ The local `queue` pointer keeps the detached list; it is processed
+ below after the mutex has been released.
+ */
+ binlog_background_thread_queue= NULL;
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ /* Process any incoming commit_checkpoint_notify() calls. */
+#ifdef ENABLED_DEBUG_SYNC
+ DBUG_EXECUTE_IF("inject_binlog_background_thread_before_mark_xid_done",
+ DBUG_ASSERT(!debug_sync_set_action(
+ thd,
+ STRING_WITH_LEN("binlog_background_thread_before_mark_xid_done "
+ "SIGNAL injected_binlog_background_thread "
+ "WAIT_FOR something_that_will_never_happen "
+ "TIMEOUT 2")));
+ );
+#endif
+ while (queue)
+ {
+ long count= queue->notify_count;
+ THD_STAGE_INFO(thd, stage_binlog_processing_checkpoint_notify);
+ DEBUG_SYNC(thd, "binlog_background_thread_before_mark_xid_done");
+ /* Set the thread start time */
+ thd->set_time();
+ /* Grab next pointer first, as mark_xid_done() may free the element. */
+ next= queue->next_in_queue;
+ queue->notify_count= 0;
+ /*
+ notify_count holds only the notifications that arrived while the entry
+ was already queued (see commit_checkpoint_notify()), hence the `<=`:
+ one more call is made for the notification that queued the entry.
+ */
+ for (long i= 0; i <= count; i++)
+ mysql_bin_log.mark_xid_done(queue->binlog_id, true);
+ queue= next;
+
+#ifdef ENABLED_DEBUG_SYNC
+ DBUG_EXECUTE_IF("binlog_background_checkpoint_processed",
+ DBUG_ASSERT(!debug_sync_set_action(
+ thd,
+ STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed")));
+ );
+#endif
+ }
+
+ if (stop)
+ break;
+ }
+
+ THD_STAGE_INFO(thd, stage_binlog_stopping_background_thread);
+
+ /* No need to use mutex as thd is not linked into other threads */
+ THD_count::count++;
+ delete thd;
+
+ my_thread_end();
+
+ /*
+ Signal that we are (almost) stopped.
+ The stop flag is reset under the mutex before the end condition is
+ signalled.
+ */
+ mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+ binlog_background_thread_stop= false;
+ mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+ DBUG_RETURN(0);
+}
+
+#ifdef HAVE_PSI_INTERFACE
+static PSI_thread_key key_thread_binlog; /* key passed to mysql_thread_create() for the binlog background thread */
+
+static PSI_thread_info all_binlog_threads[]= /* registered in start_binlog_background_thread() */
+{
+ { &key_thread_binlog, "binlog_background", PSI_FLAG_GLOBAL},
+};
+#endif /* HAVE_PSI_INTERFACE */
+
+/*
+  Creates the binlog background thread and waits until it reports having
+  started. Returns true when the thread could not be created, false on
+  success.
+*/
+static bool
+start_binlog_background_thread()
+{
+  pthread_t thread_handle;
+
+#ifdef HAVE_PSI_INTERFACE
+  if (PSI_server)
+    PSI_server->register_thread("sql", all_binlog_threads,
+                                array_elements(all_binlog_threads));
+#endif
+
+  const bool create_failed=
+    mysql_thread_create(key_thread_binlog, &thread_handle, &connection_attrib,
+                        binlog_background_thread, NULL) != 0;
+  if (create_failed)
+    return true;
+
+  /*
+    Block until the thread has started: by then the slave replication state
+    is loaded and global_gtid_counter is correct.
+  */
+  mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
+  for (;;)
+  {
+    if (binlog_background_thread_started)
+      break;
+    mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread_end,
+                    &mysql_bin_log.LOCK_binlog_background_thread);
+  }
+  mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
+
+  return false;
+}
+#ifdef HAVE_REPLICATION
+class Recovery_context
+{
+public:
+ my_off_t prev_event_pos;
+ rpl_gtid last_gtid;
+ bool last_gtid_standalone;
+ bool last_gtid_valid;
+ bool last_gtid_no2pc; // true when the group does not end with Xid event
+ uint last_gtid_engines;
+ Binlog_offset last_gtid_coord; // <binlog id, binlog offset>
+ /*
+ When true, it's semisync slave recovery mode, which
+ rolls back transactions in doubt and wipes them off from binlog.
+ The rest of declarations deal with this type of recovery.
+ */
+ bool do_truncate;
+ /*
+ transaction-in-doubt's gtid:s. `truncate_gtid` is the ultimate value,
+ if it's non-zero truncation is taking place to start from it.
+ Its value gets refined throughout binlog scanning conducted with at most
+ 2 rounds.
+ When an estimate is done in the 1st round of 2-round recovery its value
+ gets memorized for possible adoption as the ultimate `truncate_gtid`.
+ */
+ rpl_gtid truncate_gtid, truncate_gtid_1st_round;
+ /*
+ the last non-transactional group that is located in binlog
+ behind truncate_gtid.
+ */
+ rpl_gtid binlog_unsafe_gtid;
+ char binlog_truncate_file_name[FN_REFLEN] ;
+ char binlog_unsafe_file_name[FN_REFLEN] ;
+ /*
+ When do_truncate is true, the truncate position may not be
+ found in one round when recovered transactions are multi-engine
+ or just on different engines.
+ In the single recoverable engine case `truncate_reset_done` and
+ therefore `truncate_validated` remains `false` when the last
+ binlog is the binlog-checkpoint one.
+ The meaning of `truncate_reset_done` is according to the following example:
+ Let round = 1, Binlog contains the sequence of replication event groups:
+ [g1, G2, g3]
+ where `G` (in capital) stands for committed, `g` for prepared.
+ g1 is first set as truncation candidate, then G2 resets it to indicate
+ the actual truncation is behind (to the right of) it.
+ `truncate_validated` is set to true when `binlog_truncate_pos` (as of `g3`)
+ won't change.
+ Observe last_gtid_valid is affected, so in the above example `g1` that
+ was initially ignored for the gtid binlog state now seeing `G2`
+ would have to be added to it. See gtid_maybe_to_truncate.
+ */
+ bool truncate_validated; // trued when the truncate position settled
+ bool truncate_reset_done; // trued when the position is to reevaluate
+ /* Flags the fact that truncate position estimation is done in the 1st round */
+ bool truncate_set_in_1st;
+ /*
+ Monotonically indexes binlog files in the recovery list.
+ When the list is "likely" singleton the value is UINT_MAX.
+ Otherwise enumeration starts with zero for the first file, increments
+ by one for any next file except for the last file in the list, which
+ is also the initial binlog file for recovery,
+ that is enumerated with UINT_MAX.
+ */
+ Binlog_file_id id_binlog;
+ enum_binlog_checksum_alg checksum_alg;
+ Binlog_offset binlog_truncate_coord,
+ binlog_truncate_coord_1st_round; // pair is similar to truncate_gtid
+ Binlog_offset binlog_unsafe_coord;
+ /*
+ Populated at decide_or_assess() with gtid-in-doubt whose
+ binlog offset is greater than or equal to that of the current gtid
+ truncate candidate.
+ Gets emptied by reset_truncate_coord into gtid binlog state.
+ Allocated by the constructor only when do_truncate is true,
+ NULL otherwise.
+ */
+ Dynamic_array<rpl_gtid> *gtid_maybe_to_truncate;
+ Recovery_context();
+ ~Recovery_context() { delete gtid_maybe_to_truncate; }
+ /*
+ Completes the recovery procedure.
+ In the normal case prepared xids get committed when they are also found
+ in binlog, otherwise they are rolled back.
+ In the semisync slave case the xids that are located in binlog in
+ a truncated tail get rolled back, otherwise they are committed.
+ Both decisions are contingent on safety to truncate.
+
+ Returns: false on success, true when the binlog truncation fails or
+ is found unsafe.
+ */
+ bool complete(MYSQL_BIN_LOG *log, HASH &xids);
+
+ /*
+ decides on commit of xid passed through member argument.
+ In the semisync slave case it assigns binlog coordinate to
+ any xid that remains in-doubt. Decision on them will be
+ done after binlog scan rounds.
+ */
+ bool decide_or_assess(xid_recovery_member *member, int round,
+ Format_description_log_event *fdle,
+ LOG_INFO *linfo, my_off_t pos);
+
+ /*
+ Assigns last_gtid and assesses the maximum (in the binlog offset term)
+ unsafe gtid (group of events).
+ */
+ void process_gtid(int round, Gtid_log_event *gev, LOG_INFO *linfo);
+
+ /*
+ Compute next action at the end of processing of the current binlog file.
+ It may increment the round.
+ When the round turns in the semisync-slave recovery
+ id_binlog, truncate_validated, truncate_reset_done
+ get reset/set for the next round.
+ Within the 2nd round id_binlog keeps incrementing.
+
+ Passed arguments:
+ round the current round that *may* be incremented here
+ last_log_name the recovery starting binlog file
+ binlog_checkpoint_name
+ binlog checkpoint file
+ linfo binlog file list struct for next file
+ log pointer to mysql_bin_log instance
+
+ Returns: 0 when rounds continue, maybe the current one remains
+ 1 when all rounds are done
+ */
+ int next_binlog_or_round(int& round,
+ const char *last_log_name,
+ const char *binlog_checkpoint_name,
+ LOG_INFO *linfo, MYSQL_BIN_LOG *log);
+ /*
+ Relates to the semisync recovery.
+ Returns true when truncated tail does not contain non-transactional
+ group of events.
+ Otherwise returns false.
+ Always returns true when do_truncate is false.
+ */
+ bool is_safe_to_truncate()
+ {
+ return !do_truncate ? true :
+ (truncate_gtid.seq_no == 0 || // no truncate
+ binlog_unsafe_coord < binlog_truncate_coord); // or unsafe is earlier
+ }
+
+ /*
+ Relates to the semisync recovery.
+ Is invoked when a standalone or non-2pc group is detected.
+ Both are unsafe to truncate in the semisync-slave recovery so
+ the maximum unsafe coordinate may be updated.
+ In the non-2pc group case though, *exceptionally*,
+ the no-engine group is considered safe, to be invalidated
+ to not contribute to binlog state.
+ */
+ void update_binlog_unsafe_coord_if_needed(LOG_INFO *linfo);
+
+ /*
+ Relates to the semisync recovery.
+ Is called when a committed or decided to-commit transaction is detected.
+ Actions:
+ truncate_gtid then is set to "nil" as indicated by rpl_gtid::seq_no := 0.
+ truncate_reset_done takes a note of that fact.
+ binlog_truncate_coord gets reset to the current gtid offset merely to
+ "suggest" any potential future truncate gtid must have a greater offset.
+ gtid_maybe_to_truncate gets emptied into gtid binlog state.
+
+ Returns:
+ false on success, otherwise
+ true when OOM at rpl_global_gtid_binlog_state insert
+ */
+ bool reset_truncate_coord(my_off_t pos);
+
+ /*
+ Sets binlog_truncate_pos to the value of the current transaction's gtid.
+ In multi-engine case that might be just an assessment to be refined
+ in the current round and confirmed in a next one.
+ gtid_maybe_to_truncate receives the current gtid as a new element.
+ Returns
+ false on success, otherwise
+ true when OOM at gtid_maybe_to_truncate append
+
+ */
+ bool set_truncate_coord(LOG_INFO *linfo, int round,
+ enum_binlog_checksum_alg fd_checksum_alg);
+};
+
+/*
+  Finishes the recovery: completes the xids through ha_recover_complete()
+  when that is safe and, in the semisync slave mode, truncates the binlog
+  from the truncate coordinate.
+
+  Returns false on success, true when the truncation fails or is unsafe.
+*/
+bool Recovery_context::complete(MYSQL_BIN_LOG *log, HASH &xids)
+{
+  /* is_safe_to_truncate() is always true when do_truncate is off. */
+  if (is_safe_to_truncate())
+  {
+    Binlog_offset *coord= NULL;
+    if (do_truncate)
+      coord= truncate_gtid.seq_no > 0 ? &binlog_truncate_coord
+                                      : &last_gtid_coord;
+
+    const uint count_in_prepare= ha_recover_complete(&xids, coord);
+
+    /*
+      NOTE(review): the early return is taken only at log_warnings > 2, so
+      whether truncation below is attempted depends on the log verbosity -
+      confirm that is intended.
+    */
+    if (count_in_prepare > 0 && global_system_variables.log_warnings > 2)
+    {
+      sql_print_warning("Could not complete %u number of transactions.",
+                        count_in_prepare);
+      return false; // there's later dry run ha_recover() to error out
+    }
+  }
+
+  /* No truncation when there's no transaction to roll back. */
+  if (!do_truncate || !(truncate_gtid.seq_no > 0))
+    return false;
+
+  if (!is_safe_to_truncate())
+  {
+    sql_print_error("Cannot truncate the binary log to file:%s "
+                    "pos:%llu as unsafe statement "
+                    "is found at file:%s pos:%llu which is "
+                    "beyond the truncation position;"
+                    "all transactions in doubt are left intact. ",
+                    binlog_truncate_file_name, binlog_truncate_coord.second,
+                    binlog_unsafe_file_name, binlog_unsafe_coord.second);
+    return true;
+  }
+
+  if (log->truncate_and_remove_binlogs(binlog_truncate_file_name,
+                                       binlog_truncate_coord.second,
+                                       &truncate_gtid))
+  {
+    sql_print_error("Failed to truncate the binary log to "
+                    "file:%s pos:%llu.", binlog_truncate_file_name,
+                    binlog_truncate_coord.second);
+    return true;
+  }
+
+  return false;
+}
+
+/*
+  Starts with no gtid seen and no truncate candidate. The semisync slave
+  mode (do_truncate) is taken from rpl_semi_sync_slave_enabled; only in
+  that mode gtid_maybe_to_truncate is allocated.
+*/
+Recovery_context::Recovery_context() :
+  prev_event_pos(0),
+  last_gtid_standalone(false),
+  last_gtid_valid(false),
+  last_gtid_no2pc(false),
+  last_gtid_engines(0),
+  do_truncate(rpl_semi_sync_slave_enabled),
+  truncate_validated(false),
+  truncate_reset_done(false),
+  truncate_set_in_1st(false),
+  id_binlog(MAX_binlog_id),
+  checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF),
+  gtid_maybe_to_truncate(NULL)
+{
+  const Binlog_offset zero_coord(0, 0);
+  const rpl_gtid nil_gtid= rpl_gtid();
+
+  last_gtid_coord= zero_coord;
+  binlog_truncate_coord_1st_round= zero_coord;
+  binlog_truncate_coord= zero_coord;
+  binlog_unsafe_coord= zero_coord;
+
+  binlog_truncate_file_name[0]= 0;
+  binlog_unsafe_file_name[0]= 0;
+
+  truncate_gtid_1st_round= nil_gtid;
+  truncate_gtid= nil_gtid;
+  binlog_unsafe_gtid= nil_gtid;
+
+  if (do_truncate)
+    gtid_maybe_to_truncate= new Dynamic_array<rpl_gtid>(16, 16);
+}
+
+/*
+  Drops the current truncate candidate: truncate_gtid becomes "nil",
+  binlog_truncate_coord is moved to <id_binlog, pos> and the gtids
+  collected in gtid_maybe_to_truncate are moved into the gtid binlog state.
+
+  Returns false on success, true when an update of
+  rpl_global_gtid_binlog_state fails.
+*/
+bool Recovery_context::reset_truncate_coord(my_off_t pos)
+{
+  DBUG_ASSERT(binlog_truncate_coord.second == 0 ||
+              last_gtid_coord >= binlog_truncate_coord ||
+              truncate_set_in_1st);
+
+  /*
+    Back up the 1st round estimate (only once) so that
+    next_binlog_or_round() can restore it when necessary.
+  */
+  const bool backup_needed= truncate_set_in_1st &&
+                            truncate_gtid_1st_round.seq_no == 0;
+  if (backup_needed)
+  {
+    binlog_truncate_coord_1st_round= binlog_truncate_coord;
+    truncate_gtid_1st_round= truncate_gtid;
+  }
+
+  truncate_reset_done= true;
+  truncate_gtid= rpl_gtid();
+  binlog_truncate_coord= Binlog_offset(id_binlog, pos);
+
+  uint idx= 0;
+  while (idx < gtid_maybe_to_truncate->elements())
+  {
+    rpl_gtid pending= gtid_maybe_to_truncate->at(idx++);
+    if (rpl_global_gtid_binlog_state.update_nolock(&pending, false))
+      return true;
+  }
+  gtid_maybe_to_truncate->clear();
+
+  return false;
+}
+
+/*
+  Makes the current gtid (last_gtid at last_gtid_coord in the file of
+  `linfo`) the truncate candidate and appends it to gtid_maybe_to_truncate.
+
+  Returns the result of that append, i.e. true on failure.
+*/
+bool Recovery_context::set_truncate_coord(LOG_INFO *linfo, int round,
+                                          enum_binlog_checksum_alg fd_checksum)
+{
+  truncate_set_in_1st= (round == 1);
+  checksum_alg= fd_checksum;
+
+  truncate_gtid= last_gtid;
+  binlog_truncate_coord= last_gtid_coord;
+  strmake_buf(binlog_truncate_file_name, linfo->log_file_name);
+
+  const bool append_failed= gtid_maybe_to_truncate->append(last_gtid);
+  return append_failed;
+}
+
+/*
+  Decides on commit of the xid represented by `member` or, in the semisync
+  slave recovery (do_truncate), assesses the binlog truncate position.
+
+  @param member  xid recovery record for the current event group, or NULL
+                 when there is none
+  @param round   the current binlog scan round
+  @param fdle    format description event; its checksum_alg is memorized
+                 when a truncate candidate is set
+  @param linfo   the binlog file being scanned
+  @param pos     offset handed over to reset_truncate_coord()
+
+  @return false on success;
+          true on error: the number of engines prepared exceeds the number
+          recorded in the GTID, or a gtid container insert failed
+*/
+bool Recovery_context::decide_or_assess(xid_recovery_member *member, int round,
+                                        Format_description_log_event *fdle,
+                                        LOG_INFO *linfo, my_off_t pos)
+{
+  if (member)
+  {
+    /*
+      xid in doubt are resolved as follows:
+      in_engine_prepare is compared against binlogged info to
+      yield the commit-or-rollback decision in the normal case.
+      In the semisync-slave recovery the decision is done later
+      after the binlog scanning has determined the truncation offset.
+    */
+    if (member->in_engine_prepare > last_gtid_engines)
+    {
+      char buf[21];
+      longlong10_to_str(last_gtid.seq_no, buf, 10);
+      sql_print_error("Error to recovery multi-engine transaction: "
+                      "the number of engines prepared %u exceeds the "
+                      "respective number %u in its GTID %u-%u-%s "
+                      "located at file:%s pos:%llu",
+                      member->in_engine_prepare, last_gtid_engines,
+                      last_gtid.domain_id, last_gtid.server_id, buf,
+                      linfo->log_file_name, last_gtid_coord.second);
+      return true;
+    }
+    else if (member->in_engine_prepare < last_gtid_engines)
+    {
+      DBUG_ASSERT(member->in_engine_prepare > 0);
+      /*
+        This is an "unlikely" branch of two or more engines in transaction
+        that is partially committed, so to complete.
+      */
+      member->decided_to_commit= true;
+      if (do_truncate)
+      {
+        /* Validated truncate at this point can be only in the 2nd round. */
+        DBUG_ASSERT(!truncate_validated ||
+                    (round == 2 && truncate_set_in_1st &&
+                     last_gtid_coord < binlog_truncate_coord));
+        /*
+          Estimated truncate must not be greater than the current one's
+          offset, unless the turn of the rounds.
+        */
+        DBUG_ASSERT(truncate_validated ||
+                    (last_gtid_coord >= binlog_truncate_coord ||
+                     (round == 2 && truncate_set_in_1st)));
+
+        if (!truncate_validated && reset_truncate_coord(pos))
+          return true;
+      }
+    }
+    else // member->in_engine_prepare == last_gtid_engines
+    {
+      if (!do_truncate) // "normal" recovery
+      {
+        member->decided_to_commit= true;
+      }
+      else
+      {
+        member->binlog_coord= last_gtid_coord;
+        last_gtid_valid= false;
+        /*
+          First time truncate position estimate before its validation.
+          An estimate may change to involve reset_truncate_coord call.
+        */
+        if (!truncate_validated)
+        {
+          if (truncate_gtid.seq_no == 0 /* was reset or never set */ ||
+              (truncate_set_in_1st && round == 2 /* reevaluated at round turn */))
+          {
+            if (set_truncate_coord(linfo, round, fdle->checksum_alg))
+              return true;
+          }
+          else
+          {
+            /* Truncate estimate was done ago, this gtid can't improve it. */
+            DBUG_ASSERT(last_gtid_coord >= binlog_truncate_coord);
+
+            /*
+              A failed append must not go unnoticed: report it the same way
+              set_truncate_coord() reports its own append failure.
+            */
+            if (gtid_maybe_to_truncate->append(last_gtid))
+              return true;
+          }
+
+          DBUG_ASSERT(member->decided_to_commit == false); // may be redecided
+        }
+        else
+        {
+          /*
+            binlog truncate was determined, possibly to none, otherwise
+            its offset greater than that of the current gtid.
+          */
+          DBUG_ASSERT(truncate_gtid.seq_no == 0 ||
+                      last_gtid_coord < binlog_truncate_coord);
+          member->decided_to_commit= true;
+        }
+      }
+    }
+  }
+  else if (do_truncate) // "0" < last_gtid_engines
+  {
+    /*
+      Similar to the partial commit branch above.
+    */
+    DBUG_ASSERT(!truncate_validated || last_gtid_coord < binlog_truncate_coord);
+    DBUG_ASSERT(truncate_validated ||
+                (last_gtid_coord >= binlog_truncate_coord ||
+                 (round == 2 && truncate_set_in_1st)));
+
+    if (!truncate_validated && reset_truncate_coord(pos))
+      return true;
+  }
+
+  return false;
+}
+
+/*
+  Semisync slave recovery only: when the current group lies behind an
+  existing truncate candidate, it is either invalidated (no-engine group)
+  or recorded as the unsafe group (gtid, coordinate and file name).
+*/
+void Recovery_context::update_binlog_unsafe_coord_if_needed(LOG_INFO *linfo)
+{
+  if (!do_truncate)
+    return;
+
+  /* Only the g1,U2 pattern matters here, *not* G1,U2. */
+  const bool behind_candidate= truncate_gtid.seq_no > 0 &&
+                               last_gtid_coord > binlog_truncate_coord;
+  if (!behind_candidate)
+    return;
+
+  DBUG_ASSERT(binlog_truncate_coord.second > 0);
+
+  /*
+    Potentially unsafe when the truncate coordinate is not determined,
+    just detected as unsafe when behind the latter.
+  */
+  if (last_gtid_engines != 0)
+  {
+    strmake_buf(binlog_unsafe_file_name, linfo->log_file_name);
+    binlog_unsafe_coord= last_gtid_coord;
+    binlog_unsafe_gtid= last_gtid;
+  }
+  else
+  {
+    last_gtid_valid= false;
+  }
+}
+
+/*
+  Records the gtid event `gev` as the last seen gtid (value, engine count
+  and binlog coordinate). In round 1, or while the truncate position is not
+  validated in the semisync slave mode, the group is also marked valid and
+  a standalone group is checked for being unsafe to truncate.
+*/
+void Recovery_context::process_gtid(int round, Gtid_log_event *gev,
+                                    LOG_INFO *linfo)
+{
+  last_gtid_coord= Binlog_offset(id_binlog, prev_event_pos);
+  if (gev->extra_engines == UCHAR_MAX)
+    last_gtid_engines= 0;
+  else
+    last_gtid_engines= gev->extra_engines + 1;
+  last_gtid.seq_no= gev->seq_no;
+  last_gtid.server_id= gev->server_id;
+  last_gtid.domain_id= gev->domain_id;
+
+  DBUG_ASSERT(!last_gtid_valid);
+  DBUG_ASSERT(last_gtid.seq_no != 0);
+
+  const bool track_group= round == 1 || (do_truncate && !truncate_validated);
+  if (!track_group)
+    return;
+
+  last_gtid_no2pc= false;
+  last_gtid_standalone= (gev->flags2 & Gtid_log_event::FL_STANDALONE) != 0;
+  if (do_truncate && last_gtid_standalone)
+    update_binlog_unsafe_coord_if_needed(linfo);
+
+  /*
+    Any 'valid' GTID logged after Gtid_list updates the binlog state.
+    The flag may flip at Xid when the group falls to truncate.
+  */
+  last_gtid_valid= true;
+}
+
+int Recovery_context::next_binlog_or_round(int& round,
+ const char *last_log_name,
+ const char *binlog_checkpoint_name,
+ LOG_INFO *linfo,
+ MYSQL_BIN_LOG *log)
+{
+ if (!strcmp(linfo->log_file_name, last_log_name))
+ {
+ /*
+ Exit the loop now at the end of the current round.
+ The file just processed is the recovery starting binlog, so 1 is
+ returned below to tell that all rounds are done.
+ Note that the `log` argument is not referenced in this function.
+ */
+ DBUG_ASSERT(round <= 2);
+
+ if (do_truncate)
+ {
+ truncate_validated= truncate_reset_done;
+ truncate_reset_done= false;
+ /*
+ Restore the 1st round saved estimate if it was not refined in the 2nd.
+ That can only occur in multiple log files context when the initial file
+ has a truncation candidate (a `g`) and does not have any committed `G`,
+ *and* other files (binlog-checkpoint one and so on) do not have any
+ transaction-in-doubt.
+ */
+ if (truncate_gtid.seq_no == 0 && truncate_set_in_1st)
+ {
+ DBUG_ASSERT(truncate_gtid_1st_round.seq_no > 0);
+
+ truncate_gtid= truncate_gtid_1st_round;
+ binlog_truncate_coord= binlog_truncate_coord_1st_round;
+ }
+ }
+ return 1;
+ }
+ else if (round == 1) // round 1 ended on a file other than last_log_name
+ {
+ if (do_truncate)
+ {
+ truncate_validated= truncate_reset_done;
+ if (!truncate_validated)
+ {
+ rpl_global_gtid_binlog_state.reset_nolock();
+ gtid_maybe_to_truncate->clear();
+ }
+ truncate_reset_done= false;
+ id_binlog= 0; // file enumeration restarts from zero for the next round
+ }
+ round++;
+ }
+ else if (do_truncate) // binlog looping within round 2
+ {
+ id_binlog++;
+
+ DBUG_ASSERT(id_binlog <= MAX_binlog_id); // the assert is "practical"
+ }
+
+ DBUG_ASSERT(!do_truncate || id_binlog != MAX_binlog_id ||
+ !strcmp(linfo->log_file_name, binlog_checkpoint_name));
+
+ return 0;
+}
+#endif
+
+/*
+  Execute recovery of the binary log
+
+  The binlog file readable through first_log is scanned first (round 1).
+  If it contains a Binlog checkpoint event (and do_xa is set), the scan
+  continues in round 2 from the checkpoint file through the binlog index
+  until last_log_name is reached again.
+
+  @param linfo
+    Binlog index cursor; repositioned with find_log_pos()/find_next_log().
+  @param last_log_name
+    Name of the binlog file read in round 1; reaching it again ends the scan.
+  @param first_log
+    IO_CACHE from which the events of round 1 are read.
+  @param fdle
+    Format description used to read events. Its LOG_EVENT_BINLOG_IN_USE_F
+    flag is cleared here, and its crypto state is reset for each file
+    opened in round 2.
+  @param do_xa
+    if true:  Collect all Xid events and call ha_recover().
+    if false: Collect only Xid events from Query events. This is
+              used to disable entries in the ddl recovery log that
+              are found in the binary log (and thus already executed and
+              logged and thus don't have to be redone).
+
+  @return 0 on success, 1 on failure (an error is printed to the error log).
+*/
+
+int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
+                           IO_CACHE *first_log,
+                           Format_description_log_event *fdle, bool do_xa)
+{
+  Log_event *ev= NULL;
+  HASH xids, ddl_log_ids;
+  MEM_ROOT mem_root;
+  char binlog_checkpoint_name[FN_REFLEN];
+  bool binlog_checkpoint_found;
+  IO_CACHE log;
+  File file= -1;
+  const char *errmsg;
+#ifdef HAVE_REPLICATION
+  Recovery_context ctx;
+#endif
+  DBUG_ENTER("TC_LOG_BINLOG::recover");
+  /*
+    The for-loop variable is updated by the following rule set:
+    Initially set to 1.
+    After the initial binlog file is processed to identify
+    the Binlog-checkpoint file it is incremented when the latter file
+    is different from the initial one. Otherwise the only log has been
+    fully parsed so the for loop exits.
+    The 2nd round parses all earlier in binlog index order files
+    starting from the Binlog-checkpoint file. It ends when the initial
+    binlog file is reached.
+  */
+  int round;
+
+  if (! fdle->is_valid())
+    goto err1;
+  if (my_hash_init(key_memory_binlog_recover_exec, &xids,
+                   &my_charset_bin, TC_LOG_PAGE_SIZE/3, 0,
+                   sizeof(my_xid), 0, 0, MYF(0)))
+    goto err1;
+  if (my_hash_init(key_memory_binlog_recover_exec, &ddl_log_ids,
+                   &my_charset_bin, 64, 0,
+                   sizeof(my_xid), 0, 0, MYF(0)))
+  {
+    /* Only the first hash exists at this point; do not leak it. */
+    my_hash_free(&xids);
+    goto err1;
+  }
+
+  init_alloc_root(key_memory_binlog_recover_exec, &mem_root,
+                  TC_LOG_PAGE_SIZE, TC_LOG_PAGE_SIZE, MYF(0));
+
+  fdle->flags&= ~LOG_EVENT_BINLOG_IN_USE_F; // abort on the first error
+
+  /*
+    finds xids when root is not NULL.
+    Both hashes and the memory root are initialised by now, so a failure
+    must go through err2 in order to release them.
+  */
+  if (do_xa && ha_recover(&xids, &mem_root))
+    goto err2;
+
+  /*
+    Scan the binlog for XIDs that need to be committed if still in the
+    prepared stage.
+
+    Start with the latest binlog file, then continue with any other binlog
+    files if the last found binlog checkpoint indicates it is needed.
+  */
+
+  binlog_checkpoint_found= false;
+  for (round= 1;;)
+  {
+    while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log,
+                                          fdle, opt_master_verify_checksum))
+           && ev->is_valid())
+    {
+      enum Log_event_type typ= ev->get_type_code();
+      switch (typ)
+      {
+      case XID_EVENT:
+        if (do_xa)
+        {
+          xid_recovery_member *member=
+            (xid_recovery_member*)
+            my_hash_search(&xids, (uchar*) &static_cast<Xid_log_event*>(ev)->xid,
+                           sizeof(my_xid));
+#ifndef HAVE_REPLICATION
+          {
+            if (member)
+              member->decided_to_commit= true;
+          }
+#else
+          if (ctx.decide_or_assess(member, round, fdle, linfo, ev->log_pos))
+            goto err2;
+#endif
+        }
+        break;
+      case QUERY_EVENT:
+      {
+        Query_log_event *query_ev= (Query_log_event*) ev;
+        if (query_ev->xid)
+        {
+          /* The format has a %llu conversion, so the xid must be passed. */
+          DBUG_PRINT("QQ", ("xid: %llu xid", (ulonglong) query_ev->xid));
+          DBUG_ASSERT(sizeof(query_ev->xid) == sizeof(my_xid));
+          /* The hash keeps only a pointer, so copy the key to mem_root. */
+          uchar *x= (uchar *) memdup_root(&mem_root,
+                                          (uchar*) &query_ev->xid,
+                                          sizeof(query_ev->xid));
+          if (!x || my_hash_insert(&ddl_log_ids, x))
+            goto err2;
+        }
+#ifdef HAVE_REPLICATION
+        if (((Query_log_event *)ev)->is_commit() ||
+            ((Query_log_event *)ev)->is_rollback())
+        {
+          ctx.last_gtid_no2pc= true;
+          ctx.update_binlog_unsafe_coord_if_needed(linfo);
+        }
+#endif
+        break;
+      }
+      case BINLOG_CHECKPOINT_EVENT:
+        if (round == 1 && do_xa)
+        {
+          size_t dir_len;
+          Binlog_checkpoint_log_event *cev= (Binlog_checkpoint_log_event *)ev;
+          if (cev->binlog_file_len >= FN_REFLEN)
+            sql_print_warning("Incorrect binlog checkpoint event with too "
+                              "long file name found.");
+          else
+          {
+            /*
+              Note that we cannot use make_log_name() here, as we have not yet
+              initialised MYSQL_BIN_LOG::log_file_name.
+            */
+            dir_len= dirname_length(last_log_name);
+            strmake(strnmov(binlog_checkpoint_name, last_log_name, dir_len),
+                    cev->binlog_file_name, FN_REFLEN - 1 - dir_len);
+            binlog_checkpoint_found= true;
+          }
+        }
+        break;
+#ifdef HAVE_REPLICATION
+      case GTID_LIST_EVENT:
+        if (round == 1 || (ctx.do_truncate && ctx.id_binlog == 0))
+        {
+          Gtid_list_log_event *glev= (Gtid_list_log_event *)ev;
+
+          /* Initialise the binlog state from the Gtid_list event. */
+          if (rpl_global_gtid_binlog_state.load(glev->list, glev->count))
+            goto err2;
+        }
+        break;
+
+      case GTID_EVENT:
+        ctx.process_gtid(round, (Gtid_log_event *)ev, linfo);
+        break;
+
+      case XA_PREPARE_LOG_EVENT:
+        ctx.last_gtid_no2pc= true; // TODO: complete MDEV-21469 that removes this block
+        ctx.update_binlog_unsafe_coord_if_needed(linfo);
+        break;
+#endif
+
+      case START_ENCRYPTION_EVENT:
+        {
+          if (fdle->start_decryption((Start_encryption_log_event*) ev))
+            goto err2;
+        }
+        break;
+
+      default:
+        /* Nothing. */
+        break;
+      } // end of switch
+
+#ifdef HAVE_REPLICATION
+      /*
+        Add the last seen GTID to the binlog state once its event group is
+        complete: a standalone event is followed by an event that is not
+        part of the group, a transaction ends with an Xid or with a
+        non-2pc terminator (COMMIT/ROLLBACK query or XA PREPARE).
+      */
+      if (ctx.last_gtid_valid &&
+          ((ctx.last_gtid_standalone && !ev->is_part_of_group(typ)) ||
+           (!ctx.last_gtid_standalone &&
+            (typ == XID_EVENT || ctx.last_gtid_no2pc))))
+      {
+        DBUG_ASSERT(round == 1 || (ctx.do_truncate && !ctx.truncate_validated));
+        DBUG_ASSERT(!ctx.last_gtid_no2pc ||
+                    (ctx.last_gtid_standalone ||
+                     typ == XA_PREPARE_LOG_EVENT ||
+                     (LOG_EVENT_IS_QUERY(typ) &&
+                      (((Query_log_event *)ev)->is_commit() ||
+                       ((Query_log_event *)ev)->is_rollback()))));
+
+        if (rpl_global_gtid_binlog_state.update_nolock(&ctx.last_gtid, false))
+          goto err2;
+        ctx.last_gtid_valid= false;
+      }
+      ctx.prev_event_pos= ev->log_pos;
+#endif
+      delete ev;
+      ev= NULL;
+    } // end of while
+
+    /*
+      The loop also terminates on an event that was read but is not valid.
+      Nobody else owns such an event, so release it here; otherwise it would
+      be leaked on the success path.
+    */
+    delete ev;
+    ev= NULL;
+
+    /*
+      If the last binlog checkpoint event points to an older log, we have to
+      scan all logs from there also, to get all possible XIDs to recover.
+
+      If there was no binlog checkpoint event at all, this means the log was
+      written by an older version of MariaDB (or MySQL) - these always have an
+      (implicit) binlog checkpoint event at the start of the last binlog file.
+    */
+    if (round == 1)
+    {
+      if (!binlog_checkpoint_found)
+        break;
+      DBUG_EXECUTE_IF("xa_recover_expect_master_bin_000004",
+          if (0 != strcmp("./master-bin.000004", binlog_checkpoint_name) &&
+              0 != strcmp(".\\master-bin.000004", binlog_checkpoint_name))
+            DBUG_SUICIDE();
+        );
+      if (find_log_pos(linfo, binlog_checkpoint_name, 1))
+      {
+        sql_print_error("Binlog file '%s' not found in binlog index, needed "
+                        "for recovery. Aborting.", binlog_checkpoint_name);
+        goto err2;
+      }
+    }
+    else
+    {
+      end_io_cache(&log);
+      mysql_file_close(file, MYF(MY_WME));
+      file= -1;
+      /*
+        NOTE: reading other binlog's FD is necessary for finding out
+        the checksum status of the respective binlog file.
+      */
+      if (find_next_log(linfo, 1))
+      {
+        sql_print_error("Error reading binlog files during recovery. "
+                        "Aborting.");
+        goto err2;
+      }
+    }
+
+#ifdef HAVE_REPLICATION
+    int rc= ctx.next_binlog_or_round(round, last_log_name,
+                                     binlog_checkpoint_name, linfo, this);
+    if (rc == -1)
+      goto err2;
+    else if (rc == 1)
+      break;                                     // all rounds done
+#else
+    if (!strcmp(linfo->log_file_name, last_log_name))
+      break;                                    // No more files to do
+    round++;
+#endif
+
+    if ((file= open_binlog(&log, linfo->log_file_name, &errmsg)) < 0)
+    {
+      sql_print_error("%s", errmsg);
+      goto err2;
+    }
+    fdle->reset_crypto();
+  } // end of for
+
+  if (do_xa)
+  {
+    if (binlog_checkpoint_found)
+    {
+#ifndef HAVE_REPLICATION
+      if (ha_recover_complete(&xids))
+#else
+      if (ctx.complete(this, xids))
+#endif
+        goto err2;
+    }
+  }
+  if (ddl_log_close_binlogged_events(&ddl_log_ids))
+    goto err2;
+  free_root(&mem_root, MYF(0));
+  my_hash_free(&xids);
+  my_hash_free(&ddl_log_ids);
+  DBUG_RETURN(0);
+
+err2:
+  /* Everything is initialised here; `file` is >= 0 only in round 2. */
+  delete ev;
+  if (file >= 0)
+  {
+    end_io_cache(&log);
+    mysql_file_close(file, MYF(MY_WME));
+  }
+  free_root(&mem_root, MYF(0));
+  my_hash_free(&xids);
+  my_hash_free(&ddl_log_ids);
+
+err1:
+  sql_print_error("Crash recovery failed. Either correct the problem "
+                  "(if it's, for example, out of memory error) and restart, "
+                  "or delete (or rename) binary log and start server with "
+                  "--tc-heuristic-recover={commit|rollback}");
+  DBUG_RETURN(1);
+}
+
+
+
+/*
+  Inspect the newest binlog file listed in the binlog index and run
+  recover() on it when needed.
+
+  @param opt_name        text printed in the "Recovering after a crash"
+                         message
+  @param do_xa_recovery  passed to recover() as do_xa when the newest binlog
+                         is flagged as still in use
+
+  @return 0 on success, otherwise the non-zero error of the failing step.
+*/
+int
+MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
+{
+  LOG_INFO index_pos;
+  const char *open_errmsg;
+  IO_CACHE cache;
+  File fd;
+  Format_description_log_event fdle(BINLOG_VERSION);
+  char newest_log[FN_REFLEN];
+  int error;
+
+  error= find_log_pos(&index_pos, NullS, 1);
+  if (unlikely(error))
+  {
+    if (error == LOG_INFO_EOF)
+    {
+      /*
+        If there are no binlog files (LOG_INFO_EOF), then we still try to
+        read the .state file to restore the binlog state. This allows to
+        copy a server to provision a new one without copying the binlog
+        files (except the master-bin.state file) and still preserve the
+        correct binlog state.
+      */
+      error= read_state_from_file();
+      /*
+        No binlog files and no binlog state is not an error (eg. just
+        initial server start after fresh installation).
+      */
+      if (error == 2)
+        error= 0;
+    }
+    else
+      sql_print_error("find_log_pos() failed (error: %d)", error);
+    return error;
+  }
+
+  if (! fdle.is_valid())
+    return 1;
+
+  /* Walk the index to its end, remembering the last file name seen. */
+  for (;;)
+  {
+    strmake_buf(newest_log, index_pos.log_file_name);
+    if ((error= find_next_log(&index_pos, 1)))
+      break;
+  }
+
+  if (error != LOG_INFO_EOF)
+  {
+    sql_print_error("find_log_pos() failed (error: %d)", error);
+    return error;
+  }
+
+  fd= open_binlog(&cache, newest_log, &open_errmsg);
+  if (fd < 0)
+  {
+    sql_print_error("%s", open_errmsg);
+    return 1;
+  }
+
+  Log_event *head_ev= Log_event::read_log_event(&cache, &fdle,
+                                                opt_master_verify_checksum);
+  if (head_ev && head_ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
+  {
+    Format_description_log_event *head_fdle=
+      (Format_description_log_event *) head_ev;
+
+    if (head_ev->flags & LOG_EVENT_BINLOG_IN_USE_F)
+    {
+      /* The in-use flag is still set on the newest binlog. */
+      sql_print_information("Recovering after a crash using %s", opt_name);
+      error= recover(&index_pos, newest_log, &cache, head_fdle,
+                     do_xa_recovery);
+    }
+    else
+    {
+      error= read_state_from_file();
+      if (unlikely(error == 2))
+      {
+        /*
+          The binlog exists, but the .state file is missing. This is normal
+          if this is the first master start after a major upgrade to 10.0
+          (with GTID support).
+
+          However, it could also be that the .state file was lost somehow,
+          and in this case it could be a serious issue, as we would set the
+          wrong binlog state in the next binlog file to be created, and GTID
+          processing would be corrupted. A common way would be copying files
+          from an old server to a new one and forgetting the .state file.
+
+          So in this case, we want to try to recover the binlog state by
+          scanning the last binlog file (but we do not need any XA recovery).
+
+          ToDo: We could avoid one scan at first start after major upgrade,
+          by detecting that there is no GTID_LIST event at the start of the
+          binlog file, and stopping the scan in that case.
+        */
+        error= recover(&index_pos, newest_log, &cache, head_fdle, false);
+      }
+    }
+  }
+
+  delete head_ev;
+  end_io_cache(&cache);
+  mysql_file_close(fd, MYF(MY_WME));
+
+  return error;
+}
+
+
+#ifdef INNODB_COMPATIBILITY_HOOKS
+/*
+  Get the current position of the MySQL binlog for transaction currently being
+  committed.
+
+  This is valid to call from within storage engine commit_ordered() and
+  commit() methods only.
+
+  Since it stores the position inside THD, it is safe to call without any
+  locking.
+
+  @param thd            connection whose binlog cache manager is consulted
+  @param[out] out_pos   last commit offset, or 0 when unavailable
+  @param[out] out_file  last commit file name, or NULL when unavailable
+*/
+void
+mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file)
+{
+  binlog_cache_mngr *mngr= NULL;
+
+  /* The cache manager is looked up only when binary logging is enabled. */
+  if (opt_bin_log)
+    mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+  if (!mngr)
+  {
+    *out_file= NULL;
+    *out_pos= 0;
+    return;
+  }
+
+  *out_file= mngr->last_commit_pos_file;
+  *out_pos= (ulonglong)(mngr->last_commit_pos_offset);
+}
+#endif /* INNODB_COMPATIBILITY_HOOKS */
+
+
+/*
+  Update callback of the `checksum` plugin system variable.
+
+  With an open binlog the new algorithm is published through
+  mysql_bin_log.checksum_alg_reset and the log is rotated; otherwise
+  binlog_checksum_options is assigned directly. The thd, var and var_ptr
+  arguments are not used.
+
+  @param save  points to the new value, read as ulong
+*/
+static void
+binlog_checksum_update(MYSQL_THD thd, struct st_mysql_sys_var *var,
+                       void *var_ptr, const void *save)
+{
+  const ulong new_alg= *((const ulong *) save);
+  bool purge_wanted= false;
+  ulong UNINIT_VAR(binlog_id_before_rotate);
+
+  mysql_mutex_lock(mysql_bin_log.get_log_lock());
+  if (!mysql_bin_log.is_open())
+  {
+    binlog_checksum_options= new_alg;
+  }
+  else
+  {
+    binlog_id_before_rotate= mysql_bin_log.current_binlog_id;
+    if (binlog_checksum_options != new_alg)
+      mysql_bin_log.checksum_alg_reset= (enum_binlog_checksum_alg) new_alg;
+    /* A failed rotation must not be followed by a purge. */
+    if (mysql_bin_log.rotate(true, &purge_wanted))
+      purge_wanted= false;
+  }
+  DBUG_ASSERT(binlog_checksum_options == new_alg);
+  mysql_bin_log.checksum_alg_reset= BINLOG_CHECKSUM_ALG_UNDEF;
+  mysql_mutex_unlock(mysql_bin_log.get_log_lock());
+
+  /* Checkpoint and purge only after the log lock has been released. */
+  if (purge_wanted)
+    mysql_bin_log.checkpoint_and_purge(binlog_id_before_rotate);
+}
+
+
+/*
+  SHOW_FUNC callback for the "Binlog" status entry: refreshes the binlog
+  status values for thd and exposes them as the binlog_status_vars_detail
+  array. Always returns 0.
+*/
+static int show_binlog_vars(THD *thd, SHOW_VAR *var, void *,
+                            system_status_var *status_var, enum_var_type)
+{
+  mysql_bin_log.set_status_variables(thd);
+
+  var->value= (char *) &binlog_status_vars_detail;
+  var->type= SHOW_ARRAY;
+  return 0;
+}
+
+/*
+  Top-level status variable list of the binlog plugin: a single "Binlog"
+  entry resolved through show_binlog_vars(), followed by the terminator.
+*/
+static SHOW_VAR binlog_status_vars_top[]= {
+ SHOW_FUNC_ENTRY("Binlog", &show_binlog_vars),
+ {NullS, NullS, SHOW_LONG}
+};
+
+/*
+  Read-only boolean plugin variable backed by opt_optimize_thread_scheduling;
+  no check or update callback, default 1.
+*/
+static MYSQL_SYSVAR_BOOL(
+ optimize_thread_scheduling,
+ opt_optimize_thread_scheduling,
+ PLUGIN_VAR_READONLY,
+ "Run fast part of group commit in a single thread, to optimize kernel "
+ "thread scheduling. On by default. Disable to run each transaction in group "
+ "commit in its own thread, which can be slower at very high concurrency. "
+ "This option is mostly for testing one algorithm versus the other, and it "
+ "should not normally be necessary to change it.",
+ NULL,
+ NULL,
+ 1);
+
+/*
+  Enum plugin variable backed by binlog_checksum_options; changes go through
+  binlog_checksum_update(), values come from binlog_checksum_typelib and the
+  default is BINLOG_CHECKSUM_ALG_CRC32.
+*/
+static MYSQL_SYSVAR_ENUM(
+ checksum,
+ binlog_checksum_options,
+ PLUGIN_VAR_RQCMDARG,
+ "Type of BINLOG_CHECKSUM_ALG. Include checksum for "
+ "log events in the binary log",
+ NULL,
+ binlog_checksum_update,
+ BINLOG_CHECKSUM_ALG_CRC32,
+ &binlog_checksum_typelib);
+
+/* NULL-terminated list of the system variables declared above. */
+static struct st_mysql_sys_var *binlog_sys_vars[]=
+{
+ MYSQL_SYSVAR(optimize_thread_scheduling),
+ MYSQL_SYSVAR(checksum),
+ NULL
+};
+
+
+/*
+  Store the file-name part of a binlog position file name (everything after
+  the directory prefix) in the `binlog_snapshot_file' status variable, the
+  same way as it is done for SHOW BINLOG STATUS.
+*/
+static void
+set_binlog_snapshot_file(const char *src)
+{
+  const char *base_name= src + dirname_length(src);
+
+  strmake_buf(binlog_snapshot_file, base_name);
+}
+
+/*
+  Copy out current values of status variables, for SHOW STATUS or
+  information_schema.global_status.
+
+  This is called only under LOCK_all_status_vars, so we can fill in a static
+  array.
+
+  The snapshot file and position come from the connection's binlog cache
+  manager when it holds a commit position, otherwise from the global last
+  commit position of this log.
+
+  @param thd  connection to take the snapshot position from; may be NULL
+*/
+void
+TC_LOG_BINLOG::set_status_variables(THD *thd)
+{
+  binlog_cache_mngr *cache_mngr= 0;
+
+  if (thd && opt_bin_log)
+    cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+  const bool have_snapshot=
+    cache_mngr && cache_mngr->last_commit_pos_file[0] != 0;
+
+  /* Commit counters and the global position are read together. */
+  mysql_mutex_lock(&LOCK_commit_ordered);
+  binlog_status_var_num_commits= this->num_commits;
+  binlog_status_var_num_group_commits= this->num_group_commits;
+  if (!have_snapshot)
+  {
+    set_binlog_snapshot_file(last_commit_pos_file);
+    binlog_snapshot_position= last_commit_pos_offset;
+  }
+  mysql_mutex_unlock(&LOCK_commit_ordered);
+
+  /* The group commit trigger counters are read under their own mutex. */
+  mysql_mutex_lock(&LOCK_prepare_ordered);
+  binlog_status_group_commit_trigger_count= this->group_commit_trigger_count;
+  binlog_status_group_commit_trigger_timeout= this->group_commit_trigger_timeout;
+  binlog_status_group_commit_trigger_lock_wait= this->group_commit_trigger_lock_wait;
+  mysql_mutex_unlock(&LOCK_prepare_ordered);
+
+  /* The per-connection position is copied with neither mutex held. */
+  if (have_snapshot)
+  {
+    set_binlog_snapshot_file(cache_mngr->last_commit_pos_file);
+    binlog_snapshot_position= cache_mngr->last_commit_pos_offset;
+  }
+}
+
+
+/*
+  Find the Gtid_list_log_event at the start of a binlog.
+
+  @param cache               binlog to read events from
+  @param[out] out_gtid_list  the event found, or NULL
+
+  @return NULL for ok, non-NULL error message for error.
+
+  If ok, then the event is returned in *out_gtid_list. This can be NULL if we
+  get back to binlogs written by old server version without GTID support. If
+  so, it means we have reached the point to start from, as no GTID events can
+  exist in earlier binlogs.
+*/
+const char *
+get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list)
+{
+  Format_description_log_event bootstrap_fdle(BINLOG_VERSION);
+  const char *errormsg= NULL;
+  Gtid_list_log_event *found= NULL;
+  Log_event *ev;
+
+  *out_gtid_list= NULL;
+
+  /* The first event must be the file's own format description. */
+  ev= Log_event::read_log_event(cache, &bootstrap_fdle,
+                                opt_master_verify_checksum);
+  if (!ev || ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+  {
+    delete ev;
+    return "Could not read format description log event while looking for "
+           "GTID position in binlog";
+  }
+
+  Format_description_log_event *fdle=
+    static_cast<Format_description_log_event *>(ev);
+
+  for (;;)
+  {
+    ev= Log_event::read_log_event(cache, fdle, opt_master_verify_checksum);
+    if (!ev)
+    {
+      errormsg= "Could not read GTID list event while looking for GTID "
+                "position in binlog";
+      break;
+    }
+
+    const Log_event_type typ= ev->get_type_code();
+    if (typ == GTID_LIST_EVENT)
+    {
+      /* Done, found it; ownership passes to the caller. */
+      found= static_cast<Gtid_list_log_event *>(ev);
+      break;
+    }
+
+    bool keep_looking;
+    if (typ == START_ENCRYPTION_EVENT)
+    {
+      /* Later events can be read only if decryption is set up. */
+      if (fdle->start_decryption((Start_encryption_log_event*) ev))
+      {
+        errormsg= "Could not set up decryption for binlog.";
+        keep_looking= false;
+      }
+      else
+        keep_looking= true;
+    }
+    else
+      keep_looking= (typ == ROTATE_EVENT || typ == STOP_EVENT ||
+                     typ == FORMAT_DESCRIPTION_EVENT);
+
+    delete ev;
+    if (!keep_looking)
+    {
+      /*
+        Either an error, or we did not find any Gtid_list_log_event, must be
+        old binlog.
+      */
+      break;
+    }
+  }
+
+  delete fdle;
+  *out_gtid_list= found;
+  return errormsg;
+}
+
+
+/* Storage engine descriptor referenced by the plugin declaration below. */
+struct st_mysql_storage_engine binlog_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+/*
+  Plugin declaration registering "binlog" as a storage engine plugin.
+  It is initialised by binlog_init, has no deinit function, and exposes
+  binlog_status_vars_top and binlog_sys_vars as its status and system
+  variables.
+*/
+maria_declare_plugin(binlog)
+{
+ MYSQL_STORAGE_ENGINE_PLUGIN,
+ &binlog_storage_engine,
+ "binlog",
+ "MySQL AB",
+ "This is a pseudo storage engine to represent the binlog in a transaction",
+ PLUGIN_LICENSE_GPL,
+ binlog_init, /* Plugin Init */
+ NULL, /* Plugin Deinit */
+ 0x0100 /* 1.0 */,
+ binlog_status_vars_top, /* status variables */
+ binlog_sys_vars, /* system variables */
+ "1.0", /* string version */
+ MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+}
+maria_declare_plugin_end;
+
+#ifdef WITH_WSREP
+#include "wsrep_mysqld.h"
+
+/*
+  Return the binlog cache log of a connection.
+
+  @param thd               connection to look up
+  @param is_transactional  passed to get_binlog_cache_log() to pick the cache
+
+  @return the cache, or NULL (after a debug message) when the connection has
+          no binlog cache manager.
+*/
+IO_CACHE *wsrep_get_cache(THD * thd, bool is_transactional)
+{
+  DBUG_ASSERT(binlog_hton->slot != HA_SLOT_UNDEF);
+
+  binlog_cache_mngr *mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+  if (!mngr)
+  {
+    WSREP_DEBUG("binlog cache not initialized, conn: %llu",
+                thd->thread_id);
+    return NULL;
+  }
+
+  return mngr->get_binlog_cache_log(is_transactional);
+}
+
+/*
+  Reset the binlog caches of a connection and prepare its binlog state for
+  the next statement. The per-statement reset is done even when the
+  connection has no binlog cache manager.
+*/
+void wsrep_thd_binlog_trx_reset(THD * thd)
+{
+  DBUG_ENTER("wsrep_thd_binlog_trx_reset");
+  WSREP_DEBUG("wsrep_thd_binlog_reset");
+  /*
+    todo: fix autocommit select to not call the caller
+  */
+  binlog_cache_mngr *const mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+  if (mngr)
+  {
+    mngr->reset(false, true);
+
+    /* Anything still left in the statement cache is discarded. */
+    const bool stmt_events_pending= !mngr->stmt_cache.empty();
+    if (stmt_events_pending)
+    {
+      WSREP_DEBUG("pending events in stmt cache, sql: %s", thd->query());
+      mngr->stmt_cache.reset();
+    }
+  }
+
+  thd->reset_binlog_for_next_statement();
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Drop the pending rows event of a connection and reset its statement
+  cache. Does nothing when the connection has no binlog cache manager.
+*/
+void wsrep_thd_binlog_stmt_rollback(THD * thd)
+{
+  DBUG_ENTER("wsrep_thd_binlog_stmt_rollback");
+  WSREP_DEBUG("wsrep_thd_binlog_stmt_rollback");
+
+  binlog_cache_mngr *const mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+  if (!mngr)
+  {
+    DBUG_VOID_RETURN;
+  }
+
+  thd->binlog_remove_pending_rows_event(TRUE, TRUE);
+  mngr->stmt_cache.reset();
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Register the binlog handlerton with the current statement and, when
+  requested, with the transaction.
+
+  When the connection has a binlog cache manager, this:
+    . defines an implicit savepoint if the trx-cache has none yet;
+    . registers the binlog handlerton callbacks;
+    . marks the binary log as read/write.
+
+  @param thd  connection to register for
+  @param trx  if true, register at transaction level in addition to
+              statement level
+*/
+void wsrep_register_binlog_handler(THD *thd, bool trx)
+{
+  DBUG_ENTER("register_binlog_handler");
+
+  binlog_cache_mngr *mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+  /* cache_mngr may be missing e.g. in mtr test ev51914.test */
+  if (!mngr)
+  {
+    DBUG_VOID_RETURN;
+  }
+
+  /*
+    Set an implicit savepoint in order to be able to truncate a trx-cache.
+    An undefined previous position means no savepoint has been taken yet.
+  */
+  if (mngr->trx_cache.get_prev_position() == MY_OFF_T_UNDEF)
+  {
+    my_off_t savepoint= 0;
+    binlog_trans_log_savepos(thd, &savepoint);
+    mngr->trx_cache.set_prev_position(savepoint);
+  }
+
+  /*
+    Set callbacks in order to be able to call commit or rollback.
+  */
+  if (trx)
+    trans_register_ha(thd, TRUE, binlog_hton, 0);
+  trans_register_ha(thd, FALSE, binlog_hton, 0);
+
+  /*
+    Set the binary log as read/write otherwise callbacks are not called.
+  */
+  thd->ha_data[binlog_hton->slot].ha_info[0].set_trx_read_write();
+  DBUG_VOID_RETURN;
+}
+
+#endif /* WITH_WSREP */