summaryrefslogtreecommitdiffstats
path: root/sql/gtid_index.cc
diff options
context:
space:
mode:
Diffstat (limited to 'sql/gtid_index.cc')
-rw-r--r--sql/gtid_index.cc1434
1 files changed, 1434 insertions, 0 deletions
diff --git a/sql/gtid_index.cc b/sql/gtid_index.cc
new file mode 100644
index 00000000..0467c7cf
--- /dev/null
+++ b/sql/gtid_index.cc
@@ -0,0 +1,1434 @@
+/*
+ Copyright (c) 2023 Kristian Nielsen <knielsen@knielsen-hq.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+*/
+
+#include "gtid_index.h"
+#include "sql_const.h"
+#include "log.h"
+
+
+static const uchar GTID_INDEX_MAGIC[4]= {
+ 254, 254, 12, 1,
+};
+
+Gtid_index_writer *Gtid_index_writer::hot_index_list= nullptr;
+/* gtid_index_mutex is inited in MYSQL_LOG::init_pthread_objects(). */
+mysql_mutex_t Gtid_index_writer::gtid_index_mutex;
+
+
+Gtid_index_writer::Gtid_index_writer(const char *filename, uint32 offset,
+ rpl_binlog_state_base *binlog_state,
+ uint32 opt_page_size,
+ my_off_t opt_span_min)
+ : offset_min_threshold(opt_span_min),
+ nodes(nullptr), previous_offset(0),
+ max_level(0), index_file(-1),
+ error_state(false), file_header_written(false), in_hot_index_list(false)
+{
+ uint32 count;
+ rpl_gtid *gtid_list;
+ page_size= opt_page_size;
+ pending_state.init();
+
+ if (alloc_level_if_missing(0))
+ {
+ give_error("Out of memory allocating node list");
+ return;
+ }
+
+ /*
+ Lock the index mutex at this point just before we create the new index
+ file on disk. From this point on, and until the index is fully written,
+ the reader will find us in the "hot index" list and will be able to read
+ from the index while it's still being constructed.
+ */
+ lock_gtid_index();
+
+ build_index_filename(filename);
+ int create_flags= O_WRONLY|O_TRUNC|O_BINARY|O_EXCL;
+ index_file= mysql_file_create(key_file_gtid_index, index_file_name,
+ CREATE_MODE, create_flags, MYF(0));
+ if (index_file < 0 && my_errno == EEXIST)
+ {
+ /*
+ It shouldn't happen that an old GTID index file remains, as we remove
+ them as part of RESET MASTER and PURGE BINARY LOGS. But if it happens
+ due to some external file copy of the user or something, delete any old
+ GTID index file first.
+ */
+ sql_print_information("Old GTID index file found '%s', deleting",
+ index_file_name);
+ my_errno= 0;
+ mysql_file_delete(key_file_gtid_index, index_file_name, MYF(0));
+ index_file= mysql_file_create(key_file_gtid_index, index_file_name,
+ CREATE_MODE, create_flags, MYF(0));
+ }
+ if (index_file < 0)
+ {
+ give_error("Failed to open new index file for writing");
+ goto err;
+ }
+
+ /*
+ Write out an initial index record, i.e. corresponding to the GTID_LIST
+ event / binlog state at the start of the binlog file.
+ */
+ count= binlog_state->count_nolock();
+ gtid_list= gtid_list_buffer(count);
+ if (count > 0)
+ {
+ if (!gtid_list)
+ goto err;
+ binlog_state->get_gtid_list_nolock(gtid_list, count);
+ }
+ write_record(offset, gtid_list, count);
+
+ insert_in_hot_index();
+
+err:
+ unlock_gtid_index();
+}
+
+
+Gtid_index_writer::~Gtid_index_writer()
+{
+ if (in_hot_index_list)
+ {
+ lock_gtid_index();
+ close();
+ unlock_gtid_index();
+ }
+
+ if (index_file > 0)
+ {
+ /*
+ Should have been closed by call to Gtid_index_writer::close().
+ We can at least avoid leaking file descriptor.
+ */
+ mysql_file_close(index_file, MYF(0));
+ }
+
+ if (nodes)
+ {
+ for (uint32 i= 0; i <= max_level; ++i)
+ delete nodes[i];
+ my_free(nodes);
+ }
+
+ /*
+ state.free() is not needed here, will be called from rpl_binlog_state_base
+ destructor.
+ */
+}
+
+
+void
+Gtid_index_writer::gtid_index_init()
+{
+ mysql_mutex_init(key_gtid_index_lock, &gtid_index_mutex, MY_MUTEX_INIT_SLOW);
+}
+
+void
+Gtid_index_writer::gtid_index_cleanup()
+{
+ mysql_mutex_destroy(&gtid_index_mutex);
+}
+
+
+const Gtid_index_writer *
+Gtid_index_writer::find_hot_index(const char *file_name)
+{
+ mysql_mutex_assert_owner(&gtid_index_mutex);
+
+ for (const Gtid_index_writer *p= hot_index_list; p; p= p->next_hot_index)
+ {
+ if (0 == strcmp(file_name, p->index_file_name))
+ return p;
+ }
+ return nullptr;
+}
+
+void
+Gtid_index_writer::insert_in_hot_index()
+{
+ mysql_mutex_assert_owner(&gtid_index_mutex);
+
+ next_hot_index= hot_index_list;
+ hot_index_list= this;
+ in_hot_index_list= true;
+}
+
+
+void
+Gtid_index_writer::remove_from_hot_index()
+{
+ mysql_mutex_assert_owner(&gtid_index_mutex);
+
+ Gtid_index_writer **next_ptr_ptr= &hot_index_list;
+ for (;;)
+ {
+ Gtid_index_writer *p= *next_ptr_ptr;
+ if (!p)
+ break;
+ if (p == this)
+ {
+ *next_ptr_ptr= p->next_hot_index;
+ break;
+ }
+ next_ptr_ptr= &p->next_hot_index;
+ }
+ next_hot_index= nullptr;
+ in_hot_index_list= false;
+}
+
+void
+Gtid_index_writer::process_gtid(uint32 offset, const rpl_gtid *gtid)
+{
+ rpl_gtid *gtid_list;
+ uint32 gtid_count;
+
+ if (process_gtid_check_batch(offset, gtid, &gtid_list, &gtid_count))
+ return; // Error
+
+ if (gtid_list)
+ async_update(offset, gtid_list, gtid_count);
+}
+
+
+int
+Gtid_index_writer::process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid,
+ rpl_gtid **out_gtid_list,
+ uint32 *out_gtid_count)
+{
+ uint32 count;
+ rpl_gtid *gtid_list;
+
+ mysql_mutex_assert_not_owner(&gtid_index_mutex);
+
+ if (unlikely(pending_state.update_nolock(gtid)))
+ {
+ give_error("Out of memory processing GTID for binlog GTID index");
+ return 1;
+ }
+ /*
+ Sparse index; we record only selected GTIDs, and scan the binlog forward
+ from there to find the exact spot.
+ */
+ if (offset - previous_offset < offset_min_threshold)
+ {
+ *out_gtid_list= nullptr;
+ *out_gtid_count= 0;
+ return 0;
+ }
+
+ count= pending_state.count_nolock();
+ DBUG_ASSERT(count > 0 /* Since we just updated with a GTID. */);
+ gtid_list= (rpl_gtid *)
+ my_malloc(key_memory_binlog_gtid_index, count*sizeof(*gtid_list), MYF(0));
+ if (unlikely(!gtid_list))
+ {
+ give_error("Out of memory allocating GTID list for binlog GTID index");
+ return 1;
+ }
+ if (unlikely(pending_state.get_gtid_list_nolock(gtid_list, count)))
+ {
+ /* Shouldn't happen as we allocated the list with the correct length. */
+ DBUG_ASSERT(false);
+ give_error("Internal error allocating GTID list for binlog GTID index");
+ my_free(gtid_list);
+ return 1;
+ }
+ pending_state.reset_nolock();
+ previous_offset= offset;
+ *out_gtid_list= gtid_list;
+ *out_gtid_count= count;
+ return 0;
+}
+
+
+int
+Gtid_index_writer::async_update(uint32 event_offset,
+ rpl_gtid *gtid_list,
+ uint32 gtid_count)
+{
+ lock_gtid_index();
+ int res= write_record(event_offset, gtid_list, gtid_count);
+ unlock_gtid_index();
+ my_free(gtid_list);
+ return res;
+}
+
+
+void
+Gtid_index_writer::close()
+{
+ lock_gtid_index();
+ if (!error_state)
+ {
+
+ /*
+ Write out the remaining pending pages, and insert the final child pointer
+ in interior nodes.
+ */
+ for (uint32 level= 0; ; ++level)
+ {
+ uint32 node_ptr= write_current_node(level, level==max_level);
+ nodes[level]->reset();
+ if (!node_ptr || level >= max_level)
+ break;
+ add_child_ptr(level+1, node_ptr);
+ }
+ }
+ remove_from_hot_index();
+ unlock_gtid_index();
+
+ if (!error_state)
+ {
+ if (mysql_file_sync(index_file, MYF(0)))
+ give_error("Error syncing index file to disk");
+ }
+
+ mysql_file_close(index_file, MYF(0));
+ index_file= (File)-1;
+}
+
+
+Gtid_index_base::Index_node_base::Index_node_base()
+ : first_page(nullptr), current_page(nullptr), current_ptr(nullptr)
+{
+}
+
+
+Gtid_index_base::Index_node_base::~Index_node_base()
+{
+ free_pages();
+}
+
+
+void
+Gtid_index_base::Index_node_base::free_pages()
+{
+ for (Node_page *p= first_page; p; )
+ {
+ Node_page *q= p->next;
+ my_free(p);
+ p= q;
+ }
+}
+
+
+void
+Gtid_index_base::Index_node_base::reset()
+{
+ free_pages();
+ first_page= current_page= nullptr;
+}
+
+
+Gtid_index_base::Gtid_index_base()
+ : gtid_buffer(nullptr), gtid_buffer_alloc(0)
+{
+}
+
+
+Gtid_index_base::~Gtid_index_base()
+{
+ if (gtid_buffer_alloc > 0)
+ my_free(gtid_buffer);
+}
+
+
+void
+Gtid_index_base::make_gtid_index_file_name(char *out_name, size_t bufsize,
+ const char *base_filename)
+{
+ char *p= strmake(out_name, base_filename, bufsize-1);
+ size_t remain= bufsize - (p - out_name);
+ strmake(p, ".idx", remain-1);
+}
+
+
+void
+Gtid_index_base::build_index_filename(const char *filename)
+{
+ make_gtid_index_file_name(index_file_name, sizeof(index_file_name), filename);
+}
+
+
+rpl_gtid *
+Gtid_index_base::gtid_list_buffer(uint32 count)
+{
+ if (gtid_buffer_alloc >= count)
+ return gtid_buffer;
+ rpl_gtid *new_buffer= (rpl_gtid *)
+ my_malloc(key_memory_binlog_gtid_index, count*sizeof(*new_buffer), MYF(0));
+ if (!new_buffer)
+ {
+ give_error("Out of memory allocating buffer for GTID list");
+ return NULL;
+ }
+ my_free(gtid_buffer);
+ gtid_buffer= new_buffer;
+ gtid_buffer_alloc= count;
+ return new_buffer;
+}
+
+
+Gtid_index_writer::Index_node::Index_node(uint32 level_)
+ : num_records(0), level(level_), force_spill_page(false)
+{
+ state.init();
+}
+
+
+Gtid_index_writer::Index_node::~Index_node()
+{
+ free_pages();
+}
+
+
+uint32
+Gtid_index_writer::write_current_node(uint32 level, bool is_root)
+{
+ Index_node *n= nodes[level];
+
+ uint32 node_pos= (uint32)mysql_file_tell(index_file, MYF(0));
+
+ for (Node_page *p= n->first_page; p ; p= p->next)
+ {
+ if (unlikely(is_root))
+ *(p->flag_ptr) |= PAGE_FLAG_ROOT;
+ if (likely(!p->next))
+ *(p->flag_ptr) |= PAGE_FLAG_LAST;
+ int4store(p->page + page_size - CHECKSUM_LEN,
+ my_checksum(0, p->page, page_size - CHECKSUM_LEN));
+ if (mysql_file_write(index_file, p->page, page_size, MYF(MY_NABP)))
+ {
+ give_error("Error writing index page");
+ return 0;
+ }
+ }
+
+ DBUG_ASSERT(node_pos % page_size == 0);
+ /* Page numbers are +1 just so that zero can denote invalid page pointer. */
+ return 1 + (node_pos / (uint32)page_size);
+}
+
+
+void
+Gtid_index_writer::Index_node::reset()
+{
+ Index_node_base::reset();
+ state.reset_nolock();
+ num_records= 0;
+ force_spill_page= false;
+}
+
+
+/*
+ Make sure there is requested space in the current page, by allocating a
+ new spill page if necessary.
+*/
+int
+Gtid_index_writer::reserve_space(Index_node *n, size_t bytes)
+{
+ DBUG_ASSERT(bytes <= page_size);
+ if (likely(n->current_page) &&
+ likely(n->current_ptr - n->current_page->page + bytes <=
+ (page_size - CHECKSUM_LEN)))
+ return 0;
+ /* Not enough room, allocate a spill page. */
+ Node_page *page= alloc_page();
+ n->force_spill_page= false;
+ if (!page)
+ return 1;
+ n->current_ptr=
+ init_header(page, n->level==0, !n->current_page);
+ if (n->current_page)
+ n->current_page->next= page;
+ else
+ n->first_page= page;
+ n->current_page= page;
+ return 0;
+}
+
+
+int
+Gtid_index_writer::do_write_record(uint32 level,
+ uint32 event_offset,
+ const rpl_gtid *gtid_list,
+ uint32 gtid_count)
+{
+ DBUG_ASSERT(level <= max_level);
+ Index_node *n= nodes[level];
+ if (reserve_space(n, 8))
+ return 1;
+ /* Store the count as +1, so that 0 can mean "no more records". */
+ int4store(n->current_ptr, gtid_count+1);
+ int4store(n->current_ptr+4, event_offset);
+ n->current_ptr+= 8;
+ for (uint32 i= 0; i < gtid_count; ++i)
+ {
+ if (reserve_space(n, 16))
+ return 1;
+ int4store(n->current_ptr, gtid_list[i].domain_id);
+ int4store(n->current_ptr+4, gtid_list[i].server_id);
+ int8store(n->current_ptr+8, gtid_list[i].seq_no);
+ n->current_ptr+= 16;
+ }
+
+ ++n->num_records;
+ return 0;
+}
+
+
+/*
+ Add a child pointer to the current node on LEVEL.
+ The first page has node_ptr=1 just so that a zero node_ptr can be used as
+ a no/invalid value (effectively node_ptr points to the end of the target
+ page, in unit of pages).
+
+ Adding a child pointer shouldn't spill to a new page, code must make sure that
+ there is always room for the final child pointer in current non-leaf node.
+*/
+int
+Gtid_index_writer::add_child_ptr(uint32 level, my_off_t node_offset)
+{
+ DBUG_ASSERT(level <= max_level);
+ DBUG_ASSERT(node_offset > 0);
+ Index_node *n= nodes[level];
+ if (reserve_space(n, 4))
+ return 1;
+ DBUG_ASSERT(n->current_page);
+ DBUG_ASSERT((size_t)(n->current_ptr - n->current_page->page + 4) <=
+ page_size - CHECKSUM_LEN);
+
+ int4store(n->current_ptr, node_offset);
+ n->current_ptr+= 4;
+ return 0;
+}
+
+
+/*
+ Write one index record to the GTID index, flushing nodes and allocating
+ new nodes as necessary.
+*/
+int
+Gtid_index_writer::write_record(uint32 event_offset,
+ const rpl_gtid *gtid_list,
+ uint32 gtid_count)
+{
+ if (error_state)
+ return 1; /* Avoid continuing on a possibly corrupt state. */
+
+ uint32 level= 0;
+ /*
+ The most frequent case is when there is room in the current page for the
+ current position to be written, in which case we exit early in the first
+ iteration of the following loop.
+
+ In the general case, we move up through the path to the root, writing
+ lower-level node page to disk and adding child pointers in higher-level
+ nodes, until we reach a node that has room. This final node may be a
+ freshly allocated new root node in the few times when the height of the
+ tree increases.
+ */
+ for (;;)
+ {
+ Index_node *n= nodes[level];
+ if (update_gtid_state(&n->state, gtid_list, gtid_count))
+ return give_error("Out of memory updating the local GTID state");
+
+ if (check_room(level, gtid_count))
+ {
+ /* There is room in the node, just add the index record. */
+ return do_write_record(level, event_offset, gtid_list, gtid_count);
+ }
+
+ /*
+ This node is full:
+ - First, write out this node to disk.
+ - Add a child pointer in the parent node (allocating one if needed).
+ - On level 0, allocate a new leaf node and add the index record there.
+ - On levels >0, skip the last index record when the node gets full
+ (B+-Tree has (k-1) keys for k child pointers).
+ - Loop to the parent node to add an index record there.
+ */
+ uint32 node_ptr= write_current_node(level, false);
+ if (!node_ptr)
+ return 1;
+ if (alloc_level_if_missing(level+1) ||
+ add_child_ptr(level+1, node_ptr))
+ return 1;
+ uint32 new_count= n->state.count_nolock();
+ rpl_gtid *new_gtid_list= gtid_list_buffer(new_count);
+ if (new_count > 0 && !new_gtid_list)
+ return 1;
+ if (n->state.get_gtid_list_nolock(new_gtid_list, new_count))
+ return give_error("Internal error processing GTID state");
+ n->reset();
+ if (level == 0)
+ {
+ if (do_write_record(level, event_offset, new_gtid_list, new_count))
+ return 1;
+ }
+ else
+ {
+ /*
+ Allocate a page for the node. This is mostly to help the reader of hot
+ index to not see NULL pointers, and we will need the page later anyway
+ to put at least one child pointer to the level below.
+ */
+ if (reserve_space(n, 4))
+ return 1;
+ }
+ gtid_list= new_gtid_list;
+ gtid_count= new_count;
+ ++level;
+ }
+ // NotReached.
+}
+
+
+bool
+Gtid_index_writer::check_room(uint32 level, uint32 gtid_count)
+{
+ Index_node *n= nodes[level];
+ /* There's always room in an empty (to-be-allocated) page. */
+ if (!n->current_page || n->num_records == 0)
+ return true;
+ /*
+ Make sure we use at least 1/2 a page of room after the initial record,
+ setting a flag to allocate a spill page later if needed.
+ */
+ size_t avail= page_size - CHECKSUM_LEN - (n->current_ptr - n->current_page->page);
+ if (n->num_records==1 && avail < page_size/2)
+ {
+ n->force_spill_page= true;
+ return true;
+ }
+ if (n->force_spill_page)
+ return true;
+ size_t needed= 8 + 16*gtid_count;
+ /* Non-leaf pages need extra 4 bytes for a child pointer. */
+ if (level > 0)
+ needed+= 4;
+ return needed <= avail;
+}
+
+
+int
+Gtid_index_writer::alloc_level_if_missing(uint32 level)
+{
+ if (likely(nodes))
+ {
+ if (likely(max_level >= level))
+ return 0;
+ DBUG_ASSERT(level == max_level+1); // Alloc one at a time
+ }
+
+ Index_node *node= new Index_node(level);
+ if (!node)
+ return give_error("Out of memory allocating new node");
+ Index_node **new_nodes= (Index_node **)
+ my_realloc(key_memory_binlog_gtid_index, nodes, (level+1)*sizeof(*nodes),
+ MYF(MY_ALLOW_ZERO_PTR|MY_ZEROFILL));
+ if (!new_nodes)
+ {
+ delete node;
+ return give_error("Out of memory allocating larger node list");
+ }
+ new_nodes[level]= node;
+ nodes= new_nodes;
+ max_level= level;
+ return 0;
+}
+
+
+/*
+ Initialize the start of a data page.
+ This is at the start of a page, except for the very first page where it
+ comes after the global file header.
+ Format:
+ 0 flags.
+ 1-3 unused padding/reserved.
+
+ The argument FIRST denotes if this is the first page (if false it is a
+ continuation page).
+*/
+uchar *
+Gtid_index_writer::init_header(Node_page *page, bool is_leaf, bool is_first)
+{
+ uchar *p= page->page;
+ bool is_file_header= !file_header_written;
+
+ if (unlikely(is_file_header))
+ {
+ memcpy(p, GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC));
+ p+= sizeof(GTID_INDEX_MAGIC);
+ *p++= GTID_INDEX_VERSION_MAJOR;
+ *p++= GTID_INDEX_VERSION_MINOR;
+ /* Flags/padding currently unused. */
+ *p++= 0;
+ *p++= 0;
+ int4store(p, page_size);
+ p+= 4;
+ DBUG_ASSERT(p == page->page + GTID_INDEX_FILE_HEADER_SIZE);
+ file_header_written= true;
+ }
+
+ uchar flags= 0;
+ if (is_leaf)
+ flags|= PAGE_FLAG_IS_LEAF;
+ if (unlikely(!is_first))
+ flags|= PAGE_FLAG_IS_CONT;
+ page->flag_ptr= p;
+ *p++= flags;
+ /* Padding/reserved. */
+ p+= 3;
+ DBUG_ASSERT(p == page->page +
+ (is_file_header ? GTID_INDEX_FILE_HEADER_SIZE : 0) +
+ GTID_INDEX_PAGE_HEADER_SIZE);
+ DBUG_ASSERT((size_t)(p - page->page) < page_size - CHECKSUM_LEN);
+ return p;
+}
+
+
+int
+Gtid_index_base::update_gtid_state(rpl_binlog_state_base *state,
+ const rpl_gtid *gtid_list, uint32 gtid_count)
+{
+ for (uint32 i= 0; i < gtid_count; ++i)
+ if (state->update_nolock(&gtid_list[i]))
+ return 1;
+ return 0;
+}
+
+
+Gtid_index_base::Node_page *Gtid_index_base::alloc_page()
+{
+ Node_page *new_node= (Node_page *)
+ my_malloc(key_memory_binlog_gtid_index,
+ sizeof(Node_page) + page_size,
+ MYF(MY_ZEROFILL));
+ if (!new_node)
+ give_error("Out of memory for allocating index page");
+ return new_node;
+}
+
+
+int Gtid_index_writer::give_error(const char *msg)
+{
+ if (!error_state)
+ {
+ sql_print_information("Error during binlog GTID index creation, will "
+ "fallback to slower sequential binlog scan. "
+ "Error is: %s", msg);
+ error_state= true;
+ }
+ return 1;
+}
+
+
+Gtid_index_reader::Gtid_index_reader()
+ : n(nullptr), index_file(-1),
+ file_open(false), index_valid(false), has_root_node(false),
+ version_major(0), version_minor(0)
+{
+ current_state.init();
+ compare_state.init();
+}
+
+
+Gtid_index_reader::~Gtid_index_reader()
+{
+ if (file_open)
+ mysql_file_close(index_file, MYF(0));
+}
+
+
+int
+Gtid_index_reader::search_offset(uint32 in_offset,
+ uint32 *out_offset, uint32 *out_gtid_count)
+{
+ in_search_offset= in_offset;
+ search_cmp_function= &Gtid_index_reader::search_cmp_offset;
+
+ return do_index_search(out_offset, out_gtid_count);
+}
+
+int
+Gtid_index_reader::search_gtid_pos(slave_connection_state *in_gtid_pos,
+ uint32 *out_offset, uint32 *out_gtid_count)
+{
+ in_search_gtid_pos= in_gtid_pos;
+ search_cmp_function= &Gtid_index_reader::search_cmp_gtid_pos;
+
+ int res= do_index_search(out_offset, out_gtid_count);
+ /* Let's not leave a dangling pointer to the caller's memory. */
+ in_search_gtid_pos= nullptr;
+
+ return res;
+}
+
+rpl_gtid *
+Gtid_index_reader::search_gtid_list()
+{
+ return gtid_buffer;
+}
+
+
+int
+Gtid_index_reader::search_cmp_offset(uint32 offset,
+ rpl_binlog_state_base *state)
+{
+ if (offset <= in_search_offset)
+ return 0;
+ else
+ return -1;
+}
+
+
+int
+Gtid_index_reader::search_cmp_gtid_pos(uint32 offset,
+ rpl_binlog_state_base *state)
+{
+ if (state->is_before_pos(in_search_gtid_pos))
+ return 0;
+ else
+ return -1;
+}
+
+
+int
+Gtid_index_reader::next_page()
+{
+ if (!read_page->next)
+ return 1;
+ read_page= read_page->next;
+ read_ptr= read_page->flag_ptr + 4;
+ return 0;
+}
+
+
+int
+Gtid_index_reader::find_bytes(uint32 num_bytes)
+{
+ if ((my_ptrdiff_t)(read_ptr - read_page->page + num_bytes) <=
+ (my_ptrdiff_t)(page_size - CHECKSUM_LEN))
+ return 0;
+ return next_page();
+}
+
+
+int
+Gtid_index_reader::get_child_ptr(uint32 *out_child_ptr)
+{
+ if (find_bytes(4))
+ return give_error("Corrupt index, short index node");
+ *out_child_ptr= (uint32)uint4korr(read_ptr);
+ read_ptr+= 4;
+ return 0;
+}
+
+
+/*
+ Read the start of an index record (count of GTIDs in the differential state
+ and offset).
+ Returns:
+ 0 ok
+ 1 EOF, no more data in this node
+*/
+int
+Gtid_index_reader::get_offset_count(uint32 *out_offset, uint32 *out_gtid_count)
+{
+ if (find_bytes(8))
+ return 1;
+ uint32 gtid_count= uint4korr(read_ptr);
+ if (gtid_count == 0)
+ {
+ /* 0 means invalid/no record (we store N+1 for N GTIDs in record). */
+ return 1;
+ }
+ *out_gtid_count= gtid_count - 1;
+ *out_offset= uint4korr(read_ptr + 4);
+ read_ptr+= 8;
+ return 0;
+}
+
+
+int
+Gtid_index_reader::get_gtid_list(rpl_gtid *out_gtid_list, uint32 count)
+{
+ for (uint32 i= 0; i < count; ++i)
+ {
+ if (find_bytes(16))
+ return give_error("Corrupt index, short index node");
+ out_gtid_list[i].domain_id= uint4korr(read_ptr);
+ out_gtid_list[i].server_id= uint4korr(read_ptr + 4);
+ out_gtid_list[i].seq_no= uint8korr(read_ptr + 8);
+ read_ptr+= 16;
+ }
+ return 0;
+}
+
+
+int
+Gtid_index_reader::open_index_file(const char *binlog_filename)
+{
+ close_index_file();
+ build_index_filename(binlog_filename);
+ if ((index_file= mysql_file_open(key_file_gtid_index, index_file_name,
+ O_RDONLY|O_BINARY, MYF(0))) < 0)
+ return 1; // No error for missing index (eg. upgrade)
+
+ file_open= true;
+ if (read_file_header())
+ return 1;
+
+ return 0;
+}
+
+void
+Gtid_index_reader::close_index_file()
+{
+ if (!file_open)
+ return;
+ mysql_file_close(index_file, MYF(0));
+ file_open= false;
+ index_valid= false;
+}
+
+
+int
+Gtid_index_reader::do_index_search(uint32 *out_offset, uint32 *out_gtid_count)
+{
+ /* In cold index, we require a complete index with a valid root node. */
+ if (!has_root_node)
+ return -1;
+
+ return do_index_search_root(out_offset, out_gtid_count);
+}
+
+
+int
+Gtid_index_reader::do_index_search_root(uint32 *out_offset,
+ uint32 *out_gtid_count)
+{
+ current_state.reset_nolock();
+ compare_state.reset_nolock();
+ /*
+ These states will be initialized to the full state stored at the start of
+ the root node and then incrementally updated.
+ */
+ bool current_state_updated= false;
+
+ if (read_root_node())
+ return -1;
+ for (;;)
+ {
+ if (*n->first_page->flag_ptr & PAGE_FLAG_IS_LEAF)
+ break;
+
+ if (compare_state.load_nolock(&current_state))
+ {
+ give_error("Out of memory allocating GTID list");
+ return -1;
+ }
+ uint32 child_ptr;
+ if (get_child_ptr(&child_ptr))
+ return -1;
+
+ /* Scan over the keys in the node to find the child pointer to follow */
+ for (;;)
+ {
+ uint32 offset, gtid_count;
+ int res= get_offset_count(&offset, &gtid_count);
+ if (res == 1) // EOF?
+ {
+ /* Follow the right-most child pointer. */
+ if (read_node(child_ptr))
+ return -1;
+ break;
+ }
+ rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
+ uint32 child2_ptr;
+ if ((gtid_count > 0 && !gtid_list) ||
+ get_gtid_list(gtid_list, gtid_count) ||
+ get_child_ptr(&child2_ptr))
+ return -1;
+ if (update_gtid_state(&compare_state, gtid_list, gtid_count))
+ return -1;
+ int cmp= (this->*search_cmp_function)(offset, &compare_state);
+ if (cmp < 0)
+ {
+ /* Follow the left child of this key. */
+ if (read_node(child_ptr))
+ return -1;
+ break;
+ }
+ /* Continue to scan the next key. */
+ update_gtid_state(&current_state, gtid_list, gtid_count);
+ current_state_updated= true;
+ current_offset= offset;
+ child_ptr= child2_ptr;
+ }
+ }
+ return do_index_search_leaf(current_state_updated,
+ out_offset, out_gtid_count);
+}
+
+int Gtid_index_reader::do_index_search_leaf(bool current_state_updated,
+ uint32 *out_offset,
+ uint32 *out_gtid_count)
+{
+ uint32 offset, gtid_count;
+ int res= get_offset_count(&offset, &gtid_count);
+ if (res == 1)
+ {
+ DBUG_ASSERT(0);
+ give_error("Corrupt index; empty leaf node");
+ return -1;
+ }
+ rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
+ if ((gtid_count > 0 && !gtid_list) ||
+ get_gtid_list(gtid_list, gtid_count))
+ return -1;
+ /*
+ The first key is ignored (already included in the current state), unless
+ it is the very first state in the index.
+ */
+ if (!current_state_updated)
+ update_gtid_state(&current_state, gtid_list, gtid_count);
+ current_offset= offset;
+ if (compare_state.load_nolock(&current_state))
+ {
+ give_error("Out of memory allocating GTID state");
+ return -1;
+ }
+ int cmp= (this->*search_cmp_function)(offset, &compare_state);
+ if (cmp < 0)
+ return 0; // Search position is before start of index.
+
+ /* Scan over the keys in the leaf node. */
+ for (;;)
+ {
+ uint32 offset, gtid_count;
+ int res= get_offset_count(&offset, &gtid_count);
+ if (res == 1) // EOF?
+ {
+ /* Reached end of leaf, last key is the one searched for. */
+ break;
+ }
+ gtid_list= gtid_list_buffer(gtid_count);
+ if ((gtid_count > 0 && !gtid_list) ||
+ get_gtid_list(gtid_list, gtid_count))
+ return -1;
+ if (update_gtid_state(&compare_state, gtid_list, gtid_count))
+ return -1;
+ cmp= (this->*search_cmp_function)(offset, &compare_state);
+ if (cmp < 0)
+ {
+ /* Next key is larger, so current state is the one searched for. */
+ break;
+ }
+ update_gtid_state(&current_state, gtid_list, gtid_count);
+ current_offset= offset;
+ }
+
+ *out_offset= current_offset;
+ *out_gtid_count= current_state.count_nolock();
+ /* Save the result in the shared gtid list buffer. */
+ if ((!(gtid_list= gtid_list_buffer(*out_gtid_count)) && *out_gtid_count > 0) ||
+ current_state.get_gtid_list_nolock(gtid_list, *out_gtid_count))
+ return -1;
+
+ return 1;
+}
+
+
+/*
+ Read the file header and check that it's valid and that the format is not
+ too new a version for us to be able to read it.
+*/
+int
+Gtid_index_reader::read_file_header()
+{
+ if (!file_open)
+ return 1;
+
+ uchar buf[GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE];
+
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, 0, MY_SEEK_SET, MYF(0)) ||
+ mysql_file_read(index_file, buf,
+ GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE,
+ MYF(MY_NABP)))
+ return give_error("Error reading page from index file");
+ if (memcmp(&buf[0], GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC)))
+ return give_error("Corrupt index file, magic not found in header");
+ version_major= buf[4];
+ version_minor= buf[5];
+ /* We cannot safely read a major version we don't know about. */
+ if (version_major > GTID_INDEX_VERSION_MAJOR)
+ return give_error("Incompatible index file, version too high");
+ page_size= uint4korr(&buf[8]);
+
+ /* Verify checksum integrity of page_size and major/minor version. */
+ uint32 crc= my_checksum(0, buf, sizeof(buf));
+ uchar *buf3= (uchar *)
+ my_malloc(key_memory_binlog_gtid_index, page_size - sizeof(buf), MYF(0));
+ if (!buf3)
+ return give_error("Error allocating memory for index page");
+ int res= 0;
+ if (mysql_file_read(index_file, buf3, page_size - sizeof(buf), MYF(MY_NABP)))
+ res= give_error("Error reading page from index file");
+ else
+ {
+ crc= my_checksum(crc, buf3, page_size - sizeof(buf) - CHECKSUM_LEN);
+ if (crc != uint4korr(buf3 + page_size - sizeof(buf) - CHECKSUM_LEN))
+ res= give_error("Corrupt page, invalid checksum");
+ }
+ my_free(buf3);
+ if (res)
+ return res;
+
+ /*
+ Check that there is a valid root node at the end of the file.
+ If there is not, the index may be a "hot index" that is currently being
+ constructed. Or it was only partially written before server crash and not
+ recovered for some reason.
+ */
+ uchar flags= buf[GTID_INDEX_PAGE_HEADER_SIZE];
+ constexpr uchar needed_flags= PAGE_FLAG_ROOT|PAGE_FLAG_LAST;
+ if ((flags & needed_flags) == needed_flags)
+ {
+ /* Special case: the index is a single page, which is the root node. */
+ has_root_node= true;
+ }
+ else
+ {
+ uchar buf2[GTID_INDEX_PAGE_HEADER_SIZE];
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)page_size,
+ MY_SEEK_END, MYF(0)) ||
+ mysql_file_read(index_file, buf2, GTID_INDEX_PAGE_HEADER_SIZE,
+ MYF(MY_NABP)))
+ return give_error("Error reading root page from index file");
+ flags= buf2[0];
+ has_root_node= ((flags & needed_flags) == needed_flags);
+ /* No need to verify checksum here, will be done by read_root_node(). */
+ }
+ index_valid= true;
+ return 0;
+}
+
+
+int
+Gtid_index_reader::verify_checksum(Gtid_index_base::Node_page *page)
+{
+ uint32 calc_checksum= my_checksum(0, page->page, page_size - CHECKSUM_LEN);
+ uint32 read_checksum= uint4korr(page->page + page_size - CHECKSUM_LEN);
+ if (calc_checksum != read_checksum)
+ return give_error("Corrupt page, invalid checksum");
+ return 0;
+}
+
+
+Gtid_index_base::Node_page *
+Gtid_index_reader::alloc_and_read_page()
+{
+ Node_page *page= alloc_page();
+ if (!page)
+ {
+ give_error("Error allocating memory for index page");
+ return nullptr;
+ }
+ if (mysql_file_read(index_file, page->page, page_size, MYF(MY_NABP)))
+ {
+ my_free(page);
+ give_error("Error reading page from index file");
+ return nullptr;
+ }
+ if (verify_checksum(page))
+ {
+ my_free(page);
+ return nullptr;
+ }
+ return page;
+}
+
+
+int
+Gtid_index_reader::read_root_node()
+{
+ if (!index_valid || !has_root_node)
+ return 1;
+
+ cold_node.reset();
+ n= &cold_node;
+ /*
+ Read pages one by one from the back of the file until we have a complete
+ root node.
+ */
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)page_size,
+ MY_SEEK_END, MYF(0)))
+ return give_error("Error seeking index file");
+
+ for (;;)
+ {
+ Node_page *page= alloc_and_read_page();
+ if (!page)
+ return 1;
+ if (mysql_file_tell(index_file, MYF(0)) == page_size)
+ page->flag_ptr= &page->page[GTID_INDEX_FILE_HEADER_SIZE];
+ else
+ page->flag_ptr= &page->page[0];
+ page->next= n->first_page;
+ n->first_page= page;
+ uchar flags= *page->flag_ptr;
+ if (unlikely(!(flags & PAGE_FLAG_ROOT)))
+ return give_error("Corrupt or truncated index, no root node found");
+ if (!(flags & PAGE_FLAG_IS_CONT))
+ break; // Found start of root node
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)(2*page_size),
+ MY_SEEK_CUR, MYF(0)))
+ return give_error("Error seeking index file for multi-page root node");
+ }
+
+ read_page= n->first_page;
+ read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
+ return 0;
+}
+
+
+int
+Gtid_index_reader::read_node(uint32 page_ptr)
+{
+ DBUG_ASSERT(page_ptr != 0 /* No zero child pointers in on-disk pages. */);
+ if (!index_valid || !page_ptr)
+ return 1;
+ return read_node_cold(page_ptr);
+}
+
+
+int
+Gtid_index_reader::read_node_cold(uint32 page_ptr)
+{
+ if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, (page_ptr-1)*page_size,
+ MY_SEEK_SET, MYF(0)))
+ return give_error("Error seeking index file");
+
+ bool file_header= (page_ptr == 1);
+ cold_node.reset();
+ n= &cold_node;
+ Node_page **next_ptr_ptr= &n->first_page;
+ for (;;)
+ {
+ Node_page *page= alloc_and_read_page();
+ if (!page)
+ return 1;
+ page->flag_ptr= &page->page[file_header ? GTID_INDEX_FILE_HEADER_SIZE : 0];
+ file_header= false;
+ /* Insert the page at the end of the list. */
+ page->next= nullptr;
+ *next_ptr_ptr= page;
+ next_ptr_ptr= &page->next;
+
+ uchar flags= *(page->flag_ptr);
+ if (flags & PAGE_FLAG_LAST)
+ break;
+ }
+
+ read_page= n->first_page;
+ read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
+ return 0;
+}
+
+
+int Gtid_index_reader::give_error(const char *msg)
+{
+ sql_print_information("Error reading binlog GTID index, will "
+ "fallback to slower sequential binlog scan. "
+ "Error is: %s", msg);
+ return 1;
+}
+
+
+Gtid_index_reader_hot::Gtid_index_reader_hot()
+ : hot_writer(nullptr)
+{
+}
+
+
+int
+Gtid_index_reader_hot::get_child_ptr(uint32 *out_child_ptr)
+{
+ if (find_bytes(4))
+ {
+ /*
+ If reading hot index, EOF or zero child ptr means the child pointer has
+ not yet been written. A zero out_child_ptr makes read_node() read the
+ hot node for the child.
+ */
+ if (hot_writer)
+ {
+ *out_child_ptr= 0;
+ return 0;
+ }
+ return give_error("Corrupt index, short index node");
+ }
+ *out_child_ptr= (uint32)uint4korr(read_ptr);
+ read_ptr+= 4;
+ return 0;
+}
+
+
+int
+Gtid_index_reader_hot::do_index_search(uint32 *out_offset,
+ uint32 *out_gtid_count)
+{
+ /* Check for a "hot" index. */
+ Gtid_index_writer::lock_gtid_index();
+ hot_writer= Gtid_index_writer::find_hot_index(index_file_name);
+ if (!hot_writer)
+ {
+ Gtid_index_writer::unlock_gtid_index();
+ /*
+ Check the index file header (and index end) again, in case it was
+ hot when open_index_file() was called, but became cold in the meantime.
+ */
+ if (!has_root_node && Gtid_index_reader::read_file_header())
+ return -1;
+ }
+
+ int res= do_index_search_root(out_offset, out_gtid_count);
+
+ if (hot_writer)
+ {
+ hot_writer= nullptr;
+ Gtid_index_writer::unlock_gtid_index();
+ }
+ return res;
+}
+
+
+int
+Gtid_index_reader_hot::read_file_header()
+{
+ if (!file_open)
+ return 1;
+
+ Gtid_index_writer::lock_gtid_index();
+ hot_writer= Gtid_index_writer::find_hot_index(index_file_name);
+ if (!hot_writer)
+ Gtid_index_writer::unlock_gtid_index();
+
+ int res;
+ if (hot_writer && hot_writer->max_level == 0)
+ {
+ /*
+ No pages from the hot index have been written to disk, there's just a
+ single incomplete node at level 0.
+ We have to read the file header from the in-memory page.
+ */
+ uchar *p= hot_writer->nodes[0]->first_page->page;
+ page_size= uint4korr(p + 8);
+ has_root_node= false;
+ index_valid= true;
+ res= 0;
+ }
+ else
+ res= Gtid_index_reader::read_file_header();
+
+ if (hot_writer)
+ {
+ hot_writer= nullptr;
+ Gtid_index_writer::unlock_gtid_index();
+ }
+ return res;
+}
+
+
+int
+Gtid_index_reader_hot::read_root_node()
+{
+ if (!index_valid)
+ return 1;
+
+ if (hot_writer)
+ {
+ hot_level= hot_writer->max_level;
+ return read_node_hot();
+ }
+ if (has_root_node)
+ {
+ return Gtid_index_reader::read_root_node();
+ }
+ return 1;
+}
+
+
+int
+Gtid_index_reader_hot::read_node(uint32 page_ptr)
+{
+ if (!index_valid || (!page_ptr && !hot_writer))
+ return 1;
+
+ if (hot_writer)
+ {
+ if (!page_ptr)
+ {
+ /*
+ The "hot" index is only partially written. Not yet written child pages
+ are indicated by zero child pointers. Such child pages are found from
+ the list of active nodes in the writer.
+ */
+ if (hot_level <= 0)
+ {
+ DBUG_ASSERT(0 /* Should be no child pointer to follow on leaf page. */);
+ return give_error("Corrupt hot index (child pointer on leaf page");
+ }
+ DBUG_ASSERT(n == hot_writer->nodes[hot_level]);
+ --hot_level;
+ return read_node_hot();
+ }
+
+ /*
+ We started searching the "hot" index, but now we've reached a "cold"
+ part of the index that's already fully written. So leave the "hot index"
+ mode and continue reading pages from the on-disk index from here.
+ */
+ hot_writer= nullptr;
+ Gtid_index_writer::unlock_gtid_index();
+ }
+
+ return read_node_cold(page_ptr);
+}
+
+
+int
+Gtid_index_reader_hot::read_node_hot()
+{
+ if (hot_writer->error_state)
+ return give_error("Cannot access hot index");
+ n= hot_writer->nodes[hot_level];
+ read_page= n->first_page;
+ /* The writer should allocate pages for all nodes. */
+ DBUG_ASSERT(read_page != nullptr);
+ if (!read_page)
+ return give_error("Page not available in hot index");
+ read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
+ return 0;
+}