1 files changed, 521 insertions, 0 deletions
diff --git a/sql/gtid_index.h b/sql/gtid_index.h
new file mode 100644
index 00000000..4674bf97
--- /dev/null
+++ b/sql/gtid_index.h
@@ -0,0 +1,521 @@
+/*
+   Copyright (c) 2023 Kristian Nielsen <knielsen@knielsen-hq.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+*/
+
+#ifndef GTID_INDEX_H
+#define GTID_INDEX_H
+
+#include "my_global.h"
+#include "mysqld.h"
+#include "mariadb.h"
+#include "rpl_gtid.h"
+
+/*
+  This implements an on-disk index for each binlog file to speed up access to
+  the binlog at a specific offset or GTID position. This is primarily used when
+  a slave connects to the master, but also by user calling BINLOG_GTID_POS().
+
+  A connecting slave first scans the binlog files to find the last one with an
+  initial GTID_LIST event that lies before the starting GTID position. Then a
+  sequential scan of the binlog file is done until the requested GTID position
+  is found.
+
+  The binlog index conceptually extends this using index records corresponding
+  to different offset within one binlog file. Each record functions as if it
+  was the initial GTID_LIST event of a new binlog file, allowing the
+  sequential scan to start from the corresponding position. By having
+  sufficiently many index records, the scan will be fast.
+
+  The code that adds one record to the index is in two parts, a "sync" path
+  and an "async" path. The "sync" path in process_gtid_check_batch() does the
+  minimum amount of work which needs to run as part of transaction commit. The
+  actual writing of the index, in async_update(), can then be done as a
+  background task, minimizing the performance impact on transaction processing.
+  The "sync" and "async" paths each run single threaded, but can execute in
+  parallel with each other.
+
+  The index file is written incrementally together with the binlog file.
+  However there is no fsync()'s of the index file needed while writing. A
+  partially written index left by a crashing server will be re-written during
+  binlog recovery. A reader is allowed to use the index as it is begin written
+  (for the "hot" binlog file); such access is protected by mutex.
+
+  In case of lost or corrupt index, fallback to full sequential scan is done
+  (so performance will be affected but not correct functionality).
+
+  The index file is structured like a B+-tree. The index is append-only, so
+  also resembles a log-structured merge-tree, but with no merging of levels
+  needed as it covers a single fixed-size binlog file. This makes the building
+  of the tree relatively simple.
+
+  Keys in the tree consist of a GTID state (corresponding to a GTID_LIST
+  event) and the associated binlog file offset. All keys (except the first key
+  in each level of the tree) are delta-compressed to save space, holding only
+  the (domain_id, server_id) pairs that differ from the previous record.
+
+  The file is page-based. The first page contains the leftmost leaf node, and
+  the root node is at the end of the file. An incompletely written index file
+  can be detected by the last page in the file not being a root node page.
+  Nodes in the B+-tree usually fit in one page, but a node can be split across
+  multiple pages if GTID states are very large.
+
+  Page format:
+
+  The first page contains an extra file header:
+
+    Offset  Size  Description
+       0      4   MAGIC header identifying the file as a binlog index
+       4      1   Major version number. A new major version of the file format
+                  is not readable by older server versions.
+       5      1   Minor version number. Formats differing only in minor version
+                  are backwards compatible and can be read by older servers.
+       6      2   Padding/unused.
+       8      4   Page size.
+
+  Each page additionally contains this header:
+
+    Offset  Size  Description
+       0      1   Flags
+       1      3   Padding/unused
+
+  The last 4 bytes of each page is a 32-bit CRC.
+
+  An interior node is a sequence of
+    <child ptr> <key> <child ptr> <key> ... <key> <child ptr>
+  while a leaf node has only keys.
+
+  A child pointer is stored as 4 byte integer. The first page is 1, so that
+  0 can be used to denote "not present".
+
+  Format of a key:
+
+    Offset  Size  Description
+      0       4   Number of GTIDs in the key, plus 1. Or 0 for EOF.
+      4       4   Binlog file offset
+      8       4   Domain_id of first GTID
+     12       4   Server_id of first GTID
+     16       8   Seq_no of first GTID
+     ...          and so on for each GTID in the key.
+
+  A node typically fits in one page. But if the GTID state is very big (or
+  the page size very small), multiple pages may be used. When a node is split,
+  it can be split after a child pointer or before or after a GTID, but not
+  elsewhere.
+
+Here is an example GTID index with page_size=64 containing 3 records:
+  Offset  GTID state
+  0x11d   [empty]
+  0x20e   [0-1-1]
+  0x2ad   [0-1-2]
+The example contains 3 nodes, each stored in a single page. Two leaf nodes and
+one interior root node.
+
+Page 1 (leaf node page with file header):
+  fe fe 0c 01              "magic" identifying the file as a binlog GTID index
+  01 00                    Major version 1, minor version 0
+  00 00                    Padding / currently unused
+  40 00 00 00              Page size (64 bytes in this example)
+  05                       Flag PAGE_FLAG_IS_LEAF | PAGE_FLAG_LAST (single-page leaf node)
+  00 00 00                 Padding / current unused
+Key 1:
+  01 00 00 00              <GTID_count + 1> = 1 (entry has zero GTIDs in it)
+  1d 01 00 00              Binlog file offset = 0x11d
+                           [Empty GTID state at the very start of the binlog]
+Key 2:
+  02 00 00 00              GTID_count = 1
+  0e 02 00 00              Binlog file offset = 0x20e
+  00 00 00 00 01 00 00 00
+  01 00 00 00 00 00 00 00  GTID 0-1-1
+
+  00 00 00 00 00           Zero denotes end-of-node
+  00 00 00 00 00 00 00     (Unused space in the page)
+  0e 4f ac 43              Checksum / CRC
+
+Page 2 (leaf node):
+  05                       Flag PAGE_FLAG_IS_LEAF | PAGE_FLAG_LAST (single-page leaf node)
+  00 00 00                 Unused
+Key 1:
+  02 00 00 00              GTID_count = 1
+  ad 02 00 00              Binlog file offset = 0x2ad
+  00 00 00 00 01 00 00 00
+  02 00 00 00 00 00 00 00  GTID 0-1-2
+
+  00 00 00 00              End-of-node
+  00 00 00 00 00 00 00 00
+  00 00 00 00 00 00 00 00
+  00 00 00 00 00 00 00 00
+  00 00 00 00              (Unused space in the page)
+  0c 4e c2 b9              CRC
+
+Page 3 (root node):
+  0c                       PAGE_FLAG_ROOT | PAGE_FLAG_LAST (interior root node)
+  00 00 00                 Unused
+Child pointer:
+  01 00 00 00              Pointer to page 1
+Key for next child page:
+  02 00 00 00              GTID_count = 1
+  ad 02 00 00              Binlog offset = 0x2ad
+  00 00 00 00 01 00 00 00
+  02 00 00 00 00 00 00 00  GTID 0-1-2
+Child pointer:
+  02 00 00 00              Pointer to page 2
+
+  00 00 0 000              Zero denotes end-of-node
+  00 00 00 00 00 00 00 00
+  00 00 00 00 00 00 00 00
+  00 00 00 00              (Unused)
+  8155 a3c7                CRC
+
+  Below is an example of the logical B-Tree structure of a larger GTID index
+  with a total of 12 keys.
+
+  We use S0, S1, ..., S11 to denote a key, which consists of a GTID state (as
+  seen in @@binlog_gtid_state and GTID_LIST_EVENT) and the associated binlog
+  file offset. D1, D2, ..., D11 denote the same keys, but delta-compressed, so
+  that D1 stores only those GTIDs that are not the same as in S0.
+
+  Pages are denoted by P1, P2, ..., P8. In the example, P1, P2, P3, P5, and P6
+  are leaf pages, the rest are interior node pages. P8 is the root node (the
+  root is always the last page in the index).
+
+  The contents of each page is listed in square brackets [...]. So P1[S0 D1 D2]
+  is a leaf page with 3 keys, and P7[P5 <D10+D11> P6] is an interior node page
+  with one key <D10+D11> and two child-page pointers to P5 and P6. The
+  notation <D10+D11> denotes the delta-compression of key S11 relative to S9;
+  all GTIDs in S11 that are not present in S9. In the code, this is computed
+  by combining D10 and D11, hence the use of the notation "D10+D11" instead of
+  the equivalent "S11-S9".
+
+  Here is the example B-Tree. It has 3 levels, with the leaf nodes at the top:
+
+    P1[S0 D1 D2]   P2[D3 D4 D5]   P3[D6 D7 D8]   P5[D9 D10]   P6[D11]
+          P4[P1 <S3> P2 <D4+D5+D6> P3]            P7[P5 <D10+D11> P6]
+                                     P8[P4 <S9> P7]
+
+  To find eg. S4, we start from the root P8. S4<S9, so we follow the left child
+  pointer to P4. S4>S3 and S4<S6 (S6=(S3+D4+D5+D6)), so we follow the child
+  pointer to leaf page P2.
+
+  The index is written completely append-only; this is possible since keys are
+  always inserted in-order, at the end of the index. One page is kept in-memory
+  at each level of the B-Tree; when a new key no longer fits the page, it is
+  written out to disk and a new in-memory page is allocated for it.
+
+  Here are the operations that occur while writing the index file from the
+  above example. The left column is each key added to the index as the
+  corresponding GTID is written into the binlog file (<EOF> is when the index
+  is closed at binlog rotation). The right column are the operations performed,
+  as follows:
+    alloc(p)      Allocate page p
+    add_key(p,k)  Insert the key k into the page p
+    add_ptr(p,q)  Insert a pointer to child page q in parent page p
+    write(p)      Write out page p to disk at the end of the index file.
+
+  GTID STATE   OPERATIONS
+    S0       alloc(P1) add_key(P1,S0)
+    D1         add_key(P1,D1)
+    D2         add_key(P1,D2)
+    D3       write(P1) alloc(P4) add_ptr(P4,P1)
+             alloc(P2) add_key(P2,D3) add_key(P4,S3)
+    D4         add_key(P2,D4)
+    D5         add_key(P2,D5)
+    D6       write(P2) add_ptr(P4,P2)
+             alloc(P3) add_key(P3,D6) add_key(P4,D4+D5+D6)
+    D7         add_key(P3,D7)
+    D8         add_key(P3,D8)
+    D9       write(P3) add_ptr(P4,P3)
+             alloc(P5) add_key(P5,D9)
+             write(P4) alloc(P8) add_ptr(P8,P4) alloc(P7) add_key(P8,S9)
+    D10        add_key(P5,D10)
+    D11      write(P5) add_ptr(P7,P5)
+             alloc(P6) add_key(P6,D11) add_key(P7,D10+D11)
+    <EOF>    write(P6) add_ptr(P7,P6)
+             write(P7) add_ptr(P8,P7)
+             write(P8)
+
+  After adding each record to the index, there is exactly one partial page
+  allocated in-memory for each level present in the B-Tree; new pages being
+  allocated as old pages fill up and are written to disk.
+*/
+
+
+class Gtid_index_base
+{
+public:
+  /* +4 for ".idx" prefix. */
+  static constexpr size_t GTID_INDEX_FILENAME_MAX_SIZE= FN_REFLEN+4;
+
+protected:
+  enum enum_page_flags {
+    /* Set for a leaf node page, cleared for an interior node page. */
+    PAGE_FLAG_IS_LEAF= 1,
+    /* This is a continuation page. */
+    PAGE_FLAG_IS_CONT= 2,
+    /* No continuation page follows (the last page in a group). */
+    PAGE_FLAG_LAST= 4,
+    /*
+      Flag set to mark the root node. (The root node is normally the last page
+      in the index file, but having an explicit flag allows us to detect a
+      partially written index file with the root node missing.
+    */
+    PAGE_FLAG_ROOT= 8,
+  };
+
+  /*
+    Minor version increment represents a backwards-compatible format (can be
+    read by any server version that knows the format of the major version).
+    Major version increment means a server should not attempt to read from the
+    index.
+  */
+  static constexpr uchar GTID_INDEX_VERSION_MAJOR= 1;
+  static constexpr uchar GTID_INDEX_VERSION_MINOR= 0;
+  static constexpr size_t GTID_INDEX_FILE_HEADER_SIZE= 12;
+  static constexpr size_t GTID_INDEX_PAGE_HEADER_SIZE= 4;
+  static constexpr size_t CHECKSUM_LEN= 4;
+
+#ifdef _MSC_VER
+/*
+  Flexible array member is part of C99, but it is not standard in C++.
+  All the compilers and platforms we support do support it, though.
+  Just we need to disable on Windows a warning about using a non-standard
+  C++ extension.
+*/
+#pragma warning(disable : 4200)
+#endif
+  struct Node_page
+  {
+    Node_page *next;
+    /* Pointer to allow to update the "flags" byte at page writeout. */
+    uchar *flag_ptr;
+    /* Flexible array member; will be allocated to opt_gtid_index_page_size. */
+    uchar page[];
+  };
+
+  struct Index_node_base
+  {
+    Node_page *first_page;
+    Node_page *current_page;
+    /* The current_ptr is only valid if current_page != 0. */
+    uchar *current_ptr;
+
+    Index_node_base();
+    ~Index_node_base();
+    void free_pages();
+    void reset();
+  };
+
+public:
+  static void make_gtid_index_file_name(char *out_name, size_t bufsize,
+                                        const char *base_filename);
+
+protected:
+  int update_gtid_state(rpl_binlog_state_base *state,
+                        const rpl_gtid *gtid_list, uint32 gtid_count);
+  Node_page *alloc_page();
+  rpl_gtid *gtid_list_buffer(uint32 count);
+  void build_index_filename(const char *filename);
+  virtual int give_error(const char *msg) = 0;
+
+  /*
+    A buffer to hold a gtid_list temporarily.
+    Increased as needed to hold largest needed list.
+  */
+  rpl_gtid *gtid_buffer;
+  uint32 gtid_buffer_alloc;
+  size_t page_size;
+public:
+  char index_file_name[GTID_INDEX_FILENAME_MAX_SIZE];
+
+protected:
+  Gtid_index_base();
+  virtual ~Gtid_index_base();
+};
+
+
+class Gtid_index_writer : public Gtid_index_base
+{
+private:
+  const my_off_t offset_min_threshold;
+
+  struct Index_node : public Index_node_base
+  {
+    rpl_binlog_state_base state;
+    uint32 num_records;
+    uint32 level;
+    bool force_spill_page;
+
+    Index_node(uint32 level_);
+    ~Index_node();
+    void reset();
+  };
+
+public:
+  static void gtid_index_init();
+  static void gtid_index_cleanup();
+protected:
+  friend class Gtid_index_reader_hot;
+  static void lock_gtid_index() { mysql_mutex_lock(&gtid_index_mutex); }
+  static void unlock_gtid_index() { mysql_mutex_unlock(&gtid_index_mutex); }
+  static const Gtid_index_writer *find_hot_index(const char *file_name);
+
+public:
+  Gtid_index_writer(const char *filename, uint32 offset,
+                    rpl_binlog_state_base *binlog_state,
+                    uint32 opt_page_size, my_off_t opt_span_min);
+  virtual ~Gtid_index_writer();
+  void process_gtid(uint32 offset, const rpl_gtid *gtid);
+  int process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid,
+                               rpl_gtid **out_gtid_list,
+                               uint32 *out_gtid_count);
+  int async_update(uint32 event_offset, rpl_gtid *gtid_list, uint32 gtid_count);
+  void close();
+
+private:
+  void insert_in_hot_index();
+  void remove_from_hot_index();
+  uint32 write_current_node(uint32 level, bool is_root);
+  int reserve_space(Index_node *n, size_t bytes);
+  int do_write_record(uint32 level, uint32 event_offset,
+                      const rpl_gtid *gtid_list, uint32 gtid_count);
+  int add_child_ptr(uint32 level, my_off_t node_offset);
+  int write_record(uint32 event_offset, const rpl_gtid *gtid_list,
+                   uint32 gtid_count);
+  bool check_room(uint32 level, uint32 gtid_count);
+  int alloc_level_if_missing(uint32 level);
+  uchar *init_header(Node_page *page, bool is_leaf, bool is_first);
+  int give_error(const char *msg) override;
+
+  static mysql_mutex_t gtid_index_mutex;
+  static Gtid_index_writer *hot_index_list;
+
+  rpl_binlog_state_base pending_state;
+  /* Next pointer for the hot_index_list linked list. */
+  Gtid_index_writer *next_hot_index;
+  /* The currently being built index nodes, from leaf[0] to root[max_level]. */
+  Index_node **nodes;
+  my_off_t previous_offset;
+  uint32 max_level;
+
+  File index_file;
+
+  /*
+    This is set if we encounter an error (such as out-of-memory or I/O error).
+    Then we will no longer do any updates to the index, to prevent leaving a
+    corrupt index. This is not fatal; the partial index will work up to where
+    it got the error, and the code can fall-back to sequential scan of the
+    binlog.
+  */
+  bool error_state;
+  /* Flag to help put the file header at the start of the very first page. */
+  bool file_header_written;
+  /* Flag set while this object is visible in the "hot index" list. */
+  bool in_hot_index_list;
+};
+
+
+class Gtid_index_reader : public Gtid_index_base
+{
+public:
+  Gtid_index_reader();
+  virtual ~Gtid_index_reader();
+
+  int open_index_file(const char *binlog_filename);
+  void close_index_file();
+  /*
+    The search functions take either a binlog offset or GTID position to search
+    for. They return:
+      0   for "not found" (searched position is earlier than start of index).
+      1   for "found"
+     -1   for error.
+    When found, the returned position is the last position in the index that
+    lies at or before the searched position. The offset of the returned
+    position is written to *out_offset. The number of GTIDs in the returned
+    GTID state is written to *out_gtid_count; the list of found GTIDs can be
+    accessed with search_gtid_list() and is valid only until next search or
+    freeing of the Gtid_index_reader object.
+  */
+  int search_offset(uint32 in_offset, uint32 *out_offset,
+                    uint32 *out_gtid_count);
+  int search_gtid_pos(slave_connection_state *in_gtid_pos, uint32 *out_offset,
+                      uint32 *out_gtid_count);
+  rpl_gtid *search_gtid_list();
+
+protected:
+  int search_cmp_offset(uint32 offset, rpl_binlog_state_base *state);
+  int search_cmp_gtid_pos(uint32 offset, rpl_binlog_state_base *state);
+  virtual int do_index_search(uint32 *out_offset, uint32 *out_gtid_count);
+  int do_index_search_root(uint32 *out_offset, uint32 *out_gtid_count);
+  int do_index_search_leaf(bool current_state_updated,
+                           uint32 *out_offset, uint32 *out_gtid_count);
+  int next_page();
+  int find_bytes(uint32 num_bytes);
+  virtual int get_child_ptr(uint32 *out_child_ptr);
+  int get_offset_count(uint32 *out_offset, uint32 *out_gtid_count);
+  int get_gtid_list(rpl_gtid *out_gtid_list, uint32 count);
+  virtual int read_file_header();
+  int verify_checksum(Node_page *page);
+  Node_page *alloc_and_read_page();
+  virtual int read_root_node();
+  virtual int read_node(uint32 page_ptr);
+  int read_node_cold(uint32 page_ptr);
+  int give_error(const char *msg) override;
+
+  rpl_binlog_state_base current_state;
+  rpl_binlog_state_base compare_state;
+  Index_node_base cold_node;
+  /* n points to either cold node or hot node in writer. */
+  Index_node_base *n;
+  int (Gtid_index_reader::* search_cmp_function)(uint32, rpl_binlog_state_base *);
+  slave_connection_state *in_search_gtid_pos;
+  Node_page *read_page;
+  uchar *read_ptr;
+  File index_file;
+  uint32 current_offset;
+  uint32 in_search_offset;
+  bool file_open;
+  bool index_valid;
+  bool has_root_node;
+  uchar version_major;
+  uchar version_minor;
+};
+
+
+/*
+   Sub-class of Gtid_index_reader that can additionally access in-memory "hot"
+   pages of the index, which are partially filled pages of the current binlog
+   file, not yet written to disk.
+*/
+class Gtid_index_reader_hot : public Gtid_index_reader
+{
+public:
+  Gtid_index_reader_hot();
+  virtual ~Gtid_index_reader_hot() { }
+
+private:
+  int do_index_search(uint32 *out_offset, uint32 *out_gtid_count) override;
+  int get_child_ptr(uint32 *out_child_ptr) override;
+  int read_file_header() override;
+  int read_root_node() override;
+  int read_node(uint32 page_ptr) override;
+  int read_node_hot();
+
+  /* Pointer to the writer object, if we're reading a hot index. */
+  const Gtid_index_writer *hot_writer;
+  /* The level we are currently reading in the hot writer .*/
+  uint32 hot_level;
+};
+
+#endif  /* GTID_INDEX_H */