path: root/storage/maria/ha_s3.cc
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:00:34 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:00:34 +0000
commit     3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
tree       e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/maria/ha_s3.cc
parent     Initial commit. (diff)
Adding upstream version 1:10.11.6.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/maria/ha_s3.cc')
-rw-r--r--  storage/maria/ha_s3.cc  1125
1 file changed, 1125 insertions, 0 deletions
diff --git a/storage/maria/ha_s3.cc b/storage/maria/ha_s3.cc
new file mode 100644
index 00000000..8c105522
--- /dev/null
+++ b/storage/maria/ha_s3.cc
@@ -0,0 +1,1125 @@
+/* Copyright (C) 2019, 2021 MariaDB Corporation Ab
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the
+ Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA
+*/
+
+/*
+ Implementation of S3 storage engine.
+
+ Storage format:
+
+ The S3 engine is a read-only storage engine. The data is stored in the
+ same format as a non-transactional Aria table in BLOCK_RECORD format.
+ This makes it easy to cache both index and rows in the page cache.
+ Data and index file are split into blocks of 's3_block_size', default
+ 4M.
+
+ The table and its associated files are stored in S3 in the following
+ locations:
+
+ frm file (for discovery):
+ aws_bucket/database/table/frm
+
+ First index block (contains the description of the Aria file):
+ aws_bucket/database/table/aria
+
+ Rest of the index file:
+ aws_bucket/database/table/index/block_number
+
+ Data file:
+ aws_bucket/database/table/data/block_number
+
+ block_number is a 6-digit decimal number, prefixed with 0
+ (it can be longer than 6 digits; the prefix is just for nice output)
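+
+ Example (illustrative only): a table 't1' in database 'test' stored in
+ bucket 'mybucket' would use object keys such as:
+ mybucket/test/t1/frm
+ mybucket/test/t1/aria
+ mybucket/test/t1/index/000001
+ mybucket/test/t1/data/000001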
+
+ The frm and base blocks are small (just the needed data).
+ Index and data blocks are of size 's3_block_size'.
+
+ If compression is used, the original block size is s3_block_size,
+ but the stored block will be the size of the compressed block.
+
+ Implementation:
+ The s3 engine inherits from the ha_maria handler.
+
+ It uses Aria code and relies on Aria being enabled. We don't have to check
+ that Aria is enabled though, because Aria is a mandatory plugin, and
+ the server will refuse to start if Aria failed to initialize.
+
+ s3 will use its own page cache so as not to interfere with normal Aria
+ usage, but also to ensure that the S3 page cache is large enough
+ (with a 4M s3_block_size the engine will need a large cache to work,
+ at least s3_block_size * 32). The default cache is 128M.
+*/
+
+#define MYSQL_SERVER 1
+#include <my_global.h>
+#include <m_string.h>
+#include "maria_def.h"
+#include "sql_class.h"
+#include <mysys_err.h>
+#include <libmarias3/marias3.h>
+#include <discover.h>
+#include "ha_s3.h"
+#include "s3_func.h"
+#include "aria_backup.h"
+
+#define DEFAULT_AWS_HOST_NAME "s3.amazonaws.com"
+
+static PAGECACHE s3_pagecache;
+static ulong s3_block_size, s3_protocol_version;
+static ulong s3_pagecache_division_limit, s3_pagecache_age_threshold;
+static ulong s3_pagecache_file_hash_size;
+static ulonglong s3_pagecache_buffer_size;
+static char *s3_bucket, *s3_access_key=0, *s3_secret_key=0, *s3_region;
+static char *s3_host_name;
+static int s3_port;
+static my_bool s3_use_http;
+static char *s3_tmp_access_key=0, *s3_tmp_secret_key=0;
+static my_bool s3_debug= 0, s3_slave_ignore_updates= 0;
+static my_bool s3_replicate_alter_as_create_select= 0;
+handlerton *s3_hton= 0;
+
+/* Don't show access or secret keys to users if they exist */
+
+static void update_access_key(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ my_free(s3_access_key);
+ s3_access_key= 0;
+ /* Don't show real key to user in SHOW VARIABLES */
+ if (s3_tmp_access_key[0])
+ {
+ s3_access_key= s3_tmp_access_key;
+ s3_tmp_access_key= my_strdup(PSI_NOT_INSTRUMENTED, "*****", MYF(MY_WME));
+ }
+}
+
+static void update_secret_key(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ my_free(s3_secret_key);
+ s3_secret_key= 0;
+ /* Don't show real key to user in SHOW VARIABLES */
+ if (s3_tmp_secret_key[0])
+ {
+ s3_secret_key= s3_tmp_secret_key;
+ s3_tmp_secret_key= my_strdup(PSI_NOT_INSTRUMENTED, "*****", MYF(MY_WME));
+ }
+}
+
+/* Define system variables for S3 */
+
+static MYSQL_SYSVAR_ULONG(block_size, s3_block_size,
+ PLUGIN_VAR_RQCMDARG,
+ "Block size for S3", 0, 0,
+ 4*1024*1024, 65536, 16*1024*1024, 8192);
+
+static MYSQL_SYSVAR_BOOL(debug, s3_debug,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Generates trace output from libmarias3 on stderr for debugging",
+ 0, 0, 0);
+
+static MYSQL_SYSVAR_BOOL(slave_ignore_updates, s3_slave_ignore_updates,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Set if the slave shares the same S3 storage as the master",
+ 0, 0, 0);
+
+static MYSQL_SYSVAR_BOOL(replicate_alter_as_create_select,
+ s3_replicate_alter_as_create_select,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "When converting an S3 table to a local table, log all rows in the binary log",
+ 0, 0, 1);
+
+static MYSQL_SYSVAR_ENUM(protocol_version, s3_protocol_version,
+ PLUGIN_VAR_RQCMDARG,
+ "Protocol used to communicate with S3. One of "
+ "\"Auto\", \"Amazon\" or \"Original\".",
+ NULL, NULL, 0, &s3_protocol_typelib);
+
+static MYSQL_SYSVAR_ULONG(pagecache_age_threshold,
+ s3_pagecache_age_threshold, PLUGIN_VAR_RQCMDARG,
+ "This characterizes the number of hits a hot block has to be untouched "
+ "until it is considered aged enough to be downgraded to a warm block. "
+ "This specifies the percentage ratio of that number of hits to the "
+ "total number of blocks in the page cache.", 0, 0,
+ 300, 100, ~ (ulong) 0L, 100);
+
+static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, s3_pagecache_buffer_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The size of the buffer used for index blocks for S3 tables. "
+ "Increase this to get better index handling (for all reads and "
+ "multiple writes) to as much as you can afford.", 0, 0,
+ 128*1024*1024, 1024*1024*32, ~(ulonglong) 0, 8192);
+
+static MYSQL_SYSVAR_ULONG(pagecache_division_limit,
+ s3_pagecache_division_limit,
+ PLUGIN_VAR_RQCMDARG,
+ "The minimum percentage of warm blocks in key cache", 0, 0,
+ 100, 1, 100, 1);
+
+static MYSQL_SYSVAR_ULONG(pagecache_file_hash_size,
+ s3_pagecache_file_hash_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of hash buckets for open files. If you have a lot "
+ "of S3 files open you should increase this for faster flush of "
+ "changes. A good value is probably 1/10 of number of possible open "
+ "S3 files.", 0,0, 512, 32, 16384, 1);
+
+static MYSQL_SYSVAR_STR(bucket, s3_bucket,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "AWS bucket",
+ 0, 0, "MariaDB");
+static MYSQL_SYSVAR_STR(host_name, s3_host_name,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "AWS host name",
+ 0, 0, DEFAULT_AWS_HOST_NAME);
+static MYSQL_SYSVAR_INT(port, s3_port,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Port number to connect to (0 means use default)",
+ NULL /*check*/, NULL /*update*/, 0 /*default*/,
+ 0 /*min*/, 65535 /*max*/, 1 /*blk*/);
+static MYSQL_SYSVAR_BOOL(use_http, s3_use_http,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "If true, force use of HTTP protocol",
+ NULL /*check*/, NULL /*update*/, 0 /*default*/);
+static MYSQL_SYSVAR_STR(access_key, s3_tmp_access_key,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_MEMALLOC,
+ "AWS access key",
+ 0, update_access_key, "");
+static MYSQL_SYSVAR_STR(secret_key, s3_tmp_secret_key,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_MEMALLOC,
+ "AWS secret key",
+ 0, update_secret_key, "");
+static MYSQL_SYSVAR_STR(region, s3_region,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "AWS region",
+ 0, 0, "");
+
+ha_create_table_option s3_table_option_list[]=
+{
+ /*
+ Table options:
+ - s3_block_size: block size used for this table; defaults to the value
+ of the s3_block_size system variable.
+ - compression_algorithm: one of "none" or "zlib" (default "none").
+ */
+ HA_TOPTION_SYSVAR("s3_block_size", s3_block_size, block_size),
+ HA_TOPTION_ENUM("compression_algorithm", compression_algorithm, "none,zlib",
+ 0),
+ HA_TOPTION_END
+};
+
+
+/*****************************************************************************
+ S3 handler code
+******************************************************************************/
+
+/**
+ Create S3 handler
+*/
+
+
+ha_s3::ha_s3(handlerton *hton, TABLE_SHARE *table_arg)
+ :ha_maria(hton, table_arg), in_alter_table(S3_NO_ALTER), open_args(NULL)
+{
+ /* Remove things that S3 doesn't support */
+ int_table_flags&= ~(HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
+ HA_CAN_EXPORT);
+ can_enable_indexes= 0;
+}
+
+
+/**
+ Remember the handler to use for s3_block_read()
+
+ @note
+ In the future the ms3_st objects could be stored in
+ a list in the share. In that case we would however need a mutex
+ to access the next free one. By using st_my_thread_var we
+ can avoid the mutex, at the small cost of having to call
+ register_handler() in all handler functions that will access
+ the page cache.
+*/
+
+void ha_s3::register_handler(MARIA_HA *file)
+{
+ struct st_my_thread_var *thread= my_thread_var;
+ thread->keycache_file= (void*) file;
+}
+
+
+/**
+ Write a row
+
+ When generating the table as part of ALTER TABLE, writes are allowed.
+ Once the table has been moved to S3, writes are not allowed.
+*/
+
+int ha_s3::write_row(const uchar *buf)
+{
+ DBUG_ENTER("ha_s3::write_row");
+ if (in_alter_table)
+ DBUG_RETURN(ha_maria::write_row(buf));
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+}
+
+/* Return true if S3 can be used */
+
+static my_bool s3_usable()
+{
+ return (s3_access_key != 0 && s3_secret_key != 0 && s3_region != 0 &&
+ s3_bucket != 0);
+}
+
+
+static my_bool s3_info_init(S3_INFO *info)
+{
+ if (!s3_usable())
+ return 1;
+ info->protocol_version= (uint8_t) s3_protocol_version;
+ lex_string_set(&info->host_name, s3_host_name);
+ info->port= s3_port;
+ info->use_http= s3_use_http;
+ lex_string_set(&info->access_key, s3_access_key);
+ lex_string_set(&info->secret_key, s3_secret_key);
+ lex_string_set(&info->region, s3_region);
+ lex_string_set(&info->bucket, s3_bucket);
+ return 0;
+}
+
+
+/**
+ Fill information in S3_INFO including paths to table and database
+
+ Notes:
+ Database and table name are set even if s3 variables are not
+ initialized. This is needed by ha_s3::delete_table().
+*/
+
+static my_bool s3_info_init(S3_INFO *s3_info, const char *path,
+ char *database_buff, size_t database_length)
+{
+ set_database_and_table_from_path(s3_info, path);
+ /* Fix database as it's not \0 terminated */
+ strmake(database_buff, s3_info->database.str,
+ MY_MIN(database_length, s3_info->database.length));
+ s3_info->database.str= database_buff;
+ s3_info->base_table= s3_info->table;
+ return s3_info_init(s3_info);
+}
+
+/*
+ Check if table is a temporary table
+
+ Returns 1 if table is a temporary table that should be stored in Aria
+ (to later be copied to S3 with a name change)
+*/
+
+static int is_mariadb_internal_tmp_table(const char *table_name)
+{
+ int length;
+ const int p_length= sizeof(tmp_file_prefix); // prefix + '-'
+ /* Temporary table from ALTER TABLE */
+ if (!strncmp(table_name, tmp_file_prefix "-" , p_length))
+ {
+ /*
+ Internal temporary tables used by ALTER TABLE and ALTER PARTITION
+ should be stored in S3
+ */
+ if (!strncmp(table_name+p_length, "backup-", sizeof("backup-")-1) ||
+ !strncmp(table_name+p_length, "exchange-", sizeof("exchange-")-1) ||
+ !strncmp(table_name+p_length, "temptable-", sizeof("temptable-")-1))
+ return 0;
+ /* Other temporary tables should be stored in Aria on local disk */
+ return 1;
+ }
+ length= strlen(table_name);
+ if (length > 5 && !strncmp(table_name + length - 5, "#TMP#", 5))
+ return 1;
+ return 0;
+}
+
+
+/**
+ Drop S3 table
+*/
+
+int ha_s3::delete_table(const char *name)
+{
+ ms3_st *s3_client;
+ S3_INFO s3_info;
+ int error;
+ char database[NAME_LEN+1];
+ DBUG_ENTER("ha_s3::delete_table");
+
+ error= s3_info_init(&s3_info, name, database, sizeof(database)-1);
+
+ /* If internal on disk temporary table, let Aria take care of it */
+ if (is_mariadb_internal_tmp_table(s3_info.table.str))
+ DBUG_RETURN(ha_maria::delete_table(name));
+
+ if (error)
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+ if (!(s3_client= s3_open_connection(&s3_info)))
+ DBUG_RETURN(HA_ERR_NO_CONNECTION);
+ error= aria_delete_from_s3(s3_client, s3_info.bucket.str,
+ s3_info.database.str,
+ s3_info.table.str,0);
+ s3_deinit(s3_client);
+ DBUG_RETURN(error);
+}
+
+/*
+ The table is a temporary table as part of ALTER TABLE.
+
+ Copy the on disk 'temporary' Aria table to S3 and delete the Aria table
+*/
+
+static int move_table_to_s3(ms3_st *s3_client,
+ S3_INFO *to_s3_info,
+ const char *local_name,
+ bool is_partition)
+{
+ int error;
+ DBUG_ASSERT(!is_mariadb_internal_tmp_table(to_s3_info->table.str));
+
+ if (!(error= aria_copy_to_s3(s3_client, to_s3_info->bucket.str, local_name,
+ to_s3_info->database.str,
+ to_s3_info->table.str,
+ 0, 0, 1, 0, !is_partition)))
+ {
+ /* Table is now in S3. Remove the original table files, but keep the .frm */
+ error= maria_delete_table_files(local_name, 1, 0);
+ }
+ return error;
+}
+
+
+/**
+ Copy an Aria table to S3 or rename a table in S3
+
+ The copy happens as part of the rename in ALTER TABLE when all data
+ is in an Aria table and we now have to copy it to S3.
+
+ If the table is an old table already in S3, we should just rename it.
+*/
+
+int ha_s3::rename_table(const char *from, const char *to)
+{
+ S3_INFO to_s3_info;
+ char to_name[NAME_LEN+1], frm_name[FN_REFLEN];
+ ms3_st *s3_client;
+ MY_STAT stat_info;
+ int error;
+ bool is_partition= (strstr(from, "#P#") != NULL) ||
+ (strstr(to, "#P#") != NULL);
+ DBUG_ENTER("ha_s3::rename_table");
+
+ if (s3_info_init(&to_s3_info, to, to_name, sizeof(to_name)-1))
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+ if (!(s3_client= s3_open_connection(&to_s3_info)))
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+
+ /*
+ Check if this is an on-disk table created by ALTER TABLE that should be
+ copied to S3. We know this is the case if the table is a temporary table
+ and the .frm file for the table is on disk.
+ */
+ fn_format(frm_name, from, "", reg_ext, MYF(0));
+ if (is_mariadb_internal_tmp_table(from + dirname_length(from)) &&
+ (is_partition || my_stat(frm_name, &stat_info, MYF(0))))
+ {
+ error= move_table_to_s3(s3_client, &to_s3_info, from, is_partition);
+ }
+ else
+ {
+ char from_name[NAME_LEN+1];
+ S3_INFO from_s3_info;
+ /* The table is an existing S3 table. Do the rename in S3 */
+ s3_info_init(&from_s3_info, from, from_name, sizeof(from_name)-1);
+
+ if (is_mariadb_internal_tmp_table(to + dirname_length(to)))
+ {
+ /*
+ The table is being renamed to a temporary table. This only happens
+ in the case of an ALTER PARTITION failure, and a delete of the
+ temporary table will soon be issued. The only thing we can do
+ is to remove the 'from' table. We will get extra errors for the
+ upcoming delete, but we ignore this minor problem for now as this
+ is an unlikely event and the extra warnings are just annoying,
+ not critical.
+ */
+ error= aria_delete_from_s3(s3_client, from_s3_info.bucket.str,
+ from_s3_info.database.str,
+ from_s3_info.table.str,0);
+ }
+ else
+ error= aria_rename_s3(s3_client, to_s3_info.bucket.str,
+ from_s3_info.database.str,
+ from_s3_info.table.str,
+ to_s3_info.database.str,
+ to_s3_info.table.str,
+ !is_partition &&
+ !current_thd->lex->alter_info.partition_flags);
+ }
+ s3_deinit(s3_client);
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Create a s3 table.
+
+ @notes
+ One can only create an S3 table as part of ALTER TABLE.
+ The table is created as a non-transactional Aria table with
+ BLOCK_RECORD format.
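+
+ Typically reached by e.g. "ALTER TABLE t1 ENGINE=S3" on an existing local
+ table (see the plugin description at the end of this file).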
+*/
+
+int ha_s3::create(const char *name, TABLE *table_arg,
+ HA_CREATE_INFO *ha_create_info)
+{
+ uchar *frm_ptr;
+ size_t frm_len;
+ int error;
+ TABLE_SHARE *share= table_arg->s;
+ DBUG_ENTER("ha_s3::create");
+
+ if (!(ha_create_info->options & HA_CREATE_TMP_ALTER) ||
+ ha_create_info->tmp_table())
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+
+ if (share->table_type == TABLE_TYPE_SEQUENCE)
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+ /* When using partitions, S3 only supports adding and removing partitions */
+ if ((table_arg->in_use->lex->alter_info.partition_flags &
+ ~(ALTER_PARTITION_REMOVE | ALTER_PARTITION_ADD | ALTER_PARTITION_INFO)))
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+ if (!s3_usable())
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+ /* Force the table to a format suitable for S3 */
+ ha_create_info->row_type= ROW_TYPE_PAGE;
+ ha_create_info->transactional= HA_CHOICE_NO;
+ error= ha_maria::create(name, table_arg, ha_create_info);
+ if (error)
+ DBUG_RETURN(error);
+
+#ifdef MOVE_FILES_TO_S3_ON_CREATE
+ /*
+ If we are in ADD PARTITION and we created a new table (not a
+ temporary table, which will be moved as part of the final rename),
+ we should move it to S3 right away. The other option would be to move
+ it as part of close(). We prefer to do it here as there is no error
+ checking with close(), which would leave incomplete tables around in
+ case of failures. The downside is that we can't move rows around as
+ part of changing partitions, but that is not a big problem with S3
+ as it's read-only anyway.
+ */
+ if (!is_mariadb_internal_tmp_table(name + dirname_length(name)) &&
+ strstr(name, "#P#"))
+ {
+ S3_INFO to_s3_info;
+ char database[NAME_LEN+1];
+ ms3_st *s3_client;
+
+ if (s3_info_init(&to_s3_info, name, database, sizeof(database)-1))
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+ if (!(s3_client= s3_open_connection(&to_s3_info)))
+ DBUG_RETURN(HA_ERR_NO_CONNECTION);
+
+ /* Note that if error is set, then the empty temp table was not removed */
+ error= move_table_to_s3(s3_client, &to_s3_info, name, 1);
+ s3_deinit(s3_client);
+ if (error)
+ maria_delete_table_files(name, 1, 0);
+ else
+#endif /* MOVE_FILES_TO_S3_ON_CREATE */
+ {
+ /* Create the .frm file. Needed for ha_s3::rename_table() later */
+ if (!table_arg->s->read_frm_image((const uchar**) &frm_ptr, &frm_len))
+ {
+ table_arg->s->write_frm_image(frm_ptr, frm_len);
+ table_arg->s->free_frm_image(frm_ptr);
+ }
+ }
+ DBUG_RETURN(error);
+}
+
+/**
+ Open table
+
+ @notes
+ The table is read-only, except when opened by ALTER TABLE, in which case
+ we are creating the S3 table.
+*/
+
+int ha_s3::open(const char *name, int mode, uint open_flags)
+{
+ bool internal_tmp_table= 0;
+ int res;
+ S3_INFO s3_info;
+ DBUG_ENTER("ha_s3::open");
+
+ if (!s3_usable())
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+
+ /*
+ On slaves with s3_slave_ignore_updates set we allow tables to be
+ opened in write mode to be able to ignore queries that modify
+ the table through handler::check_if_updates_are_ignored().
+
+ This is needed for the slave to be able to handle
+ CREATE TABLE t1...
+ INSERT INTO t1 ....
+ ALTER TABLE t1 ENGINE=S3
+ If this is not done, the insert will fail on the slave if the
+ master has already executed the ALTER TABLE.
+
+ We also have to allow open for create, as part of
+ ALTER TABLE ... ENGINE=S3.
+
+ Otherwise we only allow the table to be opened in read mode.
+ */
+ if (mode != O_RDONLY && !(open_flags & HA_OPEN_FOR_CREATE) &&
+ !s3_slave_ignore_updates)
+ DBUG_RETURN(EACCES);
+
+ open_args= 0;
+ internal_tmp_table= is_mariadb_internal_tmp_table(name +
+ dirname_length(name));
+
+ if (!(open_flags & HA_OPEN_FOR_CREATE) && !internal_tmp_table)
+ {
+ (void) s3_info_init(&s3_info);
+ s3_info.tabledef_version= table->s->tabledef_version;
+ s3_info.base_table= table->s->table_name;
+
+ /* Pass the above arguments to maria_open() */
+ open_args= &s3_info;
+ in_alter_table= S3_NO_ALTER;
+ }
+ else
+ {
+ /*
+ Table was created as an Aria table that will be moved to S3 either
+ by rename_table() or external_lock()
+ */
+ bool is_partition= (strstr(name, "#P#") != NULL);
+ in_alter_table= (!is_partition ? S3_ALTER_TABLE :
+ internal_tmp_table ? S3_ADD_TMP_PARTITION :
+ S3_ADD_PARTITION);
+ }
+ DBUG_PRINT("info", ("in_alter_table: %d", in_alter_table));
+
+ if (!(res= ha_maria::open(name, mode, open_flags)))
+ {
+ if (open_args)
+ {
+ /*
+ Table is in S3. We have to modify the pagecache callbacks for the
+ data file, index file and for bitmap handling.
+ */
+ file->s->pagecache= &s3_pagecache;
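+ /*
+ big_block_size makes the page cache read whole S3 blocks
+ (s3_block_size bytes) at a time, through the big_block_read callback
+ (s3_block_read) installed in ha_s3_init().
+ */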
+ file->dfile.big_block_size= file->s->kfile.big_block_size=
+ file->s->bitmap.file.big_block_size= file->s->base.s3_block_size;
+ file->s->kfile.head_blocks= file->s->base.keystart / file->s->block_size;
+ file->s->no_status_updates= in_alter_table == S3_NO_ALTER;
+ }
+ }
+ open_args= 0;
+ DBUG_RETURN(res);
+}
+
+
+int ha_s3::external_lock(THD * thd, int lock_type)
+{
+ int error;
+ DBUG_ENTER("ha_s3::external_lock");
+
+ error= ha_maria::external_lock(thd, lock_type);
+ if (in_alter_table == S3_ADD_PARTITION && !error && lock_type == F_UNLCK)
+ {
+ /*
+ This was a new partition. All data is now copied to the table
+ so it's time to move it to S3.
+ */
+
+ MARIA_SHARE *share= file->s;
+ uint org_open_count;
+
+ /* First, flush all data to the Aria table */
+ if (flush_pagecache_blocks(share->pagecache, &share->kfile,
+ FLUSH_RELEASE))
+ error= my_errno;
+ if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
+ FLUSH_RELEASE))
+ error= my_errno;
+ org_open_count= share->state.open_count;
+ if (share->global_changed)
+ share->state.open_count--;
+ if (_ma_state_info_write(share, MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+ MA_STATE_INFO_WRITE_LOCK))
+ error= my_errno;
+ share->state.open_count= org_open_count;
+
+ if (!error)
+ {
+ S3_INFO to_s3_info;
+ char database[NAME_LEN+1], *name= file->s->open_file_name.str;
+ ms3_st *s3_client;
+
+ /* Copy data to S3 */
+ if (s3_info_init(&to_s3_info, name, database, sizeof(database)-1))
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+ if (!(s3_client= s3_open_connection(&to_s3_info)))
+ DBUG_RETURN(HA_ERR_NO_CONNECTION);
+
+ /*
+ Note that if error is set, then the empty temp table was not
+ removed
+ */
+ error= move_table_to_s3(s3_client, &to_s3_info, name, 1);
+ s3_deinit(s3_client);
+
+ maria_delete_table_files(name, 1, 0);
+ }
+ }
+ DBUG_RETURN(error);
+}
+
+
+/******************************************************************************
+ Storage engine handler definitions
+******************************************************************************/
+
+/**
+ Create a new S3 handler
+*/
+
+static handler *s3_create_handler(handlerton *hton,
+ TABLE_SHARE * table,
+ MEM_ROOT *mem_root)
+{
+ return new (mem_root) ha_s3(hton, table);
+}
+
+
+static int s3_hton_panic(handlerton *hton, ha_panic_function flag)
+{
+ if (flag == HA_PANIC_CLOSE && s3_hton)
+ {
+ end_pagecache(&s3_pagecache, TRUE);
+ s3_deinit_library();
+ my_free(s3_access_key);
+ my_free(s3_secret_key);
+ s3_access_key= s3_secret_key= 0;
+ s3_hton= 0;
+ }
+ return 0;
+}
+
+
+/**
+ Check if a table is in S3 as part of discovery and, if so, fill in the TABLE_SHARE.
+
+ @param hton S3 handlerton
+ @param thd MariaDB thd
+ @param [out] share If table exists, this is updated to contain the found
+ TABLE_SHARE (based on the .frm in S3)
+
+ @return 0 Table exists
+ @return # Error number
+*/
+
+static int s3_discover_table(handlerton *hton, THD* thd, TABLE_SHARE *share)
+{
+ S3_INFO s3_info;
+ S3_BLOCK frm_block, par_block;
+ ms3_st *s3_client;
+ int error;
+ DBUG_ENTER("s3_discover_table");
+
+ if (s3_info_init(&s3_info))
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+ if (!(s3_client= s3_open_connection(&s3_info)))
+ DBUG_RETURN(HA_ERR_NO_CONNECTION);
+
+ s3_info.database= share->db;
+ s3_info.table= share->table_name;
+ s3_info.base_table= share->table_name;
+
+ if (s3_get_def(s3_client, &s3_info, &frm_block, "frm"))
+ {
+ s3_free(&frm_block);
+ s3_deinit(s3_client);
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+ }
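+ /* The "par" object only exists for partitioned tables; ignore failures */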
+ (void) s3_get_def(s3_client, &s3_info, &par_block, "par");
+
+ error= share->init_from_binary_frm_image(thd, 1,
+ frm_block.str, frm_block.length,
+ par_block.str, par_block.length);
+ s3_free(&frm_block);
+ s3_free(&par_block);
+ s3_deinit(s3_client);
+ DBUG_RETURN((my_errno= error));
+}
+
+
+/**
+ Check if a table exists
+
+ @return 0 frm doesn't exist
+ @return 1 frm exists
+*/
+
+static int s3_discover_table_existence(handlerton *hton, const char *db,
+ const char *table_name)
+{
+ S3_INFO s3_info;
+ ms3_st *s3_client;
+ int res;
+ DBUG_ENTER("s3_discover_table_existence");
+
+ /* Ignore names in "mysql" database to speed up boot */
+ if (!strcmp(db, MYSQL_SCHEMA_NAME.str))
+ DBUG_RETURN(0);
+
+ if (s3_info_init(&s3_info))
+ DBUG_RETURN(0);
+ if (!(s3_client= s3_open_connection(&s3_info)))
+ DBUG_RETURN(0);
+
+ s3_info.database.str= db;
+ s3_info.database.length= strlen(db);
+ s3_info.table.str= table_name;
+ s3_info.table.length= strlen(table_name);
+
+ res= s3_frm_exists(s3_client, &s3_info);
+ s3_deinit(s3_client);
+ DBUG_PRINT("exit", ("exists: %d", res == 0));
+ DBUG_RETURN(res == 0); // Return 1 if exists
+}
+
+
+/**
+ Return a list of all S3 tables in a database
+
+ Partitioned tables are not shown
+*/
+
+static int s3_discover_table_names(handlerton *hton __attribute__((unused)),
+ LEX_CSTRING *db,
+ MY_DIR *dir __attribute__((unused)),
+ handlerton::discovered_list *result)
+{
+ char aws_path[AWS_PATH_LENGTH];
+ S3_INFO s3_info;
+ ms3_st *s3_client;
+ ms3_list_st *list, *org_list= 0;
+ int error;
+ DBUG_ENTER("s3_discover_table_names");
+
+ /* Ignore names in "mysql" database to speed up boot */
+ if (!strcmp(db->str, MYSQL_SCHEMA_NAME.str))
+ DBUG_RETURN(0);
+
+ if (s3_info_init(&s3_info))
+ DBUG_RETURN(0);
+ if (!(s3_client= s3_open_connection(&s3_info)))
+ DBUG_RETURN(0);
+
+ strxnmov(aws_path, sizeof(aws_path)-1, db->str, "/", NullS);
+
+ if ((error= ms3_list_dir(s3_client, s3_info.bucket.str, aws_path, &org_list)))
+ goto end;
+
+ for (list= org_list ; list ; list= list->next)
+ {
+ const char *name= list->key + db->length + 1; // Skip database and '/'
+ if (!strstr(name, "#P#"))
+ {
+ size_t name_length= strlen(name)-1; // Remove end '/'
+ result->add_table(name, name_length);
+ }
+ }
+ if (org_list)
+ ms3_list_free(org_list);
+end:
+ s3_deinit(s3_client);
+ DBUG_RETURN(0);
+}
+
+/*
+ Check if definition of table in S3 is same as in MariaDB.
+ This also covers the case where the table is not in S3 anymore.
+
+ Called when a copy of the S3 table is taken from the MariaDB table cache
+
+ TODO: Could possibly be optimized by checking if the file on S3 has the
+ same time, data and size as when the table was originally opened.
+*/
+
+int ha_s3::discover_check_version()
+{
+ S3_INFO s3_info= *file->s->s3_path;
+ s3_info.tabledef_version= table->s->tabledef_version;
+ /*
+ We have to change the database and table as the table may be part of a
+ partitioned table. In this case we want to check the frm file of the
+ partitioned table, not that of the individual partition.
+ */
+ s3_info.base_table= table->s->table_name;
+ return (s3_check_frm_version(file->s3, &s3_info) ?
+ HA_ERR_TABLE_DEF_CHANGED : 0);
+}
+
+
+/**
+ Update the .frm file in S3
+*/
+
+static int s3_notify_tabledef_changed(handlerton *,
+ LEX_CSTRING *db, LEX_CSTRING *table,
+ LEX_CUSTRING *frm,
+ LEX_CUSTRING *org_tabledef_version,
+ handler *)
+{
+ char aws_path[AWS_PATH_LENGTH];
+ S3_INFO s3_info;
+ ms3_st *s3_client;
+ int error= 0;
+ DBUG_ENTER("s3_notify_tabledef_changed");
+
+ if (strstr(table->str, "#P#"))
+ DBUG_RETURN(0); // Ignore partitions
+
+ if (s3_info_init(&s3_info))
+ DBUG_RETURN(0);
+ if (!(s3_client= s3_open_connection(&s3_info)))
+ DBUG_RETURN(0);
+
+ s3_info.database= *db;
+ s3_info.base_table= *table;
+ s3_info.tabledef_version= *org_tabledef_version;
+ if (s3_check_frm_version(s3_client, &s3_info))
+ {
+ error= 1;
+ goto err;
+ }
+
+ strxnmov(aws_path, sizeof(aws_path)-1, db->str, "/", table->str, "/frm",
+ NullS);
+
+ if (s3_put_object(s3_client, s3_info.bucket.str, aws_path, (uchar*) frm->str,
+ frm->length, 0))
+ error= 2;
+
+err:
+ s3_deinit(s3_client);
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Update the .frm and .par file of a partitioned table stored in s3
+
+ Logic is:
+ - Skip temporary tables used internally by ALTER TABLE and ALTER PARTITION
+ - In case of delete, delete the .frm and .par file from S3
+ - In case of create, copy the .frm and .par files to S3
+ - In case of rename:
+ - Delete from old_path if it is not an internal temporary file and if it exists
+ - Copy new .frm and .par file to S3
+
+ To ensure that this works with the replay logic of ALTER PARTITION,
+ deletes should produce no errors, only notes.
+*/
+
+static int s3_create_partitioning_metadata(const char *path,
+ const char *old_path,
+ chf_create_flags action_flag)
+{
+ ms3_st *s3_client;
+ S3_INFO s3_info;
+ int error= 0;
+ char database[NAME_LEN+1];
+ const char *tmp_path;
+ DBUG_ENTER("s3_create_partitioning_metadata");
+
+ /* Path is empty in case of delete */
+ tmp_path= path ? path : old_path;
+
+ if (s3_info_init(&s3_info, tmp_path, database, sizeof(database)-1))
+ DBUG_RETURN(HA_ERR_UNSUPPORTED);
+ if (!(s3_client= s3_open_connection(&s3_info)))
+ DBUG_RETURN(HA_ERR_NO_CONNECTION);
+
+ switch (action_flag) {
+ case CHF_DELETE_FLAG:
+ case CHF_RENAME_FLAG:
+ {
+ if (!is_mariadb_internal_tmp_table(old_path + dirname_length(old_path)))
+ {
+ S3_INFO s3_info2;
+ char database2[NAME_LEN+1];
+ s3_info_init(&s3_info2, old_path, database2, sizeof(database2)-1);
+
+ partition_delete_from_s3(s3_client, s3_info2.bucket.str,
+ s3_info2.database.str, s3_info2.table.str,
+ MYF(ME_NOTE));
+ }
+ if (action_flag == CHF_DELETE_FLAG)
+ break;
+ }
+ /* Fall through */
+ case CHF_CREATE_FLAG:
+ if (!is_mariadb_internal_tmp_table(path + dirname_length(path)))
+ error= partition_copy_to_s3(s3_client, s3_info.bucket.str,
+ path, old_path,
+ s3_info.database.str, s3_info.table.str);
+ break;
+ case CHF_INDEX_FLAG:
+ break;
+ }
+ s3_deinit(s3_client);
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Initialize s3 plugin
+*/
+
+static int ha_s3_init(void *p)
+{
+ bool res;
+ static const char *no_exts[]= { 0 };
+
+ s3_hton= (handlerton *)p;
+ s3_hton->db_type= DB_TYPE_S3;
+ s3_hton->create= s3_create_handler;
+ s3_hton->panic= s3_hton_panic;
+ s3_hton->table_options= s3_table_option_list;
+ s3_hton->discover_table= s3_discover_table;
+ s3_hton->discover_table_names= s3_discover_table_names;
+ s3_hton->discover_table_existence= s3_discover_table_existence;
+ s3_hton->notify_tabledef_changed= s3_notify_tabledef_changed;
+ s3_hton->create_partitioning_metadata= s3_create_partitioning_metadata;
+ s3_hton->tablefile_extensions= no_exts;
+ s3_hton->commit= 0;
+ s3_hton->rollback= 0;
+ s3_hton->checkpoint_state= 0;
+ s3_hton->flush_logs= 0;
+ s3_hton->show_status= 0;
+ s3_hton->prepare_for_backup= 0;
+ s3_hton->end_backup= 0;
+ s3_hton->flags= ((s3_slave_ignore_updates ? HTON_IGNORE_UPDATES : 0) |
+ (s3_replicate_alter_as_create_select ?
+ HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE : 0));
+ /* Copy global arguments to s3_access_key and s3_secret_key */
+ update_access_key(0,0,0,0);
+ update_secret_key(0,0,0,0);
+
+ if ((res= !init_pagecache(&s3_pagecache,
+ (size_t) s3_pagecache_buffer_size,
+ s3_pagecache_division_limit,
+ s3_pagecache_age_threshold, maria_block_size,
+ s3_pagecache_file_hash_size, 0)))
+ s3_hton= 0;
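+ /* Install the S3-specific callbacks that read and free whole S3 blocks */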
+ s3_pagecache.big_block_read= s3_block_read;
+ s3_pagecache.big_block_free= s3_free;
+ s3_init_library();
+ if (s3_debug)
+ ms3_debug();
+
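+ /*
+ Publish a set of S3 helper functions through the global 's3f' table so
+ that server code outside this plugin can call them without linking
+ directly against the S3 engine.
+ */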
+ struct s3_func s3f_real =
+ {
+ ms3_set_option, s3_free, ms3_deinit, s3_unique_file_number,
+ read_index_header, s3_check_frm_version, s3_info_copy,
+ set_database_and_table_from_path, s3_open_connection
+ };
+ s3f= s3f_real;
+
+ return res ? HA_ERR_INITIALIZATION : 0;
+}
+
+static int ha_s3_deinit(void*)
+{
+ bzero(&s3f, sizeof(s3f));
+ return 0;
+}
+
+static SHOW_VAR status_variables[]= {
+ {"pagecache_blocks_not_flushed",
+ (char*) &s3_pagecache.global_blocks_changed, SHOW_LONG},
+ {"pagecache_blocks_unused",
+ (char*) &s3_pagecache.blocks_unused, SHOW_LONG},
+ {"pagecache_blocks_used",
+ (char*) &s3_pagecache.blocks_used, SHOW_LONG},
+ {"pagecache_read_requests",
+ (char*) &s3_pagecache.global_cache_r_requests, SHOW_LONGLONG},
+ {"pagecache_reads",
+ (char*) &s3_pagecache.global_cache_read, SHOW_LONGLONG},
+ {NullS, NullS, SHOW_LONG}
+};
+
+
+static struct st_mysql_sys_var* system_variables[]= {
+ MYSQL_SYSVAR(block_size),
+ MYSQL_SYSVAR(debug),
+ MYSQL_SYSVAR(protocol_version),
+ MYSQL_SYSVAR(pagecache_age_threshold),
+ MYSQL_SYSVAR(pagecache_buffer_size),
+ MYSQL_SYSVAR(pagecache_division_limit),
+ MYSQL_SYSVAR(pagecache_file_hash_size),
+ MYSQL_SYSVAR(host_name),
+ MYSQL_SYSVAR(port),
+ MYSQL_SYSVAR(use_http),
+ MYSQL_SYSVAR(bucket),
+ MYSQL_SYSVAR(access_key),
+ MYSQL_SYSVAR(secret_key),
+ MYSQL_SYSVAR(region),
+ MYSQL_SYSVAR(slave_ignore_updates),
+ MYSQL_SYSVAR(replicate_alter_as_create_select),
+ NULL
+};
+
+struct st_mysql_storage_engine s3_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+maria_declare_plugin(s3)
+{
+ MYSQL_STORAGE_ENGINE_PLUGIN,
+ &s3_storage_engine,
+ "S3",
+ "MariaDB Corporation Ab",
+ "Read only table stored in S3. Created by running "
+ "ALTER TABLE table_name ENGINE=s3",
+ PLUGIN_LICENSE_GPL,
+ ha_s3_init, /* Plugin Init */
+ ha_s3_deinit, /* Plugin Deinit */
+ 0x0100, /* 1.0 */
+ status_variables, /* status variables */
+ system_variables, /* system variables */
+ "1.0", /* string version */
+ MariaDB_PLUGIN_MATURITY_STABLE/* maturity */
+}
+maria_declare_plugin_end;