diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 12:24:36 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 12:24:36 +0000 |
commit | 06eaf7232e9a920468c0f8d74dcf2fe8b555501c (patch) | |
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/maria | |
parent | Initial commit. (diff) | |
download | mariadb-06eaf7232e9a920468c0f8d74dcf2fe8b555501c.tar.xz mariadb-06eaf7232e9a920468c0f8d74dcf2fe8b555501c.zip |
Adding upstream version 1:10.11.6. (upstream/1%10.11.6)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/maria')
288 files changed, 122601 insertions, 0 deletions
diff --git a/storage/maria/CMakeLists.txt b/storage/maria/CMakeLists.txt new file mode 100644 index 00000000..f55d78f0 --- /dev/null +++ b/storage/maria/CMakeLists.txt @@ -0,0 +1,138 @@ +# Copyright (C) 2007 MySQL AB +# Copyright (C) 2009,2020 MariaDB Corporation Ab +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +INCLUDE_DIRECTORIES(${SSL_INCLUDE_DIRS}) + +IF(SSL_DEFINES) + SET_SOURCE_FILES_PROPERTIES(ma_crypt.c PROPERTIES COMPILE_FLAGS ${SSL_DEFINES}) +ENDIF() + +SET(ARIA_SOURCES ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c + ma_rnext.c ma_rnext_same.c + ma_search.c ma_page.c ma_key_recover.c ma_key.c + ma_locking.c ma_state.c + ma_rrnd.c ma_scan.c ma_cache.c + ma_statrec.c ma_packrec.c ma_dynrec.c + ma_blockrec.c ma_bitmap.c + ma_update.c ma_write.c ma_unique.c + ma_delete.c + ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c + ma_rsamepos.c ma_panic.c ma_close.c ma_create.c + ma_range.c ma_dbug.c ma_checksum.c + ma_changed.c ma_static.c ma_delete_all.c + ma_delete_table.c ma_rename.c ma_check.c + ma_keycache.c ma_preload.c ma_ft_parser.c + ma_ft_update.c ma_ft_boolean_search.c + ma_ft_nlq_search.c ft_maria.c ma_sort.c + ha_maria.cc trnman.c lockman.c + ma_rt_index.c ma_rt_key.c ma_rt_mbr.c ma_rt_split.c + ma_sp_key.c ma_control_file.c ma_loghandler.c + ma_pagecache.c ma_pagecaches.c + ma_checkpoint.c ma_recovery.c ma_commit.c ma_pagecrc.c + ha_maria.h 
maria_def.h ma_recovery_util.c ma_servicethread.c + ma_norec.c + ma_crypt.c ma_backup.c +) + +IF(APPLE) + # Workaround linker bug on OSX 10.7 + ADD_DEFINITIONS(-fno-common) +ENDIF() + +IF(CMAKE_SYSTEM_NAME MATCHES AIX) + # Workaround linker bug on AIX + SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-berok") +ENDIF() + +MYSQL_ADD_PLUGIN(aria ${ARIA_SOURCES} STORAGE_ENGINE MANDATORY + LINK_LIBRARIES myisam mysys mysys_ssl + RECOMPILE_FOR_EMBEDDED) + +MYSQL_ADD_EXECUTABLE(aria_ftdump aria_ftdump.c COMPONENT Server) +TARGET_LINK_LIBRARIES(aria_ftdump aria) + +MYSQL_ADD_EXECUTABLE(aria_chk aria_chk.c COMPONENT Server) +TARGET_LINK_LIBRARIES(aria_chk aria) + +MYSQL_ADD_EXECUTABLE(aria_read_log aria_read_log.c COMPONENT Server) +TARGET_LINK_LIBRARIES(aria_read_log aria) + +MYSQL_ADD_EXECUTABLE(aria_dump_log aria_dump_log.c unittest/ma_loghandler_examples.c COMPONENT Server) +TARGET_LINK_LIBRARIES(aria_dump_log aria) +SET_TARGET_PROPERTIES(aria_dump_log PROPERTIES COMPILE_FLAGS "-DMARIA_DUMP_LOG") + +MYSQL_ADD_EXECUTABLE(aria_pack aria_pack.c COMPONENT Server) +TARGET_LINK_LIBRARIES(aria_pack aria) + +IF(WITH_UNIT_TESTS) + ADD_EXECUTABLE(ma_test1 ma_test1.c) + TARGET_LINK_LIBRARIES(ma_test1 aria) + + ADD_EXECUTABLE(ma_test2 ma_test2.c) + TARGET_LINK_LIBRARIES(ma_test2 aria) + + ADD_EXECUTABLE(ma_test3 ma_test3.c) + TARGET_LINK_LIBRARIES(ma_test3 aria) + + ADD_EXECUTABLE(ma_rt_test ma_rt_test.c) + TARGET_LINK_LIBRARIES(ma_rt_test aria) + + ADD_EXECUTABLE(ma_sp_test ma_sp_test.c) + TARGET_LINK_LIBRARIES(ma_sp_test aria) + + ADD_EXECUTABLE(test_ma_backup test_ma_backup.c) + TARGET_LINK_LIBRARIES(test_ma_backup aria) + + ADD_SUBDIRECTORY(unittest) + +ENDIF() + +IF (MSVC) + SET_TARGET_PROPERTIES(aria_chk aria_pack PROPERTIES LINK_FLAGS "setargv.obj") +ENDIF() + +OPTION(USE_ARIA_FOR_TMP_TABLES "Use Aria for temporary tables" ON) + +# +# S3 +# +INCLUDE (CheckIncludeFiles) + +SET(S3_SOURCES s3_func.c + libmarias3/src/debug.c libmarias3/src/error.c 
libmarias3/src/marias3.c + libmarias3/src/request.c libmarias3/src/response.c libmarias3/src/sha256.c + libmarias3/src/sha256-internal.c libmarias3/src/xml.c + libmarias3/src/assume_role.c) + +IF(NOT PLUGIN_S3 STREQUAL NO AND NOT WIN32) + FIND_PACKAGE(CURL) +ENDIF() + +IF (CURL_FOUND) + INCLUDE_DIRECTORIES(${CURL_INCLUDE_DIRS}) + MYSQL_ADD_PLUGIN(s3 ha_s3.cc ${S3_SOURCES} COMPONENT s3-engine + LINK_LIBRARIES ${CURL_LIBRARIES} z STORAGE_ENGINE NOT_EMBEDDED CONFIG s3.cnf) +ENDIF() + +SET(CPACK_RPM_s3-engine_PACKAGE_SUMMARY "Amazon S3 archival storage engine for MariaDB" PARENT_SCOPE) +SET(CPACK_RPM_s3-engine_PACKAGE_DESCRIPTION "The S3 storage engine allows one to archive MariaDB tables in Amazon S3 (or any third-party public or private cloud that implements S3 API), but still have them accessible in MariaDB in read-only mode." PARENT_SCOPE) + +IF(TARGET s3) + MYSQL_ADD_EXECUTABLE(aria_s3_copy aria_s3_copy.cc ${S3_SOURCES} COMPONENT s3-engine) + TARGET_LINK_LIBRARIES(aria_s3_copy aria myisam mysys mysys_ssl ${CURL_LIBRARIES} ${ZLIB_LIBRARY}) + INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/libmarias3) + ADD_DEFINITIONS(-DWITH_S3_STORAGE_ENGINE) +ENDIF() diff --git a/storage/maria/aria_chk.c b/storage/maria/aria_chk.c new file mode 100644 index 00000000..143110dc --- /dev/null +++ b/storage/maria/aria_chk.c @@ -0,0 +1,2169 @@ +/* Copyright (C) 2006-2003 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Describe, check and repair of MARIA tables */ + +#include "ma_fulltext.h" +#include <myisamchk.h> +#include <my_bit.h> +#include <m_ctype.h> +#include <my_getopt.h> +#include <my_check_opt.h> +#include <my_handler_errors.h> +/* Remove next line if you want aria_chk to produce a stack trace */ +#undef HAVE_BACKTRACE +#include <my_stacktrace.h> + +static uint decode_bits; +static char **default_argv; +static const char *load_default_groups[]= { "aria_chk", 0 }; +static const char *set_collation_name, *opt_tmpdir, *opt_log_dir; +static const char *default_log_dir; +static CHARSET_INFO *set_collation; +static int stopwords_inited= 0; +static MY_TMPDIR maria_chk_tmpdir; +static my_bool opt_transaction_logging, opt_debug; +static my_bool opt_ignore_control_file, opt_require_control_file; +static my_bool opt_warning_for_wrong_transid, opt_update_state; +static my_bool have_control_file= 0; + +static const char *type_names[]= +{ + "impossible","char","binary", "short", "long", "float", + "double","number","unsigned short", + "unsigned long","longlong","ulonglong","int24", + "uint24","int8","varchar", "varbin", "varchar2", "varbin2", "bit", + "?","?" +}; + +static const char *prefix_packed_txt="packed ", + *bin_packed_txt="prefix ", + *diff_txt="stripped ", + *null_txt="NULL", + *blob_txt="BLOB "; + +static const char *field_pack[]= +{ + "","no endspace", "no prespace", + "no zeros", "blob", "constant", "table-lockup", + "always zero","varchar","unique-hash","?","?" +}; + +static const char *record_formats[]= +{ + "Fixed length", "Packed", "Compressed", "Block", "No data", "?", "?" 
+}; + +static const char *bitmap_description[]= +{ + "Empty page", "Part filled head page","Part filled head page", + "Part filled head page", "Full head page", + "Part filled tail page","Part filled tail page", + "Full tail or blob page" +}; + +static const char *maria_stats_method_str="nulls_unequal"; +static char default_open_errmsg[]= "%d when opening Aria table '%s'"; +static char default_close_errmsg[]= "%d when closing Aria table '%s'"; + +static void get_options(int *argc,char * * *argv); +static void print_version(void); +static void usage(void); +static int maria_chk(HA_CHECK *param, char *filename); +static void descript(HA_CHECK *param, register MARIA_HA *info, char *name); +static int maria_sort_records(HA_CHECK *param, register MARIA_HA *info, + char *name, uint sort_key, + my_bool write_info, my_bool update_index); +static int sort_record_index(MARIA_SORT_PARAM *sort_param, MARIA_PAGE *page, + uint sortkey, File new_file, + my_bool update_index); +static my_bool write_log_record(HA_CHECK *param); +ATTRIBUTE_NORETURN static void my_exit(int exit_code); + +HA_CHECK check_param; + +/* + Register handler error messages for usage with my_error() + + NOTES + This is safe to call multiple times as my_error_register() + will ignore calls to register already registered error numbers. +*/ + +static const char **get_handler_error_messages(int e __attribute__((unused))) +{ + return handler_error_messages; +} + + +/* Free memory and exit */ + +static void my_exit(int exit_code) +{ + free_tmpdir(&maria_chk_tmpdir); + free_defaults(default_argv); + my_error_unregister(HA_ERR_FIRST, + HA_ERR_FIRST+ array_elements(handler_error_messages)-1); + my_end(check_param.testflag & T_INFO ? 
+ MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR); + exit(exit_code); +} + +/* Main program */ + +int main(int argc, char **argv) +{ + int error; + MY_INIT(argv[0]); + + my_setup_stacktrace(); + default_log_dir= opt_log_dir= maria_data_root= "."; + maria_chk_init(&check_param); + check_param.opt_lock_memory= 1; /* Lock memory if possible */ + check_param.using_global_keycache = 0; + get_options(&argc,(char***) &argv); + maria_quick_table_bits=decode_bits; + error=0; + maria_init(); + my_error_register(get_handler_error_messages, HA_ERR_FIRST, + HA_ERR_FIRST+ array_elements(handler_error_messages)-1); + + maria_block_size= 0; /* Use block size from control file */ + if (!opt_ignore_control_file) + { + if ((ma_control_file_open(FALSE, opt_require_control_file || + !(check_param.testflag & T_SILENT), + TRUE))) + { + if (opt_require_control_file || + (opt_transaction_logging && (check_param.testflag & T_REP_ANY))) + { + error= 1; + goto end; + } + } + else + have_control_file= 1; + } + if (!have_control_file) + opt_warning_for_wrong_transid= 0; + + /* + If we are doing a repair, user may want to store this repair into the log + so that the log has a complete history and can be used to replay. + */ + if (opt_transaction_logging && (check_param.testflag & T_REP_ANY)) + { + if (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0 || + translog_init(opt_log_dir, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0)) + { + _ma_check_print_error(&check_param, + "Can't initialize transaction logging. 
Run " + "recovery with switch --skip-transaction-log"); + error= 1; + goto end; + } + } + + while (--argc >= 0) + { + int new_error=maria_chk(&check_param, *(argv++)); + if ((check_param.testflag & T_REP_ANY) != T_REP) + check_param.testflag&= ~T_REP; + fflush(stdout); + fflush(stderr); + if (check_param.wrong_trd_printed && + (check_param.testflag & T_FORCE_CREATE) && + !(check_param.error_printed | check_param.warning_printed)) + { + /* Only wrong create_trd. Run zerofill */ + ulonglong old_testflag= check_param.testflag; + check_param.testflag= T_ZEROFILL; + error|= maria_chk(&check_param, argv[-1]); + check_param.testflag= old_testflag; + check_param.error_printed= 0; + check_param.warning_printed= 0; + fflush(stdout); + fflush(stderr); + } + if ((check_param.error_printed | check_param.warning_printed) && + (check_param.testflag & T_FORCE_CREATE) && + (!(check_param.testflag & (T_REP | T_REP_BY_SORT | T_SORT_RECORDS | + T_SORT_INDEX)))) + { + ulonglong old_testflag=check_param.testflag; + if (!(check_param.testflag & T_REP)) + check_param.testflag|= T_REP_BY_SORT; + check_param.testflag&= ~T_EXTEND; /* Not needed */ + error|=maria_chk(&check_param, argv[-1]); + check_param.testflag= old_testflag; + fflush(stdout); + fflush(stderr); + } + error|=new_error; + if (argc && (!(check_param.testflag & T_SILENT) || + check_param.testflag & T_INFO)) + { + puts("\n---------\n"); + fflush(stdout); + } + } +end: + if (check_param.total_files > 1) + { /* Only if descript */ + char buff[22],buff2[22]; + if (!(check_param.testflag & T_SILENT) || check_param.testflag & T_INFO) + puts("\n---------"); + printf("\nTotal of all %d Aria-files:\nData records: %9s Deleted blocks: %9s\n",check_param.total_files,llstr(check_param.total_records,buff), + llstr(check_param.total_deleted,buff2)); + } + maria_end(); + my_exit(error); +#ifndef _lint + return 0; /* No compiler warning */ +#endif +} /* main */ + +enum options_mc { + OPT_CHARSETS_DIR=256, OPT_SET_COLLATION,OPT_START_CHECK_POS, 
+ OPT_CORRECT_CHECKSUM, OPT_CREATE_MISSING_KEYS, OPT_PAGE_BUFFER_SIZE, + OPT_KEY_CACHE_BLOCK_SIZE, OPT_MARIA_BLOCK_SIZE, + OPT_READ_BUFFER_SIZE, OPT_WRITE_BUFFER_SIZE, OPT_SORT_BUFFER_SIZE, + OPT_SORT_KEY_BLOCKS, OPT_DECODE_BITS, OPT_FT_MIN_WORD_LEN, + OPT_FT_MAX_WORD_LEN, OPT_FT_STOPWORD_FILE, + OPT_MAX_RECORD_LENGTH, OPT_AUTO_CLOSE, OPT_STATS_METHOD, OPT_TRANSACTION_LOG, + OPT_ZEROFILL_KEEP_LSN, + OPT_REQUIRE_CONTROL_FILE, OPT_IGNORE_CONTROL_FILE, + OPT_LOG_DIR, OPT_WARNING_FOR_WRONG_TRANSID +}; + +static struct my_option my_long_options[] = +{ + {"analyze", 'a', + "Analyze distribution of keys. Will make some joins in MySQL faster. You can check the calculated distribution.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifdef __NETWARE__ + {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"block-search", 'b', + "No help available.", + 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"backup", 'B', + "Make a backup of the .MAD file as 'filename-time.BAK'.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"character-sets-dir", OPT_CHARSETS_DIR, + "Directory where character sets are.", + (char**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"check", 'c', + "Check table for errors.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"check-only-changed", 'C', + "Check only tables that have changed since last check. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"correct-checksum", OPT_CORRECT_CHECKSUM, + "Correct checksum information for table.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"create-missing-keys", OPT_CREATE_MISSING_KEYS, + "Create missing keys. This assumes that the data file is correct and that " + "the the number of rows stored in the index file is correct. 
Enables " + "--quick", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', + "Output debug log. Often this is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"description", 'd', + "Prints some information about table.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"data-file-length", 'D', + "Max length of data file (when recreating data-file when it's full).", + &check_param.max_data_file_length, + &check_param.max_data_file_length, + 0, GET_LL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"extend-check", 'e', + "If used when checking a table, ensure that the table is 100 percent consistent, which will take a long time. If used when repairing a table, try to recover every possible row from the data file. Normally this will also find a lot of garbage rows; Don't use this option with repair if you are not totally desperate.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"fast", 'F', + "Check only tables that haven't been closed properly. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"force", 'f', + "Restart with -r if there are any errors in the table. States will be updated as with --update-state.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"HELP", 'H', + "Print all argument options sorted alphabetically and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"help", '?', + "Print all options by groups and exit. See also --HELP", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"information", 'i', + "Print statistics information about table that is checked.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "ignore-control-file", OPT_IGNORE_CONTROL_FILE, + "Ignore the control file", + (uchar**)&opt_ignore_control_file, 0, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"keys-used", 'k', + "Tell Aria to update only some specific keys. 
# is a bit mask of which keys to use. This can be used to get faster inserts.", + &check_param.keys_in_use, + &check_param.keys_in_use, + 0, GET_ULL, REQUIRED_ARG, -1, 0, 0, 0, 0, 0}, + {"datadir", 'h', + "Path for control file (and logs if --logdir not used).", + (char**) &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG, + 0, 0, 0, 0, 0, 0}, + {"logdir", OPT_LOG_DIR, + "Path for log files.", + (char**) &opt_log_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"max-record-length", OPT_MAX_RECORD_LENGTH, + "Skip rows bigger than this if aria_chk can't allocate memory to hold it", + &check_param.max_record_length, + &check_param.max_record_length, + 0, GET_ULL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0}, + {"medium-check", 'm', + "Faster than extend-check, but only finds 99.99% of all errors. Should be good enough for most cases.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"quick", 'q', "Faster repair by not modifying the data file.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"read-only", 'T', + "Don't mark table as checked.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"recover", 'r', + "Can fix almost anything except unique keys that aren't unique.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"parallel-recover", 'p', + "Same as '-r' but creates all the keys in parallel.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"safe-recover", 'o', + "Uses old recovery method; Slower than '-r' but can handle a couple of cases where '-r' reports that it can't fix the data file.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"sort-recover", 'n', + "Force recovering with sorting even if the temporary file was very big.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "require-control-file", OPT_REQUIRE_CONTROL_FILE, + "Abort if cannot find control file", + (uchar**)&opt_require_control_file, 0, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, +#ifdef DEBUG + {"start-check-pos", OPT_START_CHECK_POS, + "No help 
available.", + 0, 0, 0, GET_ULL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"set-auto-increment", 'A', + "Force auto_increment to start at this or higher value. If no value is given, then sets the next auto_increment value to the highest used value for the auto key + 1.", + &check_param.auto_increment_value, + &check_param.auto_increment_value, + 0, GET_ULL, OPT_ARG, 0, 0, 0, 0, 0, 0}, + {"set-collation", OPT_SET_COLLATION, + "Change the collation used by the index", + (char**) &set_collation_name, 0, 0, GET_STR, REQUIRED_ARG, + 0, 0, 0, 0, 0, 0}, + {"silent", 's', + "Only print errors. One can use two -s to make aria_chk very silent.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"sort-index", 'S', + "Sort index blocks. This speeds up 'read-next' in applications.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"sort-records", 'R', + "Sort records according to an index. This makes your data much more localized and may speed up things. (It may be VERY slow to do a sort the first time!)", + &check_param.opt_sort_key, + &check_param.opt_sort_key, + 0, GET_UINT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"tmpdir", 't', "Path for temporary files.", (char**) &opt_tmpdir, + 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"transaction-log", OPT_TRANSACTION_LOG, + "Log repair command to transaction log", + &opt_transaction_logging, &opt_transaction_logging, + 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"update-state", 'U', + "Mark tables as crashed if any errors were found and clean if check " + "didn't find any errors but table was marked as 'not clean' before. This " + "allows one to get rid of warnings like 'table not properly closed'. " + "If table was updated, update also the timestamp for when check was made. 
" + "This option is on by default!", + &opt_update_state, &opt_update_state, 0, GET_BOOL, NO_ARG, + 1, 0, 0, 0, 0, 0}, + {"unpack", 'u', + "Unpack file packed with aria_pack.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', + "Print more information. This can be used with --description and --check. Use many -v for more verbosity!", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"wait", 'w', "Wait if table is locked.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"warning-for-wrong-transaction-id", OPT_WARNING_FOR_WRONG_TRANSID, + "Give a warning if we find a transaction id in the table that is bigger" + "than what exists in the control file. Use --skip-... to disable warning", + &opt_warning_for_wrong_transid, &opt_warning_for_wrong_transid, + 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0}, + { "page_buffer_size", OPT_PAGE_BUFFER_SIZE, + "Size of page buffer. Used by --safe-repair", + &check_param.use_buffers, &check_param.use_buffers, 0, + GET_ULONG, REQUIRED_ARG, PAGE_BUFFER_INIT, 1024L*1024L, + SIZE_T_MAX, (long) MALLOC_OVERHEAD, (long) IO_SIZE, 0}, + { "read_buffer_size", OPT_READ_BUFFER_SIZE, + "Read buffer size for sequential reads during scanning", + &check_param.read_buffer_length, + &check_param.read_buffer_length, 0, GET_ULONG, REQUIRED_ARG, + (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD, + ~0ULL, (long) MALLOC_OVERHEAD, (long) 1L, 0}, + { "write_buffer_size", OPT_WRITE_BUFFER_SIZE, + "Write buffer size for sequential writes during repair of fixed size or dynamic size rows", + &check_param.write_buffer_length, + &check_param.write_buffer_length, 0, GET_ULONG, REQUIRED_ARG, + (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD, + ~0UL, (long) MALLOC_OVERHEAD, (long) 1L, 0}, + { "sort_buffer_size", OPT_SORT_BUFFER_SIZE, + "Size of sort buffer. 
Used by --recover", + &check_param.orig_sort_buffer_length, + &check_param.orig_sort_buffer_length, 0, GET_ULL, REQUIRED_ARG, + SORT_BUFFER_INIT, MARIA_MIN_SORT_MEMORY, SIZE_T_MAX/10, MALLOC_OVERHEAD, + 1L, 0}, + { "sort_key_blocks", OPT_SORT_KEY_BLOCKS, + "Internal buffer for sorting keys; Don't touch :)", + &check_param.sort_key_blocks, + &check_param.sort_key_blocks, 0, GET_ULONG, REQUIRED_ARG, + BUFFERS_WHEN_SORTING, 4L, 100L, 0L, 1L, 0}, + { "decode_bits", OPT_DECODE_BITS, "", &decode_bits, + &decode_bits, 0, GET_UINT, REQUIRED_ARG, 9L, 4L, 17L, 0L, 1L, 0}, + { "ft_min_word_len", OPT_FT_MIN_WORD_LEN, "", &ft_min_word_len, + &ft_min_word_len, 0, GET_ULONG, REQUIRED_ARG, 4, 1, HA_FT_MAXCHARLEN, + 0, 1, 0}, + { "ft_max_word_len", OPT_FT_MAX_WORD_LEN, "", &ft_max_word_len, + &ft_max_word_len, 0, GET_ULONG, REQUIRED_ARG, HA_FT_MAXCHARLEN, 10, + HA_FT_MAXCHARLEN, 0, 1, 0}, + { "aria_ft_stopword_file", OPT_FT_STOPWORD_FILE, + "Use stopwords from this file instead of built-in list.", + (char**) &ft_stopword_file, (char**) &ft_stopword_file, 0, GET_STR, + REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + { "stats_method", OPT_STATS_METHOD, + "Specifies how index statistics collection code should treat NULLs. " + "Possible values of name are \"nulls_unequal\" (default behavior for 4.1/5.0), " + "\"nulls_equal\" (emulate 4.0 behavior), and \"nulls_ignored\".", + (char**) &maria_stats_method_str, (char**) &maria_stats_method_str, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + { "zerofill", 'z', + "Fill empty space in data and index files with zeroes. This makes the data file movable between different servers. 
It also fixes any wrong transaction or LSN numbers in the table after a crash or if someone removed the Aria log files.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "zerofill-keep-lsn", OPT_ZEROFILL_KEEP_LSN, + "Like --zerofill but does not zero out LSN of data/index pages;" + " used only for testing and debugging", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static void print_version(void) +{ + printf("%s Ver 1.3 for %s on %s\n", my_progname, SYSTEM_TYPE, + MACHINE_TYPE); +} + + +static void usage(void) +{ + print_version(); + puts("By Monty, for your professional use"); + puts("This software comes with NO WARRANTY: see the PUBLIC for details.\n"); + puts("Description, check and repair of Aria tables."); + puts("Used without options all tables on the command will be checked for errors"); + printf("Usage: %s [OPTIONS] tables[.MAI]\n", my_progname_short); + printf("\nGlobal options:\n"); +#ifndef DBUG_OFF + printf("\ + -#, --debug=... Output debug log. Often this is 'd:t:o,filename'.\n"); +#endif + printf("\ + -H, --HELP Print all argument options sorted alphabetically.\n\ + -?, --help Print all options by groups\n\ + --datadir=path Path for control file (and logs if --logdir not used)\n\ + --logdir=path Path for log files\n\ + --ignore-control-file Don't open the control file. Only use this if you\n\ + are sure the tables are not in use by another\n\ + program!\n\ + --require-control-file Abort if we can't find/read the aria_log_control\n\ + file\n\ + -s, --silent Only print errors. One can use two -s to make\n\ + maria_chk very silent.\n\ + -t, --tmpdir=path Path for temporary files. Multiple paths can be\n\ + specified, separated by "); +#if defined( _WIN32) + printf("semicolon (;)"); +#else + printf("colon (:)"); +#endif + printf(", they will be used\n\ + in a round-robin fashion.\n\ + -v, --verbose Print more information. This can be used with\n\ + --description and --check. 
Use many -v for more verbosity.\n\ + -V, --version Print version and exit.\n\ + -w, --wait Wait if table is locked.\n\n"); +#ifdef DEBUG + puts(" --start-check-pos=# Start reading file at given offset.\n"); +#endif + + puts("Check options (check is the default action for aria_chk):\n\ + -c, --check Check table for errors.\n\ + -e, --extend-check Check the table VERY throughly. Only use this in\n\ + extreme cases as aria_chk should normally be able to\n\ + find out if the table is ok even without this switch.\n\ + -F, --fast Check only tables that haven't been closed properly.\n\ + -C, --check-only-changed\n\ + Check only tables that have changed since last check.\n\ + -f, --force Restart with '-r' if there are any errors in the table.\n\ + States will be updated as with '--update-state'.\n\ + -i, --information Print statistics information about table that is checked.\n\ + -m, --medium-check Faster than extend-check, but only finds 99.99% of\n\ + all errors. Should be good enough for most cases.\n\ + -T, --read-only Don't mark table as checked.\n\ + -U, --update-state Mark tables as crashed if any errors were found and\n\ + clean if check didn't find any errors but table was\n\ + marked as 'not clean' before. This allows one to get\n\ + rid of warnings like 'table not properly closed'. If\n\ + table was updated, update also the timestamp for when\n\ + the check was made. This option is on by default!\n\ + Use --skip-update-state to disable.\n\ + --warning-for-wrong-transaction-id\n\ + Give a warning if we find a transaction id in the table that is bigger\n\ + than what exists in the control file. Use --skip-... 
to disable warning\n\ + "); + + puts("\ +Recover (repair)/ options (When using '--recover' or '--safe-recover'):\n\ + -B, --backup Make a backup of the .MAD file as 'filename-time.BAK'.\n\ + --correct-checksum Correct checksum information for table.\n\ + -D, --data-file-length=# Max length of data file (when recreating data\n\ + file when it's full).\n\ + --create-missing-keys\n\ + Create missing keys. This assumes that the data\n\ + file is correct and that the the number of rows stored\n\ + in the index file is correct. Enables --quick.\n\ + -e, --extend-check Try to recover every possible row from the data file\n\ + Normally this will also find a lot of garbage rows;\n\ + Don't use this option if you are not totally desperate.\n\ + -f, --force Overwrite old temporary files. Add another --force to\n\ + avoid 'sort_buffer_size is too small' errors.\n\ + In this case we will attempt to do the repair with the\n\ + given sort_buffer_size and dynamically allocate\n\ + as many management buffers as needed.\n\ + -k, --keys-used=# Tell Aria to update only some specific keys. # is a\n\ + bit mask of which keys to use. This can be used to\n\ + get faster inserts.\n\ + --max-record-length=#\n\ + Skip rows bigger than this if aria_chk can't allocate\n\ + memory to hold it.\n\ + -r, --recover Can fix almost anything except unique keys that aren't\n\ + unique.\n\ + -n, --sort-recover Forces recovering with sorting even if the temporary\n\ + file would be very big.\n\ + -p, --parallel-recover\n\ + Uses the same technique as '-r' and '-n', but creates\n\ + all the keys in parallel, in different threads."); + puts("\ + -o, --safe-recover Uses old recovery method; Slower than '-r' but can\n \ + handle a couple of cases where '-r' reports that it\n\ + can't fix the data file.\n\ + --transaction-log Log repair command to transaction log. 
This is needed\n\ + if one wants to use the aria_read_log to repeat the \n\ + repair\n\ + --character-sets-dir=...\n\ + Directory where character sets are.\n\ + --set-collation=name\n\ + Change the collation used by the index.\n\ + -q, --quick Faster repair by not modifying the data file.\n\ + One can give a second '-q' to force aria_chk to\n\ + modify the original datafile in case of duplicate keys.\n\ + NOTE: Tables where the data file is corrupted can't be\n\ + fixed with this option.\n\ + -u, --unpack Unpack file packed with ariapack.\n\ +"); + + puts("Other actions:\n\ + -a, --analyze Analyze distribution of keys. Will make some joins in\n\ + MariaDB faster. You can check the calculated distribution\n\ + by using '--description --verbose table_name'.\n\ + --stats_method=name Specifies how index statistics collection code should\n\ + treat NULLs. Possible values of name are \"nulls_unequal\"\n\ + (default for 4.1/5.0), \"nulls_equal\" (emulate 4.0), and \n\ + \"nulls_ignored\".\n\ + -d, --description Prints some information about table.\n\ + -A, --set-auto-increment[=value]\n\ + Force auto_increment to start at this or higher value\n\ + If no value is given, then sets the next auto_increment\n\ + value to the highest used value for the auto key + 1.\n\ + -S, --sort-index Sort index blocks. This speeds up 'read-next' in\n\ + applications.\n\ + -R, --sort-records=#\n\ + Sort records according to an index. This makes your\n\ + data much more localized and may speed up things\n\ + (It may be VERY slow to do a sort the first time!).\n\ + -b, --block-search=#\n\ + Find a record, a block at given offset belongs to.\n\ + -z, --zerofill Fill empty space in data and index files with zeroes.\n\ + This makes the data file movable between different \n\ + servers. 
It also fixes any wrong transaction or LSN\n\ + numbers in the table after a crash or if someone\n\ + removed the Aria log files.\n\ + --zerofill-keep-lsn Like --zerofill but does not zero out LSN of\n\ + data/index pages."); + + puts("Variables:\n\ +--page_buffer_size=# Size of page buffer. Used by --safe-repair\n\ +--read_buffer_size=# Read buffer size for sequential reads during scanning\n\ +--sort_buffer_size=# Size of sort buffer. Used by --recover\n\ +--sort_key_blocks=# Internal buffer for sorting keys; Don't touch :)\n\ +--write_buffer_size=# Write buffer size for sequential writes during repair"); + + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + +const char *maria_stats_method_names[] = {"nulls_unequal", "nulls_equal", + "nulls_ignored", NullS}; +TYPELIB maria_stats_method_typelib= { + array_elements(maria_stats_method_names) - 1, "", + maria_stats_method_names, NULL}; + + /* Read options */ + +static my_bool +get_one_option(const struct my_option *opt, + const char *argument, + const char *filename __attribute__((unused))) +{ + switch (opt->id) { +#ifdef __NETWARE__ + case OPT_AUTO_CLOSE: + setscreenmode(SCR_AUTOCLOSE_ON_EXIT); + break; +#endif + case 'a': + if (argument == disabled_my_option) + check_param.testflag&= ~T_STATISTICS; + else + check_param.testflag|= T_STATISTICS; + break; + case 'A': + if (argument) + check_param.auto_increment_value= strtoull(argument, NULL, 0); + else + check_param.auto_increment_value= 0; /* Set to max used value */ + check_param.testflag|= T_AUTO_INC; + break; + case 'b': + check_param.search_after_block= strtoul(argument, NULL, 10); + break; + case 'B': + if (argument == disabled_my_option) + check_param.testflag&= ~T_BACKUP_DATA; + else + check_param.testflag|= T_BACKUP_DATA; + break; + case 'c': + if (argument == disabled_my_option) + check_param.testflag&= ~T_CHECK; + else + check_param.testflag|= T_CHECK; + break; + case 'C': + if (argument == disabled_my_option) + 
check_param.testflag&= ~(T_CHECK | T_CHECK_ONLY_CHANGED); + else + check_param.testflag|= T_CHECK | T_CHECK_ONLY_CHANGED; + break; + case 'D': + check_param.max_data_file_length=strtoll(argument, NULL, 10); + break; + case 's': /* silent */ + if (argument == disabled_my_option) + check_param.testflag&= ~(T_SILENT | T_VERY_SILENT); + else + { + if (check_param.testflag & T_SILENT) + check_param.testflag|= T_VERY_SILENT; + check_param.testflag|= T_SILENT; + check_param.testflag&= ~T_WRITE_LOOP; + } + break; + case 'w': + if (argument == disabled_my_option) + check_param.testflag&= ~T_WAIT_FOREVER; + else + check_param.testflag|= T_WAIT_FOREVER; + break; + case 'd': /* description if isam-file */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_DESCRIPT; + else + check_param.testflag|= T_DESCRIPT; + break; + case 'e': /* extend check */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_EXTEND; + else + check_param.testflag|= T_EXTEND; + break; + case 'i': + if (argument == disabled_my_option) + check_param.testflag&= ~T_INFO; + else + check_param.testflag|= T_INFO; + break; + case 'f': + if (argument == disabled_my_option) + { + check_param.tmpfile_createflag= O_RDWR | O_TRUNC | O_EXCL; + check_param.testflag&= ~(T_FORCE_CREATE | T_UPDATE_STATE | + T_FORCE_SORT_MEMORY); + } + else + { + if (check_param.testflag & T_FORCE_CREATE) + check_param.testflag= T_FORCE_SORT_MEMORY; + check_param.tmpfile_createflag= O_RDWR | O_TRUNC; + check_param.testflag|= T_FORCE_CREATE | T_UPDATE_STATE; + } + break; + case 'F': + if (argument == disabled_my_option) + check_param.testflag&= ~T_FAST; + else + check_param.testflag|= T_FAST; + break; + case 'k': + check_param.keys_in_use= (ulonglong) strtoll(argument, NULL, 10); + break; + case 'm': + if (argument == disabled_my_option) + check_param.testflag&= ~T_MEDIUM; + else + check_param.testflag|= T_MEDIUM; /* Medium check */ + break; + case 'r': /* Repair table */ + check_param.testflag&= ~T_REP_ANY; + if 
(argument != disabled_my_option) + check_param.testflag|= T_REP_BY_SORT; + break; + case 'p': + check_param.testflag&= ~T_REP_ANY; + if (argument != disabled_my_option) + check_param.testflag|= T_REP_PARALLEL; + break; + case 'o': + check_param.testflag&= ~T_REP_ANY; + check_param.force_sort= 0; + if (argument != disabled_my_option) + { + check_param.testflag|= T_REP; + my_disable_async_io= 1; /* More safety */ + } + break; + case 'n': + check_param.testflag&= ~T_REP_ANY; + if (argument == disabled_my_option) + check_param.force_sort= 0; + else + { + check_param.testflag|= T_REP_BY_SORT; + check_param.force_sort= 1; + } + break; + case 'q': + if (argument == disabled_my_option) + check_param.testflag&= ~(T_QUICK | T_FORCE_UNIQUENESS); + else + { + /* + If T_QUICK was specified before, but not OPT_CREATE_MISSING_KEYS, + then add T_FORCE_UNIQUENESS. + */ + check_param.testflag|= + ((check_param.testflag & (T_QUICK | T_CREATE_MISSING_KEYS)) == + T_QUICK ? T_FORCE_UNIQUENESS : T_QUICK); + } + break; + case OPT_CREATE_MISSING_KEYS: + if (argument == disabled_my_option) + check_param.testflag&= ~(T_QUICK | T_CREATE_MISSING_KEYS); + else + { + check_param.testflag|= T_QUICK | T_CREATE_MISSING_KEYS; + /* Use repair by sort by default */ + if (!(check_param.testflag & T_REP_ANY)) + check_param.testflag|= T_REP_BY_SORT; + } + break; + case 'u': + if (argument == disabled_my_option) + check_param.testflag&= ~T_UNPACK; + else + { + check_param.testflag|= T_UNPACK; + if (!(check_param.testflag & T_REP_ANY)) + check_param.testflag|= T_REP_BY_SORT; + } + break; + case 'v': /* Verbose */ + if (argument == disabled_my_option) + { + check_param.testflag&= ~T_VERBOSE; + check_param.verbose=0; + } + else + { + check_param.testflag|= T_VERBOSE; + check_param.verbose++; + } + break; + case 'R': /* Sort records */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_SORT_RECORDS; + else + { + check_param.testflag|= T_SORT_RECORDS; + check_param.opt_sort_key= (uint) 
atoi(argument) - 1; + if (check_param.opt_sort_key >= MARIA_MAX_KEY) + { + fprintf(stderr, + "The value of the sort key is bigger than max key: %d.\n", + MARIA_MAX_KEY); + my_exit(1); + } + } + break; + case 'S': /* Sort index */ + if (argument == disabled_my_option) + check_param.testflag&= ~T_SORT_INDEX; + else + check_param.testflag|= T_SORT_INDEX; + break; + case 'T': + if (argument == disabled_my_option) + check_param.testflag&= ~T_READONLY; + else + check_param.testflag|= T_READONLY; + break; + case 'U': + if (argument == disabled_my_option) + check_param.testflag&= ~T_UPDATE_STATE; + else + check_param.testflag|= T_UPDATE_STATE; + break; + case '#': + DBUG_SET_INITIAL(argument ? argument : "d:t:o,/tmp/aria_chk.trace"); + opt_debug= 1; + break; + case 'V': + print_version(); + my_exit(0); + case OPT_CORRECT_CHECKSUM: + if (argument == disabled_my_option) + check_param.testflag&= ~T_CALC_CHECKSUM; + else + check_param.testflag|= T_CALC_CHECKSUM; + break; + case OPT_STATS_METHOD: + { + int method; + enum_handler_stats_method UNINIT_VAR(method_conv); + maria_stats_method_str= argument; + if ((method=find_type(argument, &maria_stats_method_typelib, 2)) <= 0) + { + fprintf(stderr, "Invalid value of stats_method: %s.\n", argument); + my_exit(1); + } + switch (method-1) { + case 0: + method_conv= MI_STATS_METHOD_NULLS_EQUAL; + break; + case 1: + method_conv= MI_STATS_METHOD_NULLS_NOT_EQUAL; + break; + case 2: + method_conv= MI_STATS_METHOD_IGNORE_NULLS; + break; + default: abort(); /* Impossible */ + } + check_param.stats_method= method_conv; + break; + } +#ifdef DEBUG /* Only useful if debugging */ + case OPT_START_CHECK_POS: + check_param.start_check_pos= strtoull(argument, NULL, 0); + break; +#endif + case 'z': + if (argument == disabled_my_option) + check_param.testflag&= ~T_ZEROFILL; + else + check_param.testflag|= T_ZEROFILL; + break; + case OPT_ZEROFILL_KEEP_LSN: + if (argument == disabled_my_option) + check_param.testflag&= ~(T_ZEROFILL_KEEP_LSN | 
T_ZEROFILL); + else + check_param.testflag|= (T_ZEROFILL_KEEP_LSN | T_ZEROFILL); + break; + case 'H': + my_print_help(my_long_options); + my_print_variables(my_long_options); + my_exit(0); + case '?': + usage(); + my_exit(0); + } + return 0; +} + + +static void get_options(register int *argc,register char ***argv) +{ + int ho_error; + + load_defaults_or_exit("my", load_default_groups, argc, argv); + default_argv= *argv; + check_param.testflag= T_UPDATE_STATE; + if (isatty(fileno(stdout))) + check_param.testflag|=T_WRITE_LOOP; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + my_exit(ho_error); + + /* If using repair, then update checksum if one uses --update-state */ + if ((check_param.testflag & T_UPDATE_STATE) && + (check_param.testflag & T_REP_ANY)) + check_param.testflag|= T_CALC_CHECKSUM; + + if (*argc == 0) + { + usage(); + my_exit(-1); + } + + if ((check_param.testflag & T_UNPACK) && + (check_param.testflag & (T_QUICK | T_SORT_RECORDS))) + { + fprintf(stderr, "%s: --unpack can't be used with --quick or --sort-records\n", + my_progname_short); + my_exit(1); + } + if ((check_param.testflag & T_READONLY) && + (check_param.testflag & + (T_REP_ANY | T_STATISTICS | T_AUTO_INC | + T_SORT_RECORDS | T_SORT_INDEX | T_FORCE_CREATE))) + { + fprintf(stderr, "%s: Can't use --readonly when repairing or sorting\n", + my_progname_short); + my_exit(1); + } + + if (!opt_debug) + { + DEBUGGER_OFF; /* Speed up things a bit */ + } + if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir)) + my_exit(1); + + check_param.tmpdir=&maria_chk_tmpdir; + + if (set_collation_name) + if (!(set_collation= get_charset_by_name(set_collation_name, + MYF(MY_UTF8_IS_UTF8MB3 | MY_WME)))) + my_exit(1); + + if (maria_data_root != default_log_dir && opt_log_dir == default_log_dir) + { + /* --datadir was used and --log-dir was not. 
Set log-dir to datadir */ + opt_log_dir= maria_data_root; + } + + /* If we are using zerofill, then we don't need to read the control file */ + if ((check_param.testflag & (T_ZEROFILL_KEEP_LSN | T_ZEROFILL)) && + !(check_param.testflag & ~(T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | T_STATISTICS | T_CHECK | T_FAST | T_CHECK_ONLY_CHANGED))) + opt_ignore_control_file= 1; + + return; +} /* get options */ + + +/** + Check/repair table + + @return 0 table is ok + @return 1 Got warning during check + @return 2 Got error during check/repair. +*/ + +static int maria_chk(HA_CHECK *param, char *filename) +{ + int error,lock_type,recreate; + uint warning_printed_by_chk_status; + my_bool rep_quick= MY_TEST(param->testflag & (T_QUICK | T_FORCE_UNIQUENESS)); + my_bool born_transactional; + MARIA_HA *info; + File datafile; + char llbuff[22],llbuff2[22]; + my_bool state_updated=0; + MARIA_SHARE *share; + DBUG_ENTER("maria_chk"); + + param->out_flag= error= param->error_printed= recreate= 0; + param->warning_printed= param->wrong_trd_printed= 0; + datafile=0; + param->isam_file_name=filename; /* For error messages */ + warning_printed_by_chk_status= 0; + if (!(info=maria_open(filename, + (param->testflag & (T_DESCRIPT | T_READONLY)) ? + O_RDONLY : O_RDWR, + HA_OPEN_FOR_REPAIR | + ((param->testflag & T_WAIT_FOREVER) ? + HA_OPEN_WAIT_IF_LOCKED : + (param->testflag & T_DESCRIPT) ? + HA_OPEN_IGNORE_IF_LOCKED : HA_OPEN_ABORT_IF_LOCKED), + 0))) + { + /* Avoid twice printing of isam file name */ + param->error_printed++; + switch (my_errno) { + case HA_ERR_CRASHED: + _ma_check_print_error(param,"'%s' doesn't have a correct index definition. 
You need to recreate it before you can do a repair",filename); + break; + case HA_ERR_NOT_A_TABLE: + _ma_check_print_error(param,"'%s' is not a Aria table",filename); + break; + case HA_ERR_CRASHED_ON_USAGE: + _ma_check_print_error(param,"'%s' is marked as crashed",filename); + break; + case HA_ERR_CRASHED_ON_REPAIR: + _ma_check_print_error(param,"'%s' is marked as crashed after last repair",filename); + break; + case HA_ERR_OLD_FILE: + _ma_check_print_error(param,"'%s' is a old type of Aria table", filename); + break; + case HA_ERR_NEW_FILE: + _ma_check_print_error(param,"'%s' uses new features not supported by this version of the Aria library", filename); + break; + case HA_ERR_END_OF_FILE: + _ma_check_print_error(param,"Couldn't read complete header from '%s'", filename); + break; + case EAGAIN: + _ma_check_print_error(param,"'%s' is locked. Use -w to wait until unlocked",filename); + break; + case ENOENT: + _ma_check_print_error(param,"File '%s' doesn't exist",filename); + break; + case EACCES: + _ma_check_print_error(param,"You don't have permission to use '%s'", + filename); + break; + default: + _ma_check_print_error(param,"%d when opening Aria table '%s'", + my_errno,filename); + break; + } + DBUG_RETURN(1); + } + share= info->s; + share->tot_locks-= share->r_locks; + share->r_locks=0; + maria_block_size= share->base.block_size; + + if (share->data_file_type == BLOCK_RECORD || + ((param->testflag & T_UNPACK) && + share->state.header.org_data_file_type == BLOCK_RECORD)) + { + if (param->testflag & T_SORT_RECORDS) + { + _ma_check_print_error(param, + "Record format used by '%s' is is not yet supported with sort-records", + filename); + param->error_printed= 0; + error= 1; + goto end2; + } + /* We can't do parallel repair with BLOCK_RECORD yet */ + if (param->testflag & T_REP_PARALLEL) + { + param->testflag&= ~T_REP_PARALLEL; + param->testflag|= T_REP_BY_SORT; + } + } + if ((share->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED) && + !(param->testflag & 
T_DESCRIPT)) + { + _ma_check_print_warning(param, + "Table %s is encrypted. Only --description (-d) " + "option is supported", filename); + param->warning_printed= 0; + goto end2; + } + + /* + Skip the checking of the file if: + We are using --fast and the table is closed properly + We are using --check-only-changed-tables and the table hasn't changed + */ + if (param->testflag & (T_FAST | T_CHECK_ONLY_CHANGED)) + { + my_bool need_to_check= (maria_is_crashed(info) || + share->state.open_count != 0); + + if ((param->testflag & (T_REP_ANY | T_SORT_RECORDS)) && + ((share->state.changed & (STATE_CHANGED | STATE_CRASHED_FLAGS | + STATE_IN_REPAIR) || + !(param->testflag & T_CHECK_ONLY_CHANGED)))) + need_to_check=1; + + if (info->s->base.keys && info->state->records) + { + if ((param->testflag & T_STATISTICS) && + (share->state.changed & STATE_NOT_ANALYZED)) + need_to_check=1; + if ((param->testflag & T_SORT_INDEX) && + (share->state.changed & STATE_NOT_SORTED_PAGES)) + need_to_check=1; + if ((param->testflag & T_REP_BY_SORT) && + (share->state.changed & STATE_NOT_OPTIMIZED_KEYS)) + need_to_check=1; + } + if ((param->testflag & T_CHECK_ONLY_CHANGED) && + (share->state.changed & (STATE_CHANGED | STATE_CRASHED_FLAGS | + STATE_IN_REPAIR))) + need_to_check=1; + if (!need_to_check) + { + if (!(param->testflag & T_SILENT) || param->testflag & T_INFO) + printf("Aria file: %s is already checked\n",filename); + if (maria_close(info)) + { + _ma_check_print_error(param,"%d when closing Aria table '%s'", + my_errno,filename); + DBUG_RETURN(1); + } + DBUG_RETURN(0); + } + } + if ((param->testflag & (T_REP_ANY | T_STATISTICS | + T_SORT_RECORDS | T_SORT_INDEX)) && + (((param->testflag & T_UNPACK) && + share->data_file_type == COMPRESSED_RECORD) || + mi_uint2korr(share->state.header.state_info_length) != + MARIA_STATE_INFO_SIZE || + mi_uint2korr(share->state.header.base_info_length) != + MARIA_BASE_INFO_SIZE || + maria_is_any_intersect_keys_active(param->keys_in_use, share->base.keys, + 
~share->state.key_map) || + maria_test_if_almost_full(info) || + info->s->state.header.file_version[3] != maria_file_magic[3] || + (set_collation && + set_collation->number != share->base.language))) + { + if (set_collation) + param->language= set_collation->number; + if (maria_recreate_table(param, &info,filename)) + { + fprintf(stderr, "Aria table '%s' is not fixed because of errors\n", + filename); + DBUG_RETURN(-1); + } + recreate=1; + if (!(param->testflag & T_REP_ANY)) + { + param->testflag|=T_REP_BY_SORT; /* if only STATISTICS */ + if (!(param->testflag & T_SILENT)) + printf("- '%s' has old table-format. Recreating index\n",filename); + rep_quick= 1; + } + share= info->s; + share->tot_locks-= share->r_locks; + share->r_locks=0; + } + + if (param->testflag & T_DESCRIPT) + { + param->total_files++; + param->total_records+=info->state->records; + param->total_deleted+=info->state->del; + descript(param, info, filename); + maria_close(info); /* Should always succeed */ + DBUG_RETURN(0); + } + + if (!stopwords_inited++) + ft_init_stopwords(); + + if (!(param->testflag & T_READONLY)) + lock_type = F_WRLCK; /* table is changed */ + else + lock_type= F_RDLCK; + if (info->lock_type == F_RDLCK) + info->lock_type=F_UNLCK; /* Read only table */ + if (_ma_readinfo(info,lock_type,0)) + { + _ma_check_print_error(param,"Can't lock indexfile of '%s', error: %d", + filename,my_errno); + param->error_printed=0; + error= 1; + goto end2; + } + /* + _ma_readinfo() has locked the table. + We mark the table as locked (without doing file locks) to be able to + use functions that only works on locked tables (like row caching). 
+ */ + maria_lock_database(info, F_EXTRA_LCK); + datafile= info->dfile.file; + if (init_pagecache(maria_pagecache, (size_t) param->use_buffers, 0, 0, + maria_block_size, 0, MY_WME) == 0) + { + _ma_check_print_error(param, "Can't initialize page cache with %lu memory", + (ulong) param->use_buffers); + error= 1; + goto end2; + } + + if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | + T_ZEROFILL)) + { + /* + Mark table as not transactional to avoid logging. Should not be needed, + maria_repair and maria_zerofill do it already. + */ + _ma_tmp_disable_logging_for_table(info, FALSE); + + if (param->testflag & T_REP_ANY) + { + ulonglong tmp=share->state.key_map; + maria_copy_keys_active(share->state.key_map, share->base.keys, + param->keys_in_use); + if (tmp != share->state.key_map) + info->update|=HA_STATE_CHANGED; + + if (rep_quick && + maria_chk_del(param, info, param->testflag & ~T_VERBOSE)) + { + if (param->testflag & T_FORCE_CREATE) + { + rep_quick=0; + _ma_check_print_info(param,"Creating new data file\n"); + } + else + { + error=1; + _ma_check_print_error(param, + "Quick-recover aborted; Run recovery without switch 'q'"); + } + } + } + if (!error) + { + /* + Unless this was only --zerofill-keep-lsn, old REDOs are not + applicable, tell the server's Recovery to ignore them; we don't + know what the log's end LSN is now, so we just let the server know + that it will have to find and store it. + This is the only case where create_rename_lsn can be a horizon and not + a LSN. + If this was only --zerofill-keep-lsn, the table can be used in + Recovery and especially in this scenario: do a dirty-copy-based backup + (snapshot-like), --zerofill-keep-lsn on the copies to achieve better + compression, compress the copies with an external tool, and after a + restore, Recovery still works (because pages and state still have + their correct LSNs). 
+ */ + if (share->base.born_transactional && + ((param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | + T_ZEROFILL | T_ZEROFILL_KEEP_LSN)) != + (T_ZEROFILL | T_ZEROFILL_KEEP_LSN))) + { + share->state.create_rename_lsn= share->state.is_of_horizon= + share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS; + share->state.create_trid= 0; + } + } + if (!error && (param->testflag & T_REP_ANY)) + { + if ((param->testflag & (T_REP_BY_SORT | T_REP_PARALLEL)) && + (maria_is_any_key_active(share->state.key_map) || + (rep_quick && !param->keys_in_use && !recreate)) && + maria_test_if_sort_rep(info, info->state->records, + info->s->state.key_map, + param->force_sort)) + { + if (param->testflag & T_REP_BY_SORT) + error=maria_repair_by_sort(param,info,filename,rep_quick); + else + error=maria_repair_parallel(param,info,filename,rep_quick); + state_updated=1; + } + else + error=maria_repair(param, info,filename,rep_quick); + } + if (!error && (param->testflag & T_SORT_RECORDS)) + { + /* + The data file is nowadays reopened in the repair code so we should + soon remove the following reopen-code + */ +#ifndef TO_BE_REMOVED + if (param->out_flag & O_NEW_DATA) + { /* Change temp file to org file */ + mysql_file_close(info->dfile.file, MYF(MY_WME)); /* Close new file */ + error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT, + 0, MYF(0)); + if (_ma_open_datafile(info, info->s)) + error=1; + param->out_flag&= ~O_NEW_DATA; /* We are using new datafile */ + param->read_cache.file= info->dfile.file; + } +#endif + if (! error) + { + uint key; + /* + We can't update the index in maria_sort_records if we have a + prefix compressed or fulltext index + */ + my_bool update_index=1; + for (key=0 ; key < share->base.keys; key++) + if (share->keyinfo[key].flag & (HA_BINARY_PACK_KEY|HA_FULLTEXT)) + update_index=0; + + error=maria_sort_records(param,info,filename,param->opt_sort_key, + /* what is the following parameter for ? 
*/ + (my_bool) !(param->testflag & T_REP), + update_index); + datafile= info->dfile.file; /* This is now locked */ + if (!error && !update_index) + { + if (param->verbose) + puts("Table had a compressed index; We must now recreate the index"); + error=maria_repair_by_sort(param,info,filename,1); + } + } + } + if (!error && (param->testflag & T_SORT_INDEX)) + error= maria_sort_index(param,info,filename); + if (!error && (param->testflag & T_ZEROFILL)) + error= maria_zerofill(param, info, filename); + if (!error) + { + DBUG_PRINT("info", ("Resetting crashed state")); + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED_FLAGS | + STATE_IN_REPAIR); + } + else + maria_mark_crashed(info); + } + else if ((param->testflag & T_CHECK) || !(param->testflag & T_AUTO_INC)) + { + if (!(param->testflag & T_VERY_SILENT) || param->testflag & T_INFO) + printf("Checking Aria file: %s\n",filename); + if (!(param->testflag & T_SILENT)) + printf("Data records: %7s Deleted blocks: %7s\n", + llstr(info->state->records,llbuff), + llstr(info->state->del,llbuff2)); + maria_chk_init_for_check(param, info); + if (opt_warning_for_wrong_transid == 0) + param->max_trid= ~ (ulonglong) 0; + + error= maria_chk_status(param,info); + /* Forget warning printed by maria_chk_status if no problems found */ + warning_printed_by_chk_status= param->warning_printed; + param->warning_printed= 0; + + maria_intersect_keys_active(share->state.key_map, param->keys_in_use); + error|= maria_chk_size(param,info); + if (!error || !(param->testflag & (T_FAST | T_FORCE_CREATE))) + error|=maria_chk_del(param, info,param->testflag); + if ((!error || (!(param->testflag & (T_FAST | T_FORCE_CREATE)) && + !param->start_check_pos))) + { + error|=maria_chk_key(param, info); + if (!error && (param->testflag & (T_STATISTICS | T_AUTO_INC))) + error=maria_update_state_info(param, info, + ((param->testflag & T_STATISTICS) ? + UPDATE_STAT : 0) | + ((param->testflag & T_AUTO_INC) ? 
+ UPDATE_AUTO_INC : 0)); + } + if ((!rep_quick && !error) || + !(param->testflag & (T_FAST | T_FORCE_CREATE))) + { + init_io_cache(&param->read_cache,datafile, + (uint) param->read_buffer_length, + READ_CACHE, + (param->start_check_pos ? + param->start_check_pos : + share->pack.header_length), + 1, + MYF(MY_WME)); + maria_lock_memory(param); + if ((info->s->data_file_type != STATIC_RECORD) || + (param->testflag & (T_EXTEND | T_MEDIUM))) + error|=maria_chk_data_link(param, info, + MY_TEST(param->testflag & T_EXTEND)); + end_io_cache(&param->read_cache); + } + if (!error) + { + if (((share->state.changed & + (STATE_CHANGED | STATE_CRASHED_FLAGS | STATE_IN_REPAIR)) || + share->state.open_count != 0) + && (param->testflag & T_UPDATE_STATE)) + info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + DBUG_PRINT("info", ("Resetting crashed state")); + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED_FLAGS | + STATE_IN_REPAIR); + } + else if (!maria_is_crashed(info) && + (param->testflag & T_UPDATE_STATE)) + { /* Mark crashed */ + maria_mark_crashed(info); + info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + } + } + + if ((param->testflag & T_AUTO_INC) || + ((param->testflag & T_REP_ANY) && info->s->base.auto_key)) + _ma_update_auto_increment_key(param, info, + (my_bool) + !MY_TEST(param->testflag & T_AUTO_INC)); + + if (info->update & HA_STATE_CHANGED && ! (param->testflag & T_READONLY)) + { + error|=maria_update_state_info(param, info, + UPDATE_OPEN_COUNT | + (((param->testflag & + (T_REP_ANY | T_UPDATE_STATE)) ? + UPDATE_TIME : 0) | + (state_updated ? UPDATE_STAT : 0) | + ((param->testflag & T_SORT_RECORDS) ? + UPDATE_SORT : 0))); + if (warning_printed_by_chk_status) + _ma_check_print_info(param, "Aria table '%s' was ok. 
Status updated", + filename); + else if (!(param->testflag & T_SILENT)) + printf("State updated\n"); + warning_printed_by_chk_status= 0; + } + info->update&= ~HA_STATE_CHANGED; + _ma_reenable_logging_for_table(info, FALSE); + maria_lock_database(info, F_UNLCK); + +end2: + born_transactional= share->base.born_transactional; + if (maria_close(info)) + { + _ma_check_print_error(param, default_close_errmsg, my_errno, filename); + DBUG_RETURN(1); + } + end_pagecache(maria_pagecache, 1); + if (error == 0) + { + if (param->out_flag & O_NEW_DATA) + error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT, + param->backup_time, + ((param->testflag & T_BACKUP_DATA) ? + MYF(MY_REDEL_MAKE_BACKUP) : MYF(0))); + } + if (opt_transaction_logging && + born_transactional && !error && + (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | + T_ZEROFILL))) + error= write_log_record(param); + + if (param->not_visible_rows_found && (param->testflag & T_VERBOSE)) + { + char buff[22]; + printf("Max transaction id found: %s\n", + llstr(param->max_found_trid, buff)); + } + + fflush(stdout); + fflush(stderr); + + if (param->error_printed) + { + error= 2; + if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX)) + { + fprintf(stderr, "Aria table '%s' is not fixed because of errors\n", + filename); + if (param->testflag & T_REP_ANY) + fprintf(stderr, "Try fixing it by using the --safe-recover (-o), " + "the --force (-f) option or by not using the --quick (-q) " + "flag\n"); + } + else if (!(param->testflag & T_FORCE_CREATE)) + fprintf(stderr, "Aria table '%s' is corrupted\nFix it using switch " + "\"-r\" or \"-o\"\n", filename); + } + else if ((param->warning_printed || warning_printed_by_chk_status) && + ! 
(param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX | + T_FORCE_CREATE))) + { + if (!error) + error= 1; + (void) fprintf(stderr, "Aria table '%s' is usable but should be fixed\n", + filename); + } + + (void) fflush(stderr); + DBUG_RETURN(error); +} /* maria_chk */ + + +/* Write info about table */ + +static void descript(HA_CHECK *param, register MARIA_HA *info, char *name) +{ + uint key,keyseg_nr,field; + reg3 MARIA_KEYDEF *keyinfo; + reg2 HA_KEYSEG *keyseg; + reg4 const char *text; + char buff[200],length[10],*pos,*end; + enum en_fieldtype type; + MARIA_SHARE *share= info->s; + char llbuff[22],llbuff2[22]; + DBUG_ENTER("descript"); + + if (param->testflag & T_VERY_SILENT) + { + longlong checksum= info->state->checksum; + if (!(share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))) + checksum= 0; + printf("%s %s %s\n", name, llstr(info->state->records,llbuff), + llstr(checksum, llbuff2)); + DBUG_VOID_RETURN; + } + + printf("Aria file: %s\n",name); + printf("Record format: %s\n", record_formats[share->data_file_type]); + printf("Crashsafe: %s\n", + share->base.born_transactional ? 
"yes" : "no"); + printf("Character set: %s (%d)\n", + get_charset_name(share->base.language), + (int) share->base.language); + + if (param->testflag & T_VERBOSE) + { + if (share->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED) + printf("Encrypted: yes\n"); + printf("File-version: %d\n", + (int) share->state.header.file_version[3]); + if (share->state.create_time) + { + get_date(buff,1,share->state.create_time); + printf("Creation time: %s\n",buff); + } + if (share->state.check_time) + { + get_date(buff,1,share->state.check_time); + printf("Check/recover time: %s\n",buff); + } + if (share->base.born_transactional) + { + printf("LSNs: create_rename " LSN_FMT "," + " state_horizon " LSN_FMT ", skip_redo " LSN_FMT "\n", + LSN_IN_PARTS(share->state.create_rename_lsn), + LSN_IN_PARTS(share->state.is_of_horizon), + LSN_IN_PARTS(share->state.skip_redo_lsn)); + printf("create_trid: %s\n", + llstr(share->state.create_trid, llbuff)); + } + compile_time_assert((MY_UUID_STRING_LENGTH + 1) <= sizeof(buff)); + buff[MY_UUID_STRING_LENGTH]= 0; + my_uuid2str(share->base.uuid, buff, 1); + printf("UUID: %s\n", buff); + if (ma_control_file_inited() && + memcmp(share->base.uuid, maria_uuid, MY_UUID_SIZE)) + printf("Warning: File UUID not match control file UUID! " + "File is probably moved\n" + "It will be updated to new system on first usage if zerofill is " + "not done\n"); + pos=buff; + if (share->state.changed & STATE_CRASHED) + strmov(buff, share->state.changed & STATE_CRASHED_ON_REPAIR ? + "crashed on repair" : "crashed"); + else if (have_control_file && + (share->state.changed & (STATE_MOVED | STATE_NOT_ZEROFILLED)) == + (STATE_MOVED | STATE_NOT_ZEROFILLED)) + strmov(buff, "moved from another system. 
Use --zerofill to fix it"); + else + { + if (share->state.open_count) + pos=strmov(pos,"open,"); + if (share->state.changed & STATE_CHANGED) + pos=strmov(pos,"changed,"); + else + pos=strmov(pos,"checked,"); + if (!(share->state.changed & STATE_NOT_ANALYZED)) + pos=strmov(pos,"analyzed,"); + if (!(share->state.changed & STATE_NOT_OPTIMIZED_KEYS)) + pos=strmov(pos,"optimized keys,"); + if (!(share->state.changed & STATE_NOT_SORTED_PAGES)) + pos=strmov(pos,"sorted index pages,"); + if (!(share->state.changed & STATE_NOT_ZEROFILLED)) + pos=strmov(pos,"zerofilled,"); + if (!(share->state.changed & STATE_NOT_MOVABLE)) + pos=strmov(pos,"movable,"); + if (have_control_file && (share->state.changed & STATE_MOVED)) + pos=strmov(pos,"moved,"); + pos[-1]=0; /* Remove extra ',' */ + } + printf("Status: %s\n",buff); + if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + printf("Checksum: %26s\n",llstr(info->state->checksum,llbuff)); +; + if (share->options & HA_OPTION_DELAY_KEY_WRITE) + printf("Keys are only flushed at close\n"); + + if (share->options & HA_OPTION_PAGE_CHECKSUM) + printf("Page checksums are used\n"); + if (share->base.auto_key) + { + printf("Auto increment key: %16d Last value: %18s\n", + share->base.auto_key, + llstr(share->state.auto_increment,llbuff)); + } + } + printf("Data records: %16s Deleted blocks: %18s\n", + llstr(info->state->records,llbuff),llstr(info->state->del,llbuff2)); + if (param->testflag & T_SILENT) + DBUG_VOID_RETURN; /* This is enough */ + + if (param->testflag & T_VERBOSE) + { +#ifdef USE_RELOC + printf("Init-relocation: %16s\n",llstr(share->base.reloc,llbuff)); +#endif + printf("Datafile parts: %16s Deleted data: %18s\n", + llstr(share->state.split,llbuff), + llstr(info->state->empty,llbuff2)); + printf("Datafile pointer (bytes): %11d Keyfile pointer (bytes): %13d\n", + share->rec_reflength,share->base.key_reflength); + printf("Datafile length: %16s Keyfile length: %18s\n", + 
llstr(info->state->data_file_length,llbuff), + llstr(info->state->key_file_length,llbuff2)); + + if (info->s->base.reloc == 1L && info->s->base.records == 1L) + puts("This is a one-record table"); + else + { + if (share->base.max_data_file_length != HA_OFFSET_ERROR || + share->base.max_key_file_length != HA_OFFSET_ERROR) + printf("Max datafile length: %16s Max keyfile length: %18s\n", + ullstr(share->base.max_data_file_length,llbuff), + ullstr(share->base.max_key_file_length,llbuff2)); + } + } + printf("Block_size: %16d\n",(int) share->block_size); + printf("Recordlength: %16d\n",(int) share->base.pack_reclength); + if (! maria_is_all_keys_active(share->state.key_map, share->base.keys)) + { + longlong2str(share->state.key_map,buff,2); + printf("Using only keys '%s' of %d possibly keys\n", + buff, share->base.keys); + } + puts("\nTable description:"); + printf("Key Start Len Index Type"); + if (param->testflag & T_VERBOSE) + printf(" Rec/key Root Blocksize"); + putchar('\n'); + + for (key=keyseg_nr=0, keyinfo= &share->keyinfo[0] ; + key < share->base.keys; + key++,keyinfo++) + { + keyseg=keyinfo->seg; + if (keyinfo->flag & HA_NOSAME) text="unique "; + else if (keyinfo->flag & HA_FULLTEXT) text="fulltext "; + else text="multip."; + + pos=buff; + if (keyseg->flag & HA_REVERSE_SORT) + *pos++ = '-'; + pos=strmov(pos,type_names[keyseg->type]); + *pos++ = ' '; + *pos=0; + if (keyinfo->flag & HA_PACK_KEY) + pos=strmov(pos,prefix_packed_txt); + if (keyinfo->flag & HA_BINARY_PACK_KEY) + pos=strmov(pos,bin_packed_txt); + if (keyseg->flag & HA_SPACE_PACK) + pos=strmov(pos,diff_txt); + if (keyseg->flag & HA_BLOB_PART) + pos=strmov(pos,blob_txt); + if (keyseg->flag & HA_NULL_PART) + pos=strmov(pos,null_txt); + *pos=0; + + printf("%-4d%-6ld%-3d %-9s%-23s", + key+1,(long) keyseg->start+1,keyseg->length,text,buff); + if (share->state.key_root[key] != HA_OFFSET_ERROR) + llstr(share->state.key_root[key],buff); + else + buff[0]=0; + if (param->testflag & T_VERBOSE) + printf("%9.0f 
%12s %10d", + share->state.rec_per_key_part[keyseg_nr++], + buff,keyinfo->block_length); + putchar('\n'); + while ((++keyseg)->type != HA_KEYTYPE_END) + { + pos=buff; + if (keyseg->flag & HA_REVERSE_SORT) + *pos++ = '-'; + pos=strmov(pos,type_names[keyseg->type]); + *pos++= ' '; + if (keyseg->flag & HA_SPACE_PACK) + pos=strmov(pos,diff_txt); + if (keyseg->flag & HA_BLOB_PART) + pos=strmov(pos,blob_txt); + if (keyseg->flag & HA_NULL_PART) + pos=strmov(pos,null_txt); + *pos=0; + printf(" %-6ld%-3d %-21s", + (long) keyseg->start+1,keyseg->length,buff); + if (param->testflag & T_VERBOSE) + printf("%11.0f", share->state.rec_per_key_part[keyseg_nr++]); + putchar('\n'); + } + keyseg++; + } + if (share->state.header.uniques) + { + MARIA_UNIQUEDEF *uniqueinfo; + puts("\nUnique Key Start Len Nullpos Nullbit Type"); + for (key=0,uniqueinfo= &share->uniqueinfo[0] ; + key < share->state.header.uniques; key++, uniqueinfo++) + { + my_bool new_row=0; + char null_bit[8],null_pos[8]; + printf("%-8d%-5d",key+1,uniqueinfo->key+1); + for (keyseg=uniqueinfo->seg ; keyseg->type != HA_KEYTYPE_END ; keyseg++) + { + if (new_row) + fputs(" ",stdout); + null_bit[0]=null_pos[0]=0; + if (keyseg->null_bit) + { + my_snprintf(null_bit, sizeof(null_bit), "%d", keyseg->null_bit); + my_snprintf(null_pos, sizeof(null_pos), "%ld", (long) keyseg->null_pos+1); + } + printf("%-7ld%-5d%-9s%-10s%-30s\n", + (long) keyseg->start+1,keyseg->length, + null_pos,null_bit, + type_names[keyseg->type]); + new_row=1; + } + } + } + if (param->verbose > 1) + { + char null_bit[8],null_pos[8]; + printf("\nField Start Length Nullpos Nullbit Type"); + if (share->options & HA_OPTION_COMPRESS_RECORD) + printf(" Huff tree Bits"); + putchar('\n'); + + for (field=0 ; field < share->base.fields ; field++) + { + if (share->options & HA_OPTION_COMPRESS_RECORD) + type=share->columndef[field].base_type; + else + type=(enum en_fieldtype) share->columndef[field].type; + end= strmov(buff, field_pack[type]); + if (end != buff) + { + 
/*
  Sort records according to one key

  Rewrites the data file so that rows are stored in the order in which the
  index 'sort_key' references them.  The rows are written to a new temporary
  data file which, on success, replaces info->dfile.file.

  param         Check context: supplies testflag, write_buffer_length,
                temp_filename and tmpfile_createflag, and receives
                warnings/errors and the O_NEW_DATA out_flag.
  info          Open table handler whose data file is rewritten.
  name          Table name; used in messages and as base for the name of
                the temporary file.
  sort_key      Number (0-based) of the index that defines the new order.
  write_info    If set, and T_SILENT is not set, print record/deleted counts.
  update_index  Passed unchanged to sort_record_index().

  Returns 0 on success and also when nothing is done (key not active,
  FULLTEXT key, HA_BINARY_PACK_KEY key, COMPRESSED_RECORD table or no key
  root); returns 1 on error.
*/

static int maria_sort_records(HA_CHECK *param,
                              register MARIA_HA *info, char *name,
                              uint sort_key,
                              my_bool write_info,
                              my_bool update_index)
{
  int got_error;
  uint key;
  MARIA_KEYDEF *keyinfo;
  File new_file;
  uchar *temp_buff;
  ha_rows old_record_count;
  MARIA_SHARE *share= info->s;
  char llbuff[22],llbuff2[22];
  MARIA_SORT_INFO sort_info;
  MARIA_SORT_PARAM sort_param;
  MARIA_PAGE page;
  DBUG_ENTER("sort_records");

  bzero((char*)&sort_info,sizeof(sort_info));
  bzero((char*)&sort_param,sizeof(sort_param));
  sort_param.sort_info=&sort_info;
  sort_info.param=param;
  keyinfo= &share->keyinfo[sort_key];
  /* Assume failure; only cleared just before the err: label */
  got_error=1;
  temp_buff=0;
  new_file= -1;

  /*
    The following cases are refused with a warning and a return value of 0.
    NOTE(review): error_printed is reset after each warning, presumably so
    that the warning is not treated as an error by the caller - confirm.
  */
  if (! maria_is_key_active(share->state.key_map, sort_key))
  {
    _ma_check_print_warning(param,
                            "Can't sort table '%s' on key %d; No such key",
                            name,sort_key+1);
    param->error_printed=0;
    DBUG_RETURN(0);                             /* Nothing to do */
  }
  if (keyinfo->flag & HA_FULLTEXT)
  {
    _ma_check_print_warning(param,"Can't sort table '%s' on FULLTEXT key %d",
                            name,sort_key+1);
    param->error_printed=0;
    DBUG_RETURN(0);                             /* Nothing to do */
  }
  if (keyinfo->flag & HA_BINARY_PACK_KEY)
  {
    _ma_check_print_warning(param,
                            "Can't sort table '%s' on a key with prefix "
                            "packing %d",
                            name,sort_key+1);
    param->error_printed=0;
    DBUG_RETURN(0);
  }


  if (share->data_file_type == COMPRESSED_RECORD)
  {
    _ma_check_print_warning(param,"Can't sort read-only table '%s'", name);
    param->error_printed=0;
    DBUG_RETURN(0);                             /* Nothing to do */
  }
  if (!(param->testflag & T_SILENT))
  {
    printf("- Sorting records for Aria table '%s'\n",name);
    if (write_info)
      printf("Data records: %9s Deleted: %9s\n",
             llstr(info->state->records,llbuff),
             llstr(info->state->del,llbuff2));
  }
  if (share->state.key_root[sort_key] == HA_OFFSET_ERROR)
    DBUG_RETURN(0);                             /* Nothing to do */

  /*
    The write cache is created without a file (-1); the new data file is
    attached to it further down, once that file has been created.
  */
  if (init_io_cache(&info->rec_cache,-1,(uint) param->write_buffer_length,
                    WRITE_CACHE,share->pack.header_length,1,
                    MYF(MY_WME | MY_WAIT_IF_FULL)))
    goto err;
  info->opt_flag|=WRITE_CACHE_USED;

  /* Buffer for the root page of the index; released with my_afree() */
  if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length)))
  {
    _ma_check_print_error(param,"Not enough memory for key block");
    goto err;
  }

  if (!(sort_param.record= (uchar*) my_malloc(PSI_INSTRUMENT_ME,
                             (uint) share->base.default_rec_buff_size, MYF(0))))
  {
    _ma_check_print_error(param,"Not enough memory for record");
    goto err;
  }

  /*
    Build the data file name in param->temp_filename, then replace its
    extension with DATA_TMP_EXT and create that file.
  */
  fn_format(param->temp_filename,name,"", MARIA_NAME_DEXT,2+4+32);
  new_file=
    mysql_file_create(key_file_tmp,
                      fn_format(param->temp_filename,
                                param->temp_filename, "",
                                DATA_TMP_EXT,
                                MY_REPLACE_EXT | MY_UNPACK_FILENAME),
                      0, param->tmpfile_createflag, MYF(0));
  if (new_file < 0)
  {
    _ma_check_print_error(param,"Can't create new tempfile: '%s'",
                          param->temp_filename);
    goto err;
  }
  /* Copy the first pack.header_length bytes of the old data file, if any */
  if (share->pack.header_length)
    if (maria_filecopy(param, new_file, info->dfile.file, 0L,
                       share->pack.header_length,
                       "datafile-header"))
      goto err;
  info->rec_cache.file=new_file;                /* Use this file for caching */

  maria_lock_memory(param);
  for (key=0 ; key < share->base.keys ; key++)
    share->keyinfo[key].flag|= HA_SORT_ALLOWS_SAME;

  /* Read the root page of the sort key; the recursion starts from it */
  if (mysql_file_pread(share->kfile.file, temp_buff,
                       (uint) keyinfo->block_length,
                       share->state.key_root[sort_key],
                       MYF(MY_NABP+MY_WME)))
  {
    _ma_check_print_error(param, "Can't read indexpage from filepos: %s",
                          llstr(share->state.key_root[sort_key], llbuff));
    goto err;
  }

  /* Setup param for _ma_sort_write_record */
  sort_info.info=info;
  sort_info.new_data_file_type=share->data_file_type;
  sort_param.fix_datafile=1;
  sort_param.master=1;
  sort_param.filepos=share->pack.header_length;
  /*
    Remember the old row count and restart the counter from zero; the two
    are compared after the rewrite to detect lost rows.
  */
  old_record_count=info->state->records;
  info->state->records=0;
  if (sort_info.new_data_file_type != COMPRESSED_RECORD)
    info->state->checksum=0;

  _ma_page_setup(&page, info, keyinfo, share->state.key_root[sort_key],
                 temp_buff);
  if (sort_record_index(&sort_param, &page, sort_key,new_file,update_index) ||
      maria_write_data_suffix(&sort_info,1) ||
      flush_io_cache(&info->rec_cache))
    goto err;

  if (info->state->records != old_record_count)
  {
    _ma_check_print_error(param,"found %s of %s records",
                          llstr(info->state->records,llbuff),
                          llstr(old_record_count,llbuff2));
    goto err;
  }

  /* All rows were rewritten: switch the handler over to the new file */
  mysql_file_close(info->dfile.file, MYF(MY_WME));
  param->out_flag|=O_NEW_DATA;                  /* Data in new file */
  info->dfile.file= new_file;                   /* Use new datafile */
  _ma_set_data_pagecache_callbacks(&info->dfile, info->s);

  info->state->del=0;
  info->state->empty=0;
  share->state.dellink= HA_OFFSET_ERROR;
  info->state->data_file_length=sort_param.filepos;
  share->state.split=info->state->records;      /* Only hole records */
  share->state.version=(ulong) time((time_t*) 0);

  info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);

  if (param->testflag & T_WRITE_LOOP)
  {
    fputs(" \r",stdout);
    fflush(stdout);
  }
  got_error=0;

  /* Common exit for success and failure (reached by fall-through or goto) */
err:
  if (got_error && new_file >= 0)
  {
    /*
      Failure after the temporary file was created: end the cache before
      closing the file it is attached to, then remove the file.
    */
    end_io_cache(&info->rec_cache);
    (void) mysql_file_close(new_file,MYF(MY_WME));
    (void) mysql_file_delete(key_file_tmp, param->temp_filename, MYF(MY_WME));
  }
  if (temp_buff)
  {
    my_afree(temp_buff);
  }
  my_free(sort_param.record);
  info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
  end_io_cache(&info->rec_cache);
  my_free(sort_info.buff);
  sort_info.buff=0;
  /* Note: sortkey is recorded on the failure path as well */
  share->state.sortkey=sort_key;
  DBUG_RETURN(got_error);
} /* sort_records */
keyinfo->block_length : 0) + + ALIGN_SIZE(keyinfo->max_store_length)); + if (!lastkey) + { + _ma_check_print_error(param,"Not Enough memory"); + DBUG_RETURN(-1); + } + if (nod_flag) + temp_buff= lastkey + ALIGN_SIZE(keyinfo->max_store_length); + + tmp_key.data= lastkey; + + used_length= ma_page->size; + keypos= ma_page->buff + share->keypage_header + nod_flag; + endpos= ma_page->buff + used_length; + for ( ;; ) + { + if (nod_flag) + { + next_page= _ma_kpos(nod_flag, keypos); + if (mysql_file_pread(share->kfile.file, temp_buff, + (uint) tmp_key.keyinfo->block_length, next_page, + MYF(MY_NABP+MY_WME))) + { + _ma_check_print_error(param,"Can't read keys from filepos: %s", + llstr(next_page,llbuff)); + goto err; + } + _ma_page_setup(&new_page, info, ma_page->keyinfo, next_page, temp_buff); + + if (sort_record_index(sort_param, &new_page, sort_key, + new_file, update_index)) + goto err; + } + if (keypos >= endpos || + !(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos)) + break; + rec_pos= _ma_row_pos_from_key(&tmp_key); + + if ((*share->read_record)(info,sort_param->record,rec_pos)) + { + _ma_check_print_error(param,"%d when reading datafile",my_errno); + goto err; + } + if (rec_pos != sort_param->filepos && update_index) + { + _ma_dpointer(share, keypos - nod_flag - tmp_key.ref_length, + sort_param->filepos); + if (maria_movepoint(info,sort_param->record,rec_pos,sort_param->filepos, + sort_key)) + { + _ma_check_print_error(param,"%d when updating key-pointers",my_errno); + goto err; + } + } + if (_ma_sort_write_record(sort_param)) + goto err; + } + /* Clear end of block to get better compression if the table is backuped */ + bzero(ma_page->buff + used_length, keyinfo->block_length - used_length); + if (my_pwrite(share->kfile.file, ma_page->buff, (uint)keyinfo->block_length, + ma_page->pos, param->myf_rw)) + { + _ma_check_print_error(param,"%d when updating keyblock",my_errno); + goto err; + } + stack_alloc_free(lastkey, buff_alloced); + DBUG_RETURN(0); + 
+err: + stack_alloc_free(lastkey, buff_alloced); + DBUG_RETURN(1); +} /* sort_record_index */ + + +static my_bool write_log_record(HA_CHECK *param) +{ + /* + Now that all operations including O_NEW_DATA|INDEX are successfully + done, we can write a log record. + */ + MARIA_HA *info= maria_open(param->isam_file_name, O_RDWR, 0, 0); + if (info == NULL) + _ma_check_print_error(param, default_open_errmsg, my_errno, + param->isam_file_name); + else + { + if (write_log_record_for_repair(param, info)) + _ma_check_print_error(param, "%d when writing log record for" + " Aria table '%s'", my_errno, + param->isam_file_name); + else if (maria_close(info)) + _ma_check_print_error(param, default_close_errmsg, my_errno, + param->isam_file_name); + else + return FALSE; + } + return TRUE; +} + +#include "ma_check_standalone.h" diff --git a/storage/maria/aria_dump_log.c b/storage/maria/aria_dump_log.c new file mode 100644 index 00000000..e64c97fc --- /dev/null +++ b/storage/maria/aria_dump_log.c @@ -0,0 +1,196 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include <my_getopt.h> +extern void translog_example_table_init(); +static const char *load_default_groups[]= { "aria_dump_log",0 }; +static void get_options(int *argc,char * * *argv); +#ifndef DBUG_OFF +#if defined(_WIN32) +const char *default_dbug_option= "d:t:i:O,\\aria_dump_log.trace"; +#else +const char *default_dbug_option= "d:t:i:o,/tmp/aria_dump_log.trace"; +#endif +#endif +static ulonglong opt_offset; +static ulong opt_pages; +static const char *opt_file= NULL; +static File handler= -1; +static my_bool opt_unit= 0; +static struct my_option my_long_options[] = +{ +#ifdef IMPLTMENTED + {"body", 'b', + "Print chunk body dump", + (uchar **) &opt_body, (uchar **) &opt_body, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, +#endif +#ifndef DBUG_OFF + {"debug", '#', "Output debug log. 
Often the argument is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"file", 'f', "Path to file which will be read", + (uchar**) &opt_file, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"help", '?', "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "offset", 'o', "Start reading log from this offset", + (uchar**) &opt_offset, (uchar**) &opt_offset, + 0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 }, + { "pages", 'n', "Number of pages to read", + (uchar**) &opt_pages, (uchar**) &opt_pages, 0, + GET_ULONG, REQUIRED_ARG, (long) ~(ulong) 0, + (long) 1, (long) ~(ulong) 0, (long) 0, + (long) 1, 0}, + {"unit-test", 'U', + "Use unit test record table (for logs created by unittests", + (uchar **) &opt_unit, (uchar **) &opt_unit, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static void print_version(void) +{ + printf("%s Ver 1.1 for %s on %s\n", + my_progname_short, SYSTEM_TYPE, MACHINE_TYPE); +} + + +static void usage(void) +{ + print_version(); + puts("Copyright (C) 2008 MySQL AB"); + puts("This software comes with ABSOLUTELY NO WARRANTY. 
This is free software,"); + puts("and you are welcome to modify and redistribute it under the GPL license\n"); + + puts("Dump the raw content of aria log pages."); + puts("For a logical dump, use aria_read_log"); + printf("\nUsage: %s OPTIONS aria_log_file\n", my_progname_short); + my_print_help(my_long_options); + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + + +static my_bool +get_one_option(const struct my_option *opt, + const char *argument __attribute__((unused)), + const char *filename __attribute__((unused))) +{ + switch (opt->id) { + case '?': + usage(); + exit(0); + case 'V': + print_version(); + exit(0); +#ifndef DBUG_OFF + case '#': + DBUG_SET_INITIAL(argument ? argument : default_dbug_option); + break; +#endif + } + return 0; +} + + +static void get_options(int *argc,char ***argv) +{ + int ho_error; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + if (opt_file == NULL && *argc == 1) + opt_file= **argv; + + if (opt_file == NULL) + { + usage(); + exit(1); + } +} + + +/** + @brief maria_dump_log main function. 
+*/ + +int main(int argc, char **argv) +{ + char **default_argv; + uchar buffer[TRANSLOG_PAGE_SIZE]; + MY_INIT(argv[0]); + + load_defaults_or_exit("my", load_default_groups, &argc, &argv); + default_argv= argv; + get_options(&argc, &argv); + + if (opt_unit) + translog_example_table_init(); + else + translog_table_init(); + translog_fill_overhead_table(); + + maria_data_root= "."; + + if ((handler= my_open(opt_file, O_RDONLY, MYF(MY_WME))) < 0) + { + fprintf(stderr, "Can't open file: '%s' errno: %d\n", + opt_file, my_errno); + goto err; + } + if (my_seek(handler, opt_offset, SEEK_SET, MYF(MY_WME)) != + opt_offset) + { + fprintf(stderr, "Can't set position %lld file: '%s' errno: %d\n", + opt_offset, opt_file, my_errno); + goto err; + } + for (; + opt_pages; + opt_offset+= TRANSLOG_PAGE_SIZE, opt_pages--) + { + if (my_pread(handler, buffer, TRANSLOG_PAGE_SIZE, opt_offset, + MYF(MY_NABP))) + { + if (my_errno == HA_ERR_FILE_TOO_SHORT) + goto end; + fprintf(stderr, "Can't read page at position %lld file: '%s' " + "errno: %d\n", opt_offset, opt_file, my_errno); + goto err; + } + printf("Page by offset %llu (0x%llx)\n", opt_offset, opt_offset); + dump_page(buffer, handler); + } + +end: + my_close(handler, MYF(0)); + free_defaults(default_argv); + exit(0); + return 0; /* No compiler warning */ + +err: + my_close(handler, MYF(0)); + fprintf(stderr, "%s: FAILED\n", my_progname_short); + free_defaults(default_argv); + exit(1); +} + +#include "ma_check_standalone.h" + diff --git a/storage/maria/aria_ftdump.c b/storage/maria/aria_ftdump.c new file mode 100644 index 00000000..677d0221 --- /dev/null +++ b/storage/maria/aria_ftdump.c @@ -0,0 +1,280 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code + added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ + +#include "ma_ftdefs.h" +#include <my_getopt.h> + +static void usage(); +static void complain(int val); +static my_bool get_one_option(const struct my_option *, const char *, + const char*); + +static int count=0, stats=0, dump=0, lstats=0; +static my_bool verbose; +static char *query=NULL; +static uint lengths[256]; + +#define MAX_LEN (HA_FT_MAXBYTELEN+10) +#define HOW_OFTEN_TO_WRITE 10000 + +static struct my_option my_long_options[] = +{ + {"help", 'h', "Display help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"help", '?', "Synonym for -h.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"count", 'c', "Calculate per-word stats (counts and global weights).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"dump", 'd', "Dump index (incl. 
data offsets and word weights).", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"length", 'l', "Report length distribution.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"stats", 's', "Report global stats.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Be verbose.", + &verbose, &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +int main(int argc,char *argv[]) +{ + int error=0; + uint keylen, keylen2=0, inx, doc_cnt=0; + float weight= 1.0; + double gws, min_gws=0, avg_gws=0; + MARIA_HA *info; + char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN]; + ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0; + struct { MARIA_HA *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */ + + MY_INIT(argv[0]); + if ((error= handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(error); + maria_init(); + if (count || dump) + verbose=0; + if (!count && !dump && !lstats && !query) + stats=1; + + if (verbose) + setbuf(stdout,NULL); + + if (argc < 2) + usage(); + + { + char *end; + inx= (uint) strtoll(argv[1], &end, 10); + if (*end) + usage(); + } + + init_pagecache(maria_pagecache, PAGE_BUFFER_INIT, 0, 0, + MARIA_KEY_BLOCK_LENGTH, 0, MY_WME); + + if (!(info=maria_open(argv[0], O_RDONLY, + HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER, 0))) + { + error=my_errno; + goto err; + } + + *buf2=0; + aio->info=info; + + if ((inx >= info->s->base.keys) || + !(info->s->keyinfo[inx].flag & HA_FULLTEXT)) + { + printf("Key %d in table %s is not a FULLTEXT key\n", inx, + info->s->open_file_name.str); + goto err; + } + + maria_lock_database(info, F_EXTRA_LCK); + + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->update|= HA_STATE_PREV_FOUND; + + while (!(error=maria_rnext(info,NULL,inx))) + { + FT_WEIGTH subkeys; + keylen=*(info->lastkey_buff); + + subkeys.i= ft_sintXkorr(info->lastkey_buff + keylen + 1); + if (subkeys.i >= 0) + weight= subkeys.f; + + 
snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey_buff+1); + my_casedn_str(default_charset_info,buf); + total++; + lengths[keylen]++; + + if (count || stats) + { + if (strcmp(buf, buf2)) + { + if (*buf2) + { + uniq++; + avg_gws+=gws=GWS_IN_USE; + if (count) + printf("%9u %20.7f %s\n",doc_cnt,gws,buf2); + if (maxlen<keylen2) + { + maxlen=keylen2; + strmov(buf_maxlen, buf2); + } + if (max_doc_cnt < doc_cnt) + { + max_doc_cnt=doc_cnt; + strmov(buf_min_gws, buf2); + min_gws=gws; + } + } + strmov(buf2, buf); + keylen2=keylen; + doc_cnt=0; + } + doc_cnt+= (subkeys.i >= 0 ? 1 : -subkeys.i); + } + if (dump) + { + if (subkeys.i >= 0) + printf("%9lx %20.7f %s\n", (long) info->cur_row.lastpos,weight,buf); + else + printf("%9lx => %17d %s\n",(long) info->cur_row.lastpos,-subkeys.i, + buf); + } + if (verbose && (total%HOW_OFTEN_TO_WRITE)==0) + printf("%10ld\r",total); + } + maria_lock_database(info, F_UNLCK); + + if (count || stats) + { + if (*buf2) + { + uniq++; + avg_gws+=gws=GWS_IN_USE; + if (count) + printf("%9u %20.7f %s\n",doc_cnt,gws,buf2); + if (maxlen<keylen2) + { + maxlen=keylen2; + strmov(buf_maxlen, buf2); + } + if (max_doc_cnt < doc_cnt) + { + max_doc_cnt=doc_cnt; + strmov(buf_min_gws, buf2); + min_gws=gws; + } + } + } + + if (stats) + { + count=0; + for (inx=0;inx<256;inx++) + { + count+=lengths[inx]; + if ((ulong) count >= total/2) + break; + } + printf("Total rows: %lu\nTotal words: %lu\n" + "Unique words: %lu\nLongest word: %lu chars (%s)\n" + "Median length: %u\n" + "Average global weight: %f\n" + "Most common word: %lu times, weight: %f (%s)\n", + (long) info->state->records, total, uniq, maxlen, buf_maxlen, + inx, avg_gws/uniq, max_doc_cnt, min_gws, buf_min_gws); + } + if (lstats) + { + count=0; + for (inx=0; inx<256; inx++) + { + count+=lengths[inx]; + if (count && lengths[inx]) + printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx, + (ulong) lengths[inx],100.0*lengths[inx]/total,(ulong) count, + 100.0*count/total); + } + } + +err: + if (error && error != 
HA_ERR_END_OF_FILE) + printf("got error %d\n",my_errno); + if (info) + maria_close(info); + maria_end(); + return 0; +} + + +static my_bool +get_one_option(const struct my_option *opt, + const char *argument __attribute__((unused)), + const char *filename __attribute__((unused))) +{ + switch(opt->id) { + case 'd': + dump=1; + complain(count || query); + break; + case 's': + stats=1; + complain(query!=0); + break; + case 'c': + count= 1; + complain(dump || query); + break; + case 'l': + lstats=1; + complain(query!=0); + break; + case '?': + case 'h': + usage(); + } + return 0; +} + + +static void usage() +{ + printf("Use: aria_ft_dump <table_name> <index_num>\n"); + my_print_help(my_long_options); + my_print_variables(my_long_options); + exit(1); +} + + +static void complain(int val) /* Kinda assert :-) */ +{ + if (val) + { + printf("You cannot use these options together!\n"); + exit(1); + } +} + +#include "ma_check_standalone.h" + diff --git a/storage/maria/aria_pack.c b/storage/maria/aria_pack.c new file mode 100644 index 00000000..40e7e399 --- /dev/null +++ b/storage/maria/aria_pack.c @@ -0,0 +1,3334 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
/* Pack MARIA file */

#ifndef USE_MY_FUNC
#define USE_MY_FUNC                     /* We need at least my_malloc */
#endif

#include "maria_def.h"
#include "trnman_public.h"
#include "trnman.h"
#include <queues.h>
#include <my_tree.h>
#include "mysys_err.h"
#ifdef MSDOS
#include <io.h>
#endif
#ifndef __GNU_LIBRARY__
#define __GNU_LIBRARY__                 /* Skip warnings in getopt.h */
#endif
#include <my_getopt.h>
#include <my_handler_errors.h>

/* BITS_SAVED is 64 when 'long long' is wider than 4 bytes, otherwise 32 */
#if SIZEOF_LONG_LONG > 4
#define BITS_SAVED 64
#else
#define BITS_SAVED 32
#endif
#ifndef MAX_INTERNAL_TRID
#define MAX_INTERNAL_TRID 0xffffffffffffLL
#endif

#define IS_OFFSET ((uint) 32768)        /* Bit if offset or char in tree */
#define HEAD_LENGTH 32
#define ALLOWED_JOIN_DIFF 256           /* Diff allowed to join trees */

/* File name extensions and progress-report interval */
#define DATA_TMP_EXT ".TMD"
#define OLD_EXT ".OLD"
#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE

/*
  State of a buffered file: the file, the buffer with its current and end
  positions, the position in the file and a bit accumulator.
  NOTE(review): presumably the state used by init_file_buffer(),
  write_bits() and flush_bits() declared below - confirm against their
  definitions, which are outside this part of the file.
*/
struct st_file_buffer {
  File file;
  uchar *buffer,*pos,*end;
  my_off_t pos_in_file;
  int bits;
  ulonglong bitbucket;
};

struct st_huff_tree;
struct st_huff_element;

/*
  Statistics collected for one field.
  NOTE(review): presumably one entry per column, allocated by
  init_huff_count() - confirm.
*/
typedef struct st_huff_counts {
  uint field_length,max_zero_fill;
  uint pack_type;
  uint max_end_space,max_pre_space,length_bits,min_space;
  ulong max_length;
  enum en_fieldtype field_type;
  struct st_huff_tree *tree;            /* Tree for field */
  my_off_t counts[256];
  my_off_t end_space[8];
  my_off_t pre_space[8];
  my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
  TREE int_tree;        /* Tree for detecting distinct column values. */
  uchar *tree_buff;     /* Column values, 'field_length' each. */
  uchar *tree_pos;      /* Points to end of column values in 'tree_buff'. */
} HUFF_COUNTS;

typedef struct st_huff_element HUFF_ELEMENT;

/*
  WARNING: It is crucial for the optimizations in calc_packed_length()
  that 'count' is the first element of 'HUFF_ELEMENT'.

  An element is either an inner node ('nod', with two children) or a leaf
  ('leaf', with an element number).  Both variants share storage through
  the union, so leaf.null overlays nod.left.
*/
struct st_huff_element {
  my_off_t count;
  union un_element {
    struct st_nod {
      HUFF_ELEMENT *left,*right;
    } nod;
    struct st_leaf {
      HUFF_ELEMENT *null;
      uint element_nr;                  /* Number of element */
    } leaf;
  } a;
};


/* One Huffman tree with its elements and the code table built from it */
typedef struct st_huff_tree {
  HUFF_ELEMENT *root,*element_buffer;
  HUFF_COUNTS *counts;
  uint tree_number;
  uint elements;
  my_off_t bytes_packed;
  uint tree_pack_length;
  uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
  ulonglong *code;
  uchar *code_len;
} HUFF_TREE;


/* A set of open source tables that is read as one input */
typedef struct st_isam_mrg {
  MARIA_HA **file,**current,**end;
  uint free_file;
  uint count;
  uint min_pack_length;                 /* These are used by packed data */
  uint max_pack_length;
  uint ref_length;
  uint max_blob_length;
  my_off_t records;
  /* true if at least one source file has at least one disabled index */
  my_bool src_file_has_indexes_disabled;
} PACK_MRG_INFO;


/* Prototypes for the functions defined later in this file */
extern int main(int argc,char * *argv);
static void get_options(int *argc,char ***argv);
static MARIA_HA *open_maria_file(char *name,int mode);
static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
static int compress(PACK_MRG_INFO *file,char *join_name);
static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
                                           uint trees,
                                           HUFF_COUNTS *huff_counts,
                                           uint fields);
static int compare_tree(void* cmp_arg __attribute__((unused)),
                        const uchar *s,const uchar *t);
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
                         my_off_t records);
static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
                               uint max_space_length,my_off_t *space_counts,
                               my_off_t tot_space_count,
                               enum en_fieldtype field_type);
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
static int save_counts_in_queue(uchar *key,element_count count,
                                HUFF_TREE *tree);
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
                                    HUFF_ELEMENT *element,uint size,
                                    ulonglong code);
static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
                        my_off_t tot_elements,my_off_t filelength);
static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
                                   HUFF_ELEMENT *element,
                                   uint *offset);
static uint max_bit(uint value);
static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
static char *make_new_name(char *new_name,char *old_name);
static char *make_old_name(char *new_name,char *old_name);
static void init_file_buffer(File file,pbool read_buffer);
static int flush_buffer(ulong neaded_length);
static void end_file_buffer(void);
static void write_bits(ulonglong value, uint bits);
static void flush_bits(void);
static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
                      my_off_t new_length, ha_checksum crc);
static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
                          my_off_t new_length, ha_checksum crc);
static int mrg_close(PACK_MRG_INFO *mrg);
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
static void mrg_reset(PACK_MRG_INFO *mrg);
/* Only declared in debug builds (DBUG_OFF not defined) */
#if !defined(DBUG_OFF)
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
static int fakecmp(my_off_t **count1, my_off_t **count2);
#endif
+ +/* + tree_buff_length is somewhat arbitrary. The bigger it is the better + the chance to win in terms of compression factor. On the other hand, + this table becomes part of the compressed file header. And its length + is coded with 16 bits in the header. Hence the limit is 2**16 - 1. +*/ +static uint tree_buff_length= 65536 - MALLOC_OVERHEAD; + +static int error_on_write=0,test_only=0,verbose=0,silent=0, + write_loop=0,force_pack=0, isamchk_neaded=0; +static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL; +static my_bool backup, opt_wait; +static my_bool opt_ignore_control_file, opt_require_control_file; +static char tmp_dir[FN_REFLEN]={0},*join_table; +static my_off_t intervall_length; +static ha_checksum glob_crc; +static struct st_file_buffer file_buffer; +static QUEUE queue; +static HUFF_COUNTS *global_count; +static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; +static const char *load_default_groups[]= { "ariapack",0 }; +static char **default_argv; + +/* + Register handler error messages for usage with my_error() + + NOTES + This is safe to call multiple times as my_error_register() + will ignore calls to register already registered error numbers. 
+*/ + +static const char **get_handler_error_messages(int e __attribute__((unused))) +{ + return handler_error_messages; +} + + +/* The main program */ + +int main(int argc, char **argv) +{ + int error,ok; + PACK_MRG_INFO merge; + my_bool no_control_file= 0; + MY_INIT(argv[0]); + + maria_data_root= "."; + load_defaults_or_exit("my", load_default_groups, &argc, &argv); + default_argv= argv; + get_options(&argc,&argv); + my_error_register(get_handler_error_messages, HA_ERR_FIRST, + HA_ERR_FIRST+ array_elements(handler_error_messages)-1); + + if (!opt_ignore_control_file && + (no_control_file= ma_control_file_open(FALSE, + (opt_require_control_file || + !silent), FALSE)) && + opt_require_control_file) + { + error= 1; + goto end; + } + maria_init(); + if (no_control_file || force_pack) + { + /* Assume that all rows exists */ + trnman_init(MAX_INTERNAL_TRID-16); + } + + error=ok=isamchk_neaded=0; + if (join_table) + { /* Join files into one */ + if (open_maria_files(&merge,argv,(uint) argc) || + compress(&merge,join_table)) + error=1; + } + else while (argc--) + { + MARIA_HA *isam_file; + if (!(isam_file=open_maria_file(*argv++,O_RDWR))) + error=1; + else + { + merge.file= &isam_file; + merge.current=0; + merge.free_file=0; + merge.count=1; + if (compress(&merge,0)) + error=1; + else + ok=1; + } + } + if (ok && isamchk_neaded && !silent) + puts("Remember to run aria_chk -rq on compressed tables"); + +end: + fflush(stdout); + fflush(stderr); + free_defaults(default_argv); + my_error_unregister(HA_ERR_FIRST, + HA_ERR_FIRST+ array_elements(handler_error_messages)-1); + maria_end(); + my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR); + exit(error ? 2 : 0); +#ifndef _lint + return 0; /* No compiler warning */ +#endif +} + +static void my_exit(int error) +{ + free_defaults(default_argv); + maria_end(); + my_end(verbose ? 
MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR); + exit(error); +} + +enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE}; + +static struct my_option my_long_options[] = +{ +#ifdef __NETWARE__ + {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"backup", 'b', "Make a backup of the table as table_name.OLD.", + &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"character-sets-dir", OPT_CHARSETS_DIR_MP, + "Directory where character sets are.", (char**) &charsets_dir, + (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"datadir", 'h', + "Path for control file (and logs if --logdir not used).", + (char**) &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG, + 0, 0, 0, 0, 0, 0}, + {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, + {"force", 'f', + "Force packing of table even if it gets bigger or if tempfile exists.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "ignore-control-file", 0, + "Ignore the control file", + (uchar**)&opt_ignore_control_file, 0, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"join", 'j', + "Join all given tables into 'new_table_name'. 
All tables MUST have identical layouts.", + &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, + 0, 0, 0}, + {"help", '?', "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "require-control-file", 0, + "Abort if cannot find control file", + (uchar**)&opt_require_control_file, 0, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"silent", 's', "Be more silent.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"tmpdir", 'T', "Use temporary directory to store temporary table.", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"test", 't', "Don't pack table, only test packing it.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Output version information and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"wait", 'w', "Wait and retry if table is in use.", &opt_wait, + &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static void print_version(void) +{ + printf("%s Ver 1.0 for %s on %s\n", my_progname, SYSTEM_TYPE, MACHINE_TYPE); +} + + +static void usage(void) +{ + print_version(); + puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc."); + puts("This software comes with ABSOLUTELY NO WARRANTY. 
This is free software,"); + puts("and you are welcome to modify and redistribute it under the GPL license\n"); + + puts("Pack a Aria-table to take much less space."); + puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file"); + puts("afterwards to update the keys."); + puts("You should give the .MAI file as the filename argument."); + puts("To unpack a packed table, run aria_chk -u on the table"); + + printf("\nUsage: %s [OPTIONS] filename...\n", my_progname); + my_print_help(my_long_options); + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + + +static my_bool +get_one_option(const struct my_option *opt, + const char *argument, + const char *filename __attribute__((unused))) +{ + uint length; + + switch(opt->id) { +#ifdef __NETWARE__ + case OPT_AUTO_CLOSE: + setscreenmode(SCR_AUTOCLOSE_ON_EXIT); + break; +#endif + case 'f': + force_pack= 1; + tmpfile_createflag= O_RDWR | O_TRUNC; + break; + case 's': + write_loop= verbose= 0; + silent= 1; + break; + case 't': + test_only= 1; + /* Avoid to reset 'verbose' if it was already set > 1. */ + if (! verbose) + verbose= 1; + break; + case 'T': + length= (uint) (strmov(tmp_dir, argument) - tmp_dir); + if (length != dirname_length(tmp_dir)) + { + tmp_dir[length]=FN_LIBCHAR; + tmp_dir[length+1]=0; + } + break; + case 'v': + verbose++; /* Allow for selecting the level of verbosity. */ + silent= 0; + break; + case '#': + DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace"); + break; + case 'V': + print_version(); + my_exit(0); + break; + case 'I': + case '?': + usage(); + my_exit(0); + } + return 0; +} + + /* reads options */ + /* Initiates DEBUG - but no debugging here ! 
*/ + +static void get_options(int *argc,char ***argv) +{ + int ho_error; + + my_progname= argv[0][0]; + if (isatty(fileno(stdout))) + write_loop=1; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + my_exit(ho_error); + + if (!*argc) + { + usage(); + my_exit(1); + } + if (join_table) + { + backup=0; /* Not needed */ + tmp_dir[0]=0; + } + return; +} + + +static void print_error(int error, const char *filename) +{ + switch (error) { + case HA_ERR_CRASHED: + fprintf(stderr, "'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename); + break; + case HA_ERR_NOT_A_TABLE: + fprintf(stderr, "'%s' is not a Aria table",filename); + break; + case HA_ERR_CRASHED_ON_USAGE: + fprintf(stderr, "'%s' is marked as crashed",filename); + break; + case HA_ERR_CRASHED_ON_REPAIR: + fprintf(stderr, "'%s' is marked as crashed after last repair",filename); + break; + case HA_ERR_OLD_FILE: + fprintf(stderr, "'%s' has transactions newer than registered in control file. If this is ok, please re-run with --ignore-control-file", filename); + break; + case HA_ERR_NEW_FILE: + fprintf(stderr, "'%s' uses new features not supported by this version of the Aria library", filename); + break; + case HA_ERR_END_OF_FILE: + fprintf(stderr, "Couldn't read complete header from '%s'", filename); + break; + case EAGAIN: + fprintf(stderr, "'%s' is locked. Use -w to wait until unlocked",filename); + break; + case ENOENT: + fprintf(stderr, "File '%s' doesn't exist",filename); + break; + case EACCES: + fprintf(stderr, "You don't have permission to use '%s'", filename); + break; + default: + fprintf(stderr, "%d when opening Aria table '%s'", error, filename); + break; + } + fputc('\n',stderr); +} + + +static MARIA_HA *open_maria_file(char *name,int mode) +{ + MARIA_HA *isam_file; + MARIA_SHARE *share; + DBUG_ENTER("open_maria_file"); + + if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE | + (opt_wait ? 
HA_OPEN_WAIT_IF_LOCKED : + HA_OPEN_ABORT_IF_LOCKED), 0))) + { + print_error(my_errno, name); + DBUG_RETURN(0); + } + share=isam_file->s; + if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table) + { + if (!force_pack) + { + fprintf(stderr, "%s is already compressed\n", name); + maria_close(isam_file); + DBUG_RETURN(0); + } + if (verbose) + puts("Recompressing already compressed table"); + share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifying it */ + } + if (! force_pack && share->state.state.records != 0 && + (share->state.state.records <= 1 || + share->state.state.data_file_length < 1024)) + { + fprintf(stderr, "%s is too small to compress\n", name); + maria_close(isam_file); + DBUG_RETURN(0); + } + maria_lock_database(isam_file,F_WRLCK); + maria_ignore_trids(isam_file); + DBUG_RETURN(isam_file); +} + + +static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count) +{ + uint i,j; + mrg->count=0; + mrg->current=0; + mrg->file=(MARIA_HA**) my_malloc(PSI_NOT_INSTRUMENTED, sizeof(MARIA_HA*)*count,MYF(MY_FAE)); + mrg->free_file=1; + mrg->src_file_has_indexes_disabled= 0; + for (i=0; i < count ; i++) + { + if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY))) + goto error; + + mrg->src_file_has_indexes_disabled|= + ! 
maria_is_all_keys_active(mrg->file[i]->s->state.key_map, + mrg->file[i]->s->base.keys); + } + /* Check that files are identical */ + for (j=0 ; j < count-1 ; j++) + { + MARIA_COLUMNDEF *m1,*m2,*end; + if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength || + mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields) + goto diff_file; + m1=mrg->file[j]->s->columndef; + end=m1+mrg->file[j]->s->base.fields; + m2=mrg->file[j+1]->s->columndef; + for ( ; m1 != end ; m1++,m2++) + { + if (m1->type != m2->type || m1->length != m2->length) + goto diff_file; + } + } + mrg->count=count; + return 0; + + diff_file: + fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n", + my_progname, names[j], names[j+1]); + error: + while (i--) + maria_close(mrg->file[i]); + my_free(mrg->file); + return 1; +} + + +static int compress(PACK_MRG_INFO *mrg,char *result_table) +{ + int error; + File new_file,join_maria_file; + MARIA_HA *isam_file; + MARIA_SHARE *share; + char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN]; + uint i,header_length,fields,trees,used_trees; + my_off_t old_length,new_length,tot_elements; + HUFF_COUNTS *huff_counts; + HUFF_TREE *huff_trees; + DBUG_ENTER("compress"); + + isam_file=mrg->file[0]; /* Take this as an example */ + share=isam_file->s; + new_file=join_maria_file= -1; + trees=fields=0; + huff_trees=0; + huff_counts=0; + maria_block_size= isam_file->s->block_size; + + /* Create temporary or join file */ + if (backup) + fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2); + else + fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2+4+16); + + if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0, + maria_block_size, 0, MY_WME) == 0) + { + fprintf(stderr, "Can't initialize page cache\n"); + goto err; + } + + if (!test_only && result_table) + { + /* Make a new indexfile based on first file in list */ + uint length; + uchar *buff; + 
strmov(org_name,result_table); /* Fix error messages */ + fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2); + if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) + < 0) + goto err; + length=(uint) share->base.keystart; + if (!(buff= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, length, MYF(MY_WME)))) + goto err; + if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) || + my_write(join_maria_file,buff,length, + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL))) + { + my_free(buff); + goto err; + } + my_free(buff); + fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2); + } + else if (!tmp_dir[0]) + make_new_name(new_name,org_name); + else + fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4); + if (!test_only && + (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0) + goto err; + + /* Start calculating statistics */ + + mrg->records=0; + for (i=0 ; i < mrg->count ; i++) + mrg->records+=mrg->file[i]->s->state.state.records; + + DBUG_PRINT("info", ("Compressing %s: (%lu records)", + result_table ? new_name : org_name, + (ulong) mrg->records)); + if (write_loop || verbose) + { + printf("Compressing %s: (%lu records)\n", + result_table ? new_name : org_name, (ulong) mrg->records); + } + trees=fields=share->base.fields; + huff_counts=init_huff_count(isam_file,mrg->records); + + /* + Read the whole data file(s) for statistics. + */ + DBUG_PRINT("info", ("- Calculating statistics")); + if (write_loop || verbose) + printf("- Calculating statistics\n"); + if (get_statistic(mrg,huff_counts)) + goto err; + + old_length=0; + for (i=0; i < mrg->count ; i++) + old_length+= (mrg->file[i]->s->state.state.data_file_length - + mrg->file[i]->s->state.state.empty); + + /* + Create a global priority queue in preparation for making + temporary Huffman trees. 
+ */ + if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0)) + goto err; + + /* + Check each column if we should use pre-space-compress, end-space- + compress, empty-field-compress or zero-field-compress. + */ + check_counts(huff_counts,fields,mrg->records); + + /* + Build a Huffman tree for each column. + */ + huff_trees=make_huff_trees(huff_counts,trees); + + /* + If the packed lengths of combined columns is less than the sum of + the non-combined columns, then create common Huffman trees for them. + We do this only for uchar compressed columns, not for distinct values + compressed columns. + */ + if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0) + goto err; + + /* + Assign codes to all uchar or column values. + */ + if (make_huff_decode_table(huff_trees,fields)) + goto err; + + /* Prepare a file buffer. */ + init_file_buffer(new_file,0); + + /* + Reserve space in the target file for the fixed compressed file header. + */ + file_buffer.pos_in_file=HEAD_LENGTH; + if (! test_only) + my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)); + + /* + Write field infos: field type, pack type, length bits, tree number. + */ + write_field_info(huff_counts,fields,used_trees); + + /* + Write decode trees. + */ + if (!(tot_elements=write_huff_tree(huff_trees,trees))) + goto err; + + /* + Calculate the total length of the compression info header. + This includes the fixed compressed file header, the column compression + type descriptions, and the decode trees. + */ + header_length=(uint) file_buffer.pos_in_file+ + (uint) (file_buffer.pos-file_buffer.buffer); + + /* + Compress the source file into the target file. 
+ */ + DBUG_PRINT("info", ("- Compressing file")); + if (write_loop || verbose) + printf("- Compressing file\n"); + error=compress_maria_file(mrg,huff_counts); + new_length=file_buffer.pos_in_file; + if (!error && !test_only) + { + uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */ + bzero(buff,sizeof(buff)); + error=my_write(file_buffer.file,buff,sizeof(buff), + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0; + } + + /* + Write the fixed compressed file header. + */ + if (!error) + error=write_header(mrg,header_length,used_trees,tot_elements, + new_length); + + /* Flush the file buffer. */ + end_file_buffer(); + + /* Display statistics. */ + DBUG_PRINT("info", ("Min record length: %6d Max length: %6d " + "Mean total length: %6ld", + mrg->min_pack_length, mrg->max_pack_length, + (ulong) (mrg->records ? (new_length/mrg->records) : 0))); + if (verbose && mrg->records) + printf("Min record length: %6d Max length: %6d " + "Mean total length: %6ld\n", mrg->min_pack_length, + mrg->max_pack_length, (ulong) (new_length/mrg->records)); + + /* Close source and target file. */ + if (!test_only) + { + error|=my_close(new_file,MYF(MY_WME)); + if (!result_table) + { + (void) flush_pagecache_blocks(isam_file->s->pagecache, &isam_file->dfile, + FLUSH_RELEASE); + error|=my_close(isam_file->dfile.file, MYF(MY_WME)); + isam_file->dfile.file= -1; /* Tell maria_close file is closed */ + isam_file->s->bitmap.file.file= -1; + } + } + + /* Cleanup. */ + free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields); + if (! test_only && ! 
error) + { + if (result_table) + { + error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc); + } + else + { + if (backup) + { + if (my_rename(org_name,make_old_name(temp_name, + isam_file->s->open_file_name.str), + MYF(MY_WME))) + error=1; + else + { + if (tmp_dir[0]) + error=my_copy(new_name,org_name,MYF(MY_WME)); + else + error=my_rename(new_name,org_name,MYF(MY_WME)); + if (!error) + { + my_copystat(temp_name,org_name,MYF(MY_COPYTIME)); + if (tmp_dir[0]) + my_delete(new_name,MYF(MY_WME)); + } + } + } + else + { + if (tmp_dir[0]) + { + error=my_copy(new_name,org_name, + MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME)); + if (!error) + my_delete(new_name,MYF(MY_WME)); + } + else + error=my_redel(org_name, new_name, 0, MYF(MY_WME | MY_COPYTIME)); + } + if (! error) + error=save_state(isam_file,mrg,new_length,glob_crc); + } + } + error|=mrg_close(mrg); + if (join_maria_file >= 0) + error|=my_close(join_maria_file,MYF(MY_WME)); + if (error) + { + fprintf(stderr, "Aborting: %s is not compressed\n", org_name); + my_delete(new_name,MYF(MY_WME)); + DBUG_RETURN(-1); + } + if (write_loop || verbose) + { + if (old_length) + printf("%.4g%% \n", + (((longlong) (old_length - new_length)) * 100.0 / + (longlong) old_length)); + else + puts("Empty file saved in compressed format"); + } + DBUG_RETURN(0); + + err: + free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields); + if (new_file >= 0) + my_close(new_file,MYF(0)); + if (join_maria_file >= 0) + my_close(join_maria_file,MYF(0)); + mrg_close(mrg); + end_pagecache(maria_pagecache, 1); + fprintf(stderr, "Aborted: %s is not compressed\n", org_name); + DBUG_RETURN(-1); +} + + /* Init a huff_count-struct for each field and init it */ + +static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records) +{ + reg2 uint i; + reg1 HUFF_COUNTS *count; + if ((count = (HUFF_COUNTS*) my_malloc(PSI_NOT_INSTRUMENTED, + info->s->base.fields*sizeof(HUFF_COUNTS), MYF(MY_ZEROFILL | MY_WME)))) + { + for (i=0 ; i < 
info->s->base.fields ; i++) + { + enum en_fieldtype type; + uint col_nr = info->s->columndef[i].column_nr; + count[col_nr].field_length=info->s->columndef[i].length; + type= count[col_nr].field_type= + (enum en_fieldtype) info->s->columndef[i].type; + if (type == FIELD_INTERVALL || + type == FIELD_CONSTANT || + type == FIELD_ZERO) + type = FIELD_NORMAL; + if (count[col_nr].field_length <= 8 && + (type == FIELD_NORMAL || + type == FIELD_SKIP_ZERO)) + count[col_nr].max_zero_fill= count[col_nr].field_length; + /* + For every column initialize a tree, which is used to detect distinct + column values. 'int_tree' works together with 'tree_buff' and + 'tree_pos'. Its keys are implemented by pointers into 'tree_buff'. + This is accomplished by '-1' as the element size. + */ + init_tree(&count[col_nr].int_tree,0,0,-1,(qsort_cmp2) compare_tree, NULL, + NULL, MYF(0)); + if (records && type != FIELD_BLOB && type != FIELD_VARCHAR) + count[col_nr].tree_pos=count[col_nr].tree_buff = + my_malloc(PSI_NOT_INSTRUMENTED, + count[col_nr].field_length > 1 ? 
tree_buff_length : 2, + MYF(MY_WME)); + } + } + return count; +} + + + /* Free memory used by counts and trees */ + +static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees, + HUFF_COUNTS *huff_counts, + uint fields) +{ + register uint i; + + if (huff_trees) + { + for (i=0 ; i < trees ; i++) + { + if (huff_trees[i].element_buffer) + my_free(huff_trees[i].element_buffer); + if (huff_trees[i].code) + my_free(huff_trees[i].code); + } + my_free(huff_trees); + } + if (huff_counts) + { + for (i=0 ; i < fields ; i++) + { + if (huff_counts[i].tree_buff) + { + my_free(huff_counts[i].tree_buff); + delete_tree(&huff_counts[i].int_tree, 0); + } + } + my_free(huff_counts); + } + delete_queue(&queue); /* This is safe to free */ + return; +} + + /* Read through old file and gather some statistics */ + +static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts) +{ + int error; + uint length, null_bytes; + ulong reclength,max_blob_length; + uchar *record,*pos,*next_pos,*end_pos,*start_pos; + ha_rows record_count; + HUFF_COUNTS *count,*end_count; + TREE_ELEMENT *element; + ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *); + DBUG_ENTER("get_statistic"); + + reclength= mrg->file[0]->s->base.reclength; + null_bytes= mrg->file[0]->s->base.null_bytes; + record=(uchar*) my_safe_alloca(reclength); + end_count=huff_counts+mrg->file[0]->s->base.fields; + record_count=0; glob_crc=0; + max_blob_length=0; + + /* Check how to calculate checksum */ + if (mrg->file[0]->s->data_file_type == STATIC_RECORD) + calc_checksum= _ma_static_checksum; + else + calc_checksum= _ma_checksum; + + mrg_reset(mrg); + while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE) + { + ulong tot_blob_length=0; + if (! error) + { + /* glob_crc is a checksum over all bytes of all records. */ + glob_crc+= (*calc_checksum)(mrg->file[0],record); + + /* Count the incidence of values separately for every column. 
*/ + for (pos=record + null_bytes, count=huff_counts ; + count < end_count ; + count++, + pos=next_pos) + { + next_pos=end_pos=(start_pos=pos)+count->field_length; + + /* + Put the whole column value in a tree if there is room for it. + 'int_tree' is used to quickly check for duplicate values. + 'tree_buff' collects as many distinct column values as + possible. If the field length is > 1, it is tree_buff_length, + else 2 bytes. Each value is 'field_length' bytes big. If there + are more distinct column values than fit into the buffer, we + give up with this tree. BLOBs and VARCHARs do not have a + tree_buff as it can only be used with fixed length columns. + For the special case of field length == 1, we handle only the + case that there is only one distinct value in the table(s). + Otherwise, we can have a maximum of 256 distinct values. This + is then handled by the normal Huffman tree build. + + Another limit for collecting distinct column values is the + number of values itself. Since we would need to build a + Huffman tree for the values, we are limited by the 'IS_OFFSET' + constant. This constant expresses a bit which is used to + determine if a tree element holds a final value or an offset + to a child element. Hence, all values and offsets need to be + smaller than 'IS_OFFSET'. A tree element is implemented with + two integer values, one for the left branch and one for the + right branch. For the extreme case that the first element + points to the last element, the number of integers in the tree + must be less or equal to IS_OFFSET. So the number of elements + must be less or equal to IS_OFFSET / 2. + + WARNING: At first, we insert a pointer into the record buffer + as the key for the tree. If we got a new distinct value, which + is really inserted into the tree, instead of being counted + only, we will copy the column value from the record buffer to + 'tree_buff' and adjust the key pointer of the tree accordingly. 
+ */ + if (count->tree_buff) + { + global_count=count; + if (!(element=tree_insert(&count->int_tree,pos, 0, + count->int_tree.custom_arg)) || + (element->count == 1 && + (count->tree_buff + tree_buff_length < + count->tree_pos + count->field_length)) || + (count->int_tree.elements_in_tree > IS_OFFSET / 2) || + (count->field_length == 1 && + count->int_tree.elements_in_tree > 1)) + { + delete_tree(&count->int_tree, 0); + my_free(count->tree_buff); + count->tree_buff=0; + } + else + { + /* + If tree_insert() succeeds, it either creates a new element + or increments the counter of an existing element. + */ + if (element->count == 1) + { + /* Copy the new column value into 'tree_buff'. */ + memcpy(count->tree_pos,pos,(size_t) count->field_length); + /* Adjust the key pointer in the tree. */ + tree_set_pointer(element,count->tree_pos); + /* Point behind the last column value so far. */ + count->tree_pos+=count->field_length; + } + } + } + + /* Save character counters and space-counts and zero-field-counts */ + if (count->field_type == FIELD_NORMAL || + count->field_type == FIELD_SKIP_ENDSPACE) + { + /* Ignore trailing space. */ + for ( ; end_pos > pos ; end_pos--) + if (end_pos[-1] != ' ') + break; + /* Empty fields are just counted. Go to the next record. */ + if (end_pos == pos) + { + count->empty_fields++; + count->max_zero_fill=0; + continue; + } + /* + Count the total of all trailing spaces and the number of + short trailing spaces. Remember the longest trailing space. + */ + length= (uint) (next_pos-end_pos); + count->tot_end_space+=length; + if (length < 8) + count->end_space[length]++; + if (count->max_end_space < length) + count->max_end_space = length; + } + + if (count->field_type == FIELD_NORMAL || + count->field_type == FIELD_SKIP_PRESPACE) + { + /* Ignore leading space. */ + for (pos=start_pos; pos < end_pos ; pos++) + if (pos[0] != ' ') + break; + /* Empty fields are just counted. Go to the next record. 
*/ + if (end_pos == pos) + { + count->empty_fields++; + count->max_zero_fill=0; + continue; + } + /* + Count the total of all leading spaces and the number of + short leading spaces. Remember the longest leading space. + */ + length= (uint) (pos-start_pos); + count->tot_pre_space+=length; + if (length < 8) + count->pre_space[length]++; + if (count->max_pre_space < length) + count->max_pre_space = length; + } + + /* Calculate pos, end_pos, and max_length for variable length fields. */ + if (count->field_type == FIELD_BLOB) + { + uint field_length=count->field_length -portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(field_length, start_pos); + memcpy(&pos, start_pos+field_length,sizeof(char*)); + end_pos=pos+blob_length; + tot_blob_length+=blob_length; + set_if_bigger(count->max_length,blob_length); + } + else if (count->field_type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1); + length= (pack_length == 1 ? (uint) *(uchar*) start_pos : + uint2korr(start_pos)); + pos= start_pos+pack_length; + end_pos= pos+length; + set_if_bigger(count->max_length,length); + } + + /* Evaluate 'max_zero_fill' for short fields. */ + if (count->field_length <= 8 && + (count->field_type == FIELD_NORMAL || + count->field_type == FIELD_SKIP_ZERO)) + { + uint i; + /* Zero fields are just counted. Go to the next record. */ + if (!memcmp(start_pos, zero_string, count->field_length)) + { + count->zero_fields++; + continue; + } + /* + max_zero_fill starts with field_length. It is decreased every + time a shorter "zero trailer" is found. It is set to zero when + an empty field is found (see above). This suggests that the + variable should be called 'min_zero_fill'. + */ + for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ; + i++) ; + if (i < count->max_zero_fill) + count->max_zero_fill=i; + } + + /* Ignore zero fields and check fields. 
*/ + if (count->field_type == FIELD_ZERO || + count->field_type == FIELD_CHECK) + continue; + + /* + Count the incidence of every uchar value in the + significant field value. + */ + for ( ; pos < end_pos ; pos++) + count->counts[(uchar) *pos]++; + + /* Step to next field. */ + } + + if (tot_blob_length > max_blob_length) + max_blob_length=tot_blob_length; + record_count++; + if (write_loop && record_count % WRITE_COUNT == 0) + { + printf("%lu\r", (ulong) record_count); + fflush(stdout); + } + } + else if (error != HA_ERR_RECORD_DELETED) + { + fprintf(stderr, "Got error %d while reading rows\n", error); + break; + } + + /* Step to next record. */ + } + if (write_loop) + { + printf(" \r"); + fflush(stdout); + } + + /* + If --debug=d,fakebigcodes is set, fake the counts to get big Huffman + codes. + */ + DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count);); + + DBUG_PRINT("info", ("Found the following number of incidents " + "of the uchar codes:")); + if (verbose >= 2) + printf("Found the following number of incidents " + "of the uchar codes:\n"); + for (count= huff_counts ; count < end_count; count++) + { + uint idx; + my_off_t total_count; + char llbuf[32]; + + DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1))); + if (verbose >= 2) + printf("column: %3u\n", (uint) (count - huff_counts + 1)); + if (count->tree_buff) + { + DBUG_PRINT("info", ("number of distinct values: %u", + (uint) ((count->tree_pos - count->tree_buff) / + count->field_length))); + if (verbose >= 2) + printf("number of distinct values: %u\n", + (uint) ((count->tree_pos - count->tree_buff) / + count->field_length)); + } + total_count= 0; + for (idx= 0; idx < 256; idx++) + { + if (count->counts[idx]) + { + total_count+= count->counts[idx]; + DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx, + llstr((longlong) count->counts[idx], llbuf))); + if (verbose >= 2) + printf("counts[0x%02x]: %12s\n", idx, + llstr((longlong) count->counts[idx], llbuf)); + } + } + 
DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count, + llbuf))); + if ((verbose >= 2) && total_count) + { + printf("total: %12s\n", + llstr((longlong) total_count, llbuf)); + } + } + + mrg->records=record_count; + mrg->max_blob_length=max_blob_length; + my_safe_afree(record, reclength); + DBUG_RETURN(error != HA_ERR_END_OF_FILE); +} + +static int compare_huff_elements(void *not_used __attribute__((unused)), + uchar *a, uchar *b) +{ + return *((my_off_t*) a) < *((my_off_t*) b) ? -1 : + (*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1); +} + + /* Check each tree if we should use pre-space-compress, end-space- + compress, empty-field-compress or zero-field-compress */ + +static void check_counts(HUFF_COUNTS *huff_counts, uint trees, + my_off_t records) +{ + uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count]; + my_off_t old_length,new_length,length; + DBUG_ENTER("check_counts"); + + bzero((uchar*) field_count,sizeof(field_count)); + space_fields=fill_zero_fields=0; + + for (; trees-- ; huff_counts++) + { + if (huff_counts->field_type == FIELD_BLOB) + { + huff_counts->length_bits=max_bit(huff_counts->max_length); + goto found_pack; + } + else if (huff_counts->field_type == FIELD_VARCHAR) + { + huff_counts->length_bits=max_bit(huff_counts->max_length); + goto found_pack; + } + else if (huff_counts->field_type == FIELD_CHECK) + { + huff_counts->bytes_packed=0; + huff_counts->counts[0]=0; + goto found_pack; + } + + huff_counts->field_type=FIELD_NORMAL; + huff_counts->pack_type=0; + + /* Check for zero-filled records (in this column), or zero records. */ + if (huff_counts->zero_fields || ! records) + { + my_off_t old_space_count; + /* + If there are only zero filled records (in this column), + or no records at all, we are done. 
+ */ + if (huff_counts->zero_fields == records) + { + huff_counts->field_type= FIELD_ZERO; + huff_counts->bytes_packed=0; + huff_counts->counts[0]=0; + goto found_pack; + } + /* Remeber the number of significant spaces. */ + old_space_count=huff_counts->counts[' ']; + /* Add all leading and trailing spaces. */ + huff_counts->counts[' ']+= (huff_counts->tot_end_space + + huff_counts->tot_pre_space + + huff_counts->empty_fields * + huff_counts->field_length); + /* Check, what the compressed length of this would be. */ + old_length=calc_packed_length(huff_counts,0)+records/8; + /* Get the number of zero bytes. */ + length=huff_counts->zero_fields*huff_counts->field_length; + /* Add it to the counts. */ + huff_counts->counts[0]+=length; + /* Check, what the compressed length of this would be. */ + new_length=calc_packed_length(huff_counts,0); + /* If the compression without the zeroes would be shorter, we are done. */ + if (old_length < new_length && huff_counts->field_length > 1) + { + huff_counts->field_type=FIELD_SKIP_ZERO; + huff_counts->counts[0]-=length; + huff_counts->bytes_packed=old_length- records/8; + goto found_pack; + } + /* Remove the insignificant spaces, but keep the zeroes. */ + huff_counts->counts[' ']=old_space_count; + } + /* Check, what the compressed length of this column would be. */ + huff_counts->bytes_packed=calc_packed_length(huff_counts,0); + + /* + If there are enough empty records (in this column), + treating them specially may pay off. + */ + if (huff_counts->empty_fields) + { + if (huff_counts->field_length > 2 && + huff_counts->empty_fields + (records - huff_counts->empty_fields)* + (1+max_bit(MY_MAX(huff_counts->max_pre_space, + huff_counts->max_end_space))) < + records * max_bit(huff_counts->field_length)) + { + huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS; + } + else + { + length=huff_counts->empty_fields*huff_counts->field_length; + if (huff_counts->tot_end_space || ! 
huff_counts->tot_pre_space) + { + huff_counts->tot_end_space+=length; + huff_counts->max_end_space=huff_counts->field_length; + if (huff_counts->field_length < 8) + huff_counts->end_space[huff_counts->field_length]+= + huff_counts->empty_fields; + } + if (huff_counts->tot_pre_space) + { + huff_counts->tot_pre_space+=length; + huff_counts->max_pre_space=huff_counts->field_length; + if (huff_counts->field_length < 8) + huff_counts->pre_space[huff_counts->field_length]+= + huff_counts->empty_fields; + } + } + } + + /* + If there are enough trailing spaces (in this column), + treating them specially may pay off. + */ + if (huff_counts->tot_end_space) + { + huff_counts->counts[' ']+=huff_counts->tot_pre_space; + if (test_space_compress(huff_counts,records,huff_counts->max_end_space, + huff_counts->end_space, + huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE)) + goto found_pack; + huff_counts->counts[' ']-=huff_counts->tot_pre_space; + } + + /* + If there are enough leading spaces (in this column), + treating them specially may pay off. + */ + if (huff_counts->tot_pre_space) + { + if (test_space_compress(huff_counts,records,huff_counts->max_pre_space, + huff_counts->pre_space, + huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE)) + goto found_pack; + } + + found_pack: /* Found field-packing */ + + /* Test if we can use zero-fill */ + + if (huff_counts->max_zero_fill && + (huff_counts->field_type == FIELD_NORMAL || + huff_counts->field_type == FIELD_SKIP_ZERO)) + { + huff_counts->counts[0]-=huff_counts->max_zero_fill* + (huff_counts->field_type == FIELD_SKIP_ZERO ? 
+ records - huff_counts->zero_fields : records); + huff_counts->pack_type|=PACK_TYPE_ZERO_FILL; + huff_counts->bytes_packed=calc_packed_length(huff_counts,0); + } + + /* Test if intervall-field is better */ + + if (huff_counts->tree_buff) + { + HUFF_TREE tree; + + DBUG_EXECUTE_IF("forceintervall", + huff_counts->bytes_packed= ~ (my_off_t) 0;); + tree.element_buffer=0; + if (!make_huff_tree(&tree,huff_counts) && + tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed) + { + if (tree.elements == 1) + huff_counts->field_type=FIELD_CONSTANT; + else + huff_counts->field_type=FIELD_INTERVALL; + huff_counts->pack_type=0; + } + else + { + my_free(huff_counts->tree_buff); + delete_tree(&huff_counts->int_tree, 0); + huff_counts->tree_buff=0; + } + if (tree.element_buffer) + my_free(tree.element_buffer); + } + if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS) + space_fields++; + if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL) + fill_zero_fields++; + field_count[huff_counts->field_type]++; + } + DBUG_PRINT("info", ("normal: %3d empty-space: %3d " + "empty-zero: %3d empty-fill: %3d", + field_count[FIELD_NORMAL],space_fields, + field_count[FIELD_SKIP_ZERO],fill_zero_fields)); + DBUG_PRINT("info", ("pre-space: %3d end-space: %3d " + "intervall-fields: %3d zero: %3d", + field_count[FIELD_SKIP_PRESPACE], + field_count[FIELD_SKIP_ENDSPACE], + field_count[FIELD_INTERVALL], + field_count[FIELD_ZERO])); + if (verbose) + printf("\nnormal: %3d empty-space: %3d " + "empty-zero: %3d empty-fill: %3d\n" + "pre-space: %3d end-space: %3d " + "intervall-fields: %3d zero: %3d\n", + field_count[FIELD_NORMAL],space_fields, + field_count[FIELD_SKIP_ZERO],fill_zero_fields, + field_count[FIELD_SKIP_PRESPACE], + field_count[FIELD_SKIP_ENDSPACE], + field_count[FIELD_INTERVALL], + field_count[FIELD_ZERO]); + DBUG_VOID_RETURN; +} + + +/* Test if we can use space-compression and empty-field-compression */ + +static int +test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records, + 
uint max_space_length, my_off_t *space_counts, + my_off_t tot_space_count, enum en_fieldtype field_type) +{ + int min_pos; + uint length_bits,i; + my_off_t space_count,min_space_count,min_pack,new_length,skip; + + length_bits=max_bit(max_space_length); + + /* Default no end_space-packing */ + space_count=huff_counts->counts[(uint) ' ']; + min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count); + min_pack=calc_packed_length(huff_counts,0); + min_pos= -2; + huff_counts->counts[(uint) ' ']=space_count; + + /* Test with allways space-count */ + new_length=huff_counts->bytes_packed+length_bits*records/8; + if (new_length+1 < min_pack) + { + min_pos= -1; + min_pack=new_length; + min_space_count=space_count; + } + /* Test with length-flag */ + for (skip=0L, i=0 ; i < 8 ; i++) + { + if (space_counts[i]) + { + if (i) + huff_counts->counts[(uint) ' ']+=space_counts[i]; + skip+=huff_counts->pre_space[i]; + new_length=calc_packed_length(huff_counts,0)+ + (records+(records-skip)*(1+length_bits))/8; + if (new_length < min_pack) + { + min_pos=(int) i; + min_pack=new_length; + min_space_count=huff_counts->counts[(uint) ' ']; + } + } + } + + huff_counts->counts[(uint) ' ']=min_space_count; + huff_counts->bytes_packed=min_pack; + switch (min_pos) { + case -2: + return(0); /* No space-compress */ + case -1: /* Always space-count */ + huff_counts->field_type=field_type; + huff_counts->min_space=0; + huff_counts->length_bits=max_bit(max_space_length); + break; + default: + huff_counts->field_type=field_type; + huff_counts->min_space=(uint) min_pos; + huff_counts->pack_type|=PACK_TYPE_SELECTED; + huff_counts->length_bits=max_bit(max_space_length); + break; + } + return(1); /* Using space-compress */ +} + + + /* Make a huff_tree of each huff_count */ + +static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees) +{ + uint tree; + HUFF_TREE *huff_tree; + DBUG_ENTER("make_huff_trees"); + + if (!(huff_tree=(HUFF_TREE*) my_malloc(PSI_NOT_INSTRUMENTED, + 
trees*sizeof(HUFF_TREE), MYF(MY_WME | MY_ZEROFILL)))) + DBUG_RETURN(0); + + for (tree=0 ; tree < trees ; tree++) + { + if (make_huff_tree(huff_tree+tree,huff_counts+tree)) + { + while (tree--) + my_free(huff_tree[tree].element_buffer); + my_free(huff_tree); + DBUG_RETURN(0); + } + } + DBUG_RETURN(huff_tree); +} + +/* + Build a Huffman tree. + + SYNOPSIS + make_huff_tree() + huff_tree The Huffman tree. + huff_counts The counts. + + DESCRIPTION + Build a Huffman tree according to huff_counts->counts or + huff_counts->tree_buff. tree_buff, if non-NULL contains up to + tree_buff_length of distinct column values. In that case, whole + values can be Huffman encoded instead of single bytes. + + RETURN + 0 OK + != 0 Error +*/ + +static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts) +{ + uint i,found,bits_packed,first,last; + my_off_t bytes_packed; + HUFF_ELEMENT *a,*b,*new_huff_el; + + first=last=0; + if (huff_counts->tree_buff) + { + /* Calculate the number of distinct values in tree_buff. */ + found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) / + huff_counts->field_length; + first=0; last=found-1; + } + else + { + /* Count the number of uchar codes found in the column. */ + for (i=found=0 ; i < 256 ; i++) + { + if (huff_counts->counts[i]) + { + if (! found++) + first=i; + last=i; + } + } + if (found < 2) + found=2; + } + + /* When using 'tree_buff' we can have more that 256 values. */ + if (queue.max_elements < found) + { + delete_queue(&queue); + if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0)) + return -1; + } + + /* Allocate or reallocate an element buffer for the Huffman tree. 
*/ + if (!huff_tree->element_buffer) + { + if (!(huff_tree->element_buffer= + (HUFF_ELEMENT*) my_malloc(PSI_NOT_INSTRUMENTED, + found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME)))) + return 1; + } + else + { + HUFF_ELEMENT *temp; + if (!(temp= (HUFF_ELEMENT*) my_realloc(PSI_NOT_INSTRUMENTED, + (uchar*) huff_tree->element_buffer, found*2*sizeof(HUFF_ELEMENT), MYF(MY_WME)))) + return 1; + huff_tree->element_buffer=temp; + } + + huff_counts->tree=huff_tree; + huff_tree->counts=huff_counts; + huff_tree->min_chr=first; + huff_tree->max_chr=last; + huff_tree->char_bits=max_bit(last-first); + huff_tree->offset_bits=max_bit(found-1)+1; + + if (huff_counts->tree_buff) + { + huff_tree->elements=0; + huff_tree->tree_pack_length=(1+15+16+5+5+ + (huff_tree->char_bits+1)*found+ + (huff_tree->offset_bits+1)* + (found-2)+7)/8 + + (uint) (huff_tree->counts->tree_pos- + huff_tree->counts->tree_buff); + /* + Put a HUFF_ELEMENT into the queue for every distinct column value. + + tree_walk() calls save_counts_in_queue() for every element in + 'int_tree'. This takes elements from the target trees element + buffer and places references to them into the buffer of the + priority queue. We insert in column value order, but the order is + in fact irrelevant here. We will establish the correct order + later. + */ + tree_walk(&huff_counts->int_tree, + (int (*)(void*, element_count,void*)) save_counts_in_queue, + (uchar*) huff_tree, left_root_right); + } + else + { + huff_tree->elements=found; + huff_tree->tree_pack_length=(9+9+5+5+ + (huff_tree->char_bits+1)*found+ + (huff_tree->offset_bits+1)* + (found-2)+7)/8; + /* + Put a HUFF_ELEMENT into the queue for every uchar code found in the column. + + The elements are taken from the target trees element buffer. + Instead of using queue_insert(), we just place references to the + elements into the buffer of the priority queue. We insert in byte + value order, but the order is in fact irrelevant here. We will + establish the correct order later. 
+ */ + for (i=first, found=0 ; i <= last ; i++) + { + if (huff_counts->counts[i]) + { + new_huff_el=huff_tree->element_buffer+(found++); + new_huff_el->count=huff_counts->counts[i]; + new_huff_el->a.leaf.null=0; + new_huff_el->a.leaf.element_nr=i; + queue.root[found]=(uchar*) new_huff_el; + } + } + /* + If there is only a single uchar value in this field in all records, + add a second element with zero incidence. This is required to enter + the loop, which builds the Huffman tree. + */ + while (found < 2) + { + new_huff_el=huff_tree->element_buffer+(found++); + new_huff_el->count=0; + new_huff_el->a.leaf.null=0; + if (last) + new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1; + else + new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1; + queue.root[found]=(uchar*) new_huff_el; + } + } + + /* Make a queue from the queue buffer. */ + queue.elements=found; + + /* + Make a priority queue from the queue. Construct its index so that we + have a partially ordered tree. + */ + queue_fix(&queue); + + /* The Huffman algorithm. */ + bytes_packed=0; bits_packed=0; + for (i=1 ; i < found ; i++) + { + /* + Pop the top element from the queue (the one with the least incidence). + Popping from a priority queue includes a re-ordering of the queue, + to get the next least incidence element to the top. + */ + a=(HUFF_ELEMENT*) queue_remove_top(&queue); + /* Copy the next least incidence element */ + b=(HUFF_ELEMENT*) queue_top(&queue); + /* Get a new element from the element buffer. */ + new_huff_el=huff_tree->element_buffer+found+i; + /* The new element gets the sum of the two least incidence elements. */ + new_huff_el->count=a->count+b->count; + /* + The Huffman algorithm assigns another bit to the code for a byte + every time that bytes incidence is combined (directly or indirectly) + to a new element as one of the two least incidence elements. + This means that one more bit per incidence of that uchar is required + in the resulting file. 
So we add the new combined incidence as the + number of bits by which the result grows. + */ + bits_packed+=(uint) (new_huff_el->count & 7); + bytes_packed+=new_huff_el->count/8; + /* The new element points to its children, lesser in left. */ + new_huff_el->a.nod.left=a; + new_huff_el->a.nod.right=b; + /* + Replace the copied top element by the new element and re-order the + queue. + */ + queue_top(&queue)= (uchar*) new_huff_el; + queue_replace_top(&queue); + } + huff_tree->root=(HUFF_ELEMENT*) queue.root[1]; + huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8; + return 0; +} + +static int compare_tree(void* cmp_arg __attribute__((unused)), + register const uchar *s, register const uchar *t) +{ + uint length; + for (length=global_count->field_length; length-- ;) + if (*s++ != *t++) + return (int) s[-1] - (int) t[-1]; + return 0; +} + +/* + Organize distinct column values and their incidences into a priority queue. + + SYNOPSIS + save_counts_in_queue() + key The column value. + count The incidence of this value. + tree The Huffman tree to be built later. + + DESCRIPTION + We use the element buffer of the targeted tree. The distinct column + values are organized in a priority queue first. The Huffman + algorithm will later organize the elements into a Huffman tree. For + the time being, we just place references to the elements into the + queue buffer. The buffer will later be organized into a priority + queue. + + RETURN + 0 + */ + +static int save_counts_in_queue(uchar *key, element_count count, + HUFF_TREE *tree) +{ + HUFF_ELEMENT *new_huff_el; + + new_huff_el=tree->element_buffer+(tree->elements++); + new_huff_el->count=count; + new_huff_el->a.leaf.null=0; + new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) / + tree->counts->field_length; + queue.root[tree->elements]=(uchar*) new_huff_el; + return 0; +} + + +/* + Calculate length of file if given counts should be used. 
+ + SYNOPSIS + calc_packed_length() + huff_counts The counts for a column of the table(s). + add_tree_lenght If the decode tree length should be added. + + DESCRIPTION + We need to follow the Huffman algorithm until we know, how many bits + are required for each uchar code. But we do not need the resulting + Huffman tree. Hence, we can leave out some steps which are essential + in make_huff_tree(). + + RETURN + Number of bytes required to compress this table column. +*/ + +static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts, + uint add_tree_lenght) +{ + uint i,found,bits_packed,first,last; + my_off_t bytes_packed; + HUFF_ELEMENT element_buffer[256]; + DBUG_ENTER("calc_packed_length"); + + /* + WARNING: We use a small hack for efficiency: Instead of placing + references to HUFF_ELEMENTs into the queue, we just insert + references to the counts of the uchar codes which appeared in this + table column. During the Huffman algorithm they are successively + replaced by references to HUFF_ELEMENTs. This works, because + HUFF_ELEMENTs have the incidence count at their beginning. + Regardless, wether the queue array contains references to counts of + type my_off_t or references to HUFF_ELEMENTs which have the count of + type my_off_t at their beginning, it always points to a count of the + same type. + + Instead of using queue_insert(), we just copy the references into + the buffer of the priority queue. We insert in uchar value order, but + the order is in fact irrelevant here. We will establish the correct + order later. + */ + first=last=0; + for (i=found=0 ; i < 256 ; i++) + { + if (huff_counts->counts[i]) + { + if (! found++) + first=i; + last=i; + /* We start with root[1], which is the queues top element. */ + queue.root[found]=(uchar*) &huff_counts->counts[i]; + } + } + if (!found) + DBUG_RETURN(0); /* Empty tree */ + /* + If there is only a single uchar value in this field in all records, + add a second element with zero incidence. 
This is required to enter + the loop, which follows the Huffman algorithm. + */ + if (found < 2) + queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1]; + + /* Make a queue from the queue buffer. */ + queue.elements=found; + + bytes_packed=0; bits_packed=0; + /* Add the length of the coding table, which would become part of the file. */ + if (add_tree_lenght) + bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+ + (max_bit(found-1)+1+1)*(found-2) +7)/8; + + /* + Make a priority queue from the queue. Construct its index so that we + have a partially ordered tree. + */ + queue_fix(&queue); + + /* The Huffman algorithm. */ + for (i=0 ; i < found-1 ; i++) + { + my_off_t *a; + my_off_t *b; + HUFF_ELEMENT *new_huff_el; + + /* + Pop the top element from the queue (the one with the least + incidence). Popping from a priority queue includes a re-ordering + of the queue, to get the next least incidence element to the top. + */ + a= (my_off_t*) queue_remove_top(&queue); + /* Copy the next least incidence element. */ + b= (my_off_t*) queue_top(&queue); + /* Create a new element in a local (automatic) buffer. */ + new_huff_el= element_buffer + i; + /* The new element gets the sum of the two least incidence elements. */ + new_huff_el->count= *a + *b; + /* + The Huffman algorithm assigns another bit to the code for a byte + every time that bytes incidence is combined (directly or indirectly) + to a new element as one of the two least incidence elements. + This means that one more bit per incidence of that uchar is required + in the resulting file. So we add the new combined incidence as the + number of bits by which the result grows. + */ + bits_packed+=(uint) (new_huff_el->count & 7); + bytes_packed+=new_huff_el->count/8; + /* + Replace the copied top element by the new element and re-order the + queue. This successively replaces the references to counts by + references to HUFF_ELEMENTs. 
+ */ + queue_top(&queue)= (uchar*) new_huff_el; + queue_replace_top(&queue); + } + DBUG_RETURN(bytes_packed+(bits_packed+7)/8); +} + + + /* Remove trees that don't give any compression */ + +static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees) +{ + uint k,tree_number; + HUFF_COUNTS count,*i,*j,*last_count; + + last_count=huff_counts+trees; + for (tree_number=0, i=huff_counts ; i < last_count ; i++) + { + if (!i->tree->tree_number) + { + i->tree->tree_number= ++tree_number; + if (i->tree_buff) + continue; /* Don't join intervall */ + for (j=i+1 ; j < last_count ; j++) + { + if (! j->tree->tree_number && ! j->tree_buff) + { + for (k=0 ; k < 256 ; k++) + count.counts[k]=i->counts[k]+j->counts[k]; + if (calc_packed_length(&count,1) <= + i->tree->bytes_packed + j->tree->bytes_packed+ + i->tree->tree_pack_length+j->tree->tree_pack_length+ + ALLOWED_JOIN_DIFF) + { + memcpy(i->counts,(uchar*) count.counts, sizeof(count.counts[0])*256); + my_free(j->tree->element_buffer); + j->tree->element_buffer=0; + j->tree=i->tree; + bmove((uchar*) i->counts,(uchar*) count.counts, + sizeof(count.counts[0])*256); + if (make_huff_tree(i->tree,i)) + return (uint) -1; + } + } + } + } + } + DBUG_PRINT("info", ("Original trees: %d After join: %d", + trees, tree_number)); + if (verbose) + printf("Original trees: %d After join: %d\n", trees, tree_number); + return tree_number; /* Return trees left */ +} + + +/* + Fill in huff_tree encode tables. + + SYNOPSIS + make_huff_decode_table() + huff_tree An array of HUFF_TREE which are to be encoded. + trees The number of HUFF_TREE in the array. + + RETURN + 0 success + != 0 error +*/ + +static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees) +{ + uint elements; + for ( ; trees-- ; huff_tree++) + { + if (huff_tree->tree_number > 0) + { + elements=huff_tree->counts->tree_buff ? 
huff_tree->elements : 256; + if (!(huff_tree->code = + (ulonglong*) my_malloc(PSI_NOT_INSTRUMENTED, + elements* (sizeof(ulonglong) + sizeof(uchar)), + MYF(MY_WME | MY_ZEROFILL)))) + return 1; + huff_tree->code_len=(uchar*) (huff_tree->code+elements); + make_traverse_code_tree(huff_tree, huff_tree->root, + 8 * sizeof(ulonglong), 0); + } + } + return 0; +} + + +static void make_traverse_code_tree(HUFF_TREE *huff_tree, + HUFF_ELEMENT *element, + uint size, ulonglong code) +{ + uint chr; + if (!element->a.leaf.null) + { + chr=element->a.leaf.element_nr; + huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size); + huff_tree->code[chr]= (code >> size); + if (huff_tree->height < 8 * sizeof(ulonglong) - size) + huff_tree->height= 8 * sizeof(ulonglong) - size; + } + else + { + size--; + make_traverse_code_tree(huff_tree,element->a.nod.left,size,code); + make_traverse_code_tree(huff_tree, element->a.nod.right, size, + code + (((ulonglong) 1) << size)); + } + return; +} + + +/* + Convert a value into binary digits. + + SYNOPSIS + bindigits() + value The value. + length The number of low order bits to convert. + + NOTE + The result string is in static storage. It is reused on every call. + So you cannot use it twice in one expression. + + RETURN + A pointer to a static NUL-terminated string. + */ + +static char *bindigits(ulonglong value, uint bits) +{ + static char digits[72]; + char *ptr= digits; + uint idx= bits; + + DBUG_ASSERT(idx < sizeof(digits)); + while (idx) + *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1); + *ptr= '\0'; + return digits; +} + + +/* + Convert a value into hexadecimal digits. + + SYNOPSIS + hexdigits() + value The value. + + NOTE + The result string is in static storage. It is reused on every call. + So you cannot use it twice in one expression. + + RETURN + A pointer to a static NUL-terminated string. 
+ */ + +static char *hexdigits(ulonglong value) +{ + static char digits[20]; + char *ptr= digits; + uint idx= 2 * sizeof(value); /* Two hex digits per byte. */ + + DBUG_ASSERT(idx < sizeof(digits)); + while (idx) + { + if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9') + *(ptr - 1)+= 'a' - '9' - 1; + } + *ptr= '\0'; + return digits; +} + + + /* Write header to new packed data file */ + +static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees, + my_off_t tot_elements,my_off_t filelength) +{ + uchar *buff= (uchar*) file_buffer.pos; + + bzero(buff,HEAD_LENGTH); + memcpy(buff,maria_pack_file_magic,4); + int4store(buff+4,head_length); + int4store(buff+8, mrg->min_pack_length); + int4store(buff+12,mrg->max_pack_length); + int4store(buff+16,tot_elements); + int4store(buff+20,intervall_length); + int2store(buff+24,trees); + buff[26]=(char) mrg->ref_length; + /* Save record pointer length */ + buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2); + if (test_only) + return 0; + my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)); + return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH, + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0; +} + + /* Write fieldinfo to new packed file */ + +static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees) +{ + reg1 uint i; + uint huff_tree_bits; + huff_tree_bits=max_bit(trees ? 
trees-1 : 0); + + DBUG_PRINT("info", (" ")); + DBUG_PRINT("info", ("column types:")); + DBUG_PRINT("info", ("FIELD_NORMAL 0")); + DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1")); + DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2")); + DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3")); + DBUG_PRINT("info", ("FIELD_BLOB 4")); + DBUG_PRINT("info", ("FIELD_CONSTANT 5")); + DBUG_PRINT("info", ("FIELD_INTERVALL 6")); + DBUG_PRINT("info", ("FIELD_ZERO 7")); + DBUG_PRINT("info", ("FIELD_VARCHAR 8")); + DBUG_PRINT("info", ("FIELD_CHECK 9")); + DBUG_PRINT("info", (" ")); + DBUG_PRINT("info", ("pack type as a set of flags:")); + DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1")); + DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2")); + DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4")); + DBUG_PRINT("info", (" ")); + if (verbose >= 2) + { + printf("\n"); + printf("column types:\n"); + printf("FIELD_NORMAL 0\n"); + printf("FIELD_SKIP_ENDSPACE 1\n"); + printf("FIELD_SKIP_PRESPACE 2\n"); + printf("FIELD_SKIP_ZERO 3\n"); + printf("FIELD_BLOB 4\n"); + printf("FIELD_CONSTANT 5\n"); + printf("FIELD_INTERVALL 6\n"); + printf("FIELD_ZERO 7\n"); + printf("FIELD_VARCHAR 8\n"); + printf("FIELD_CHECK 9\n"); + printf("\n"); + printf("pack type as a set of flags:\n"); + printf("PACK_TYPE_SELECTED 1\n"); + printf("PACK_TYPE_SPACE_FIELDS 2\n"); + printf("PACK_TYPE_ZERO_FILL 4\n"); + printf("\n"); + } + for (i=0 ; i++ < fields ; counts++) + { + write_bits((ulonglong) (int) counts->field_type, 5); + write_bits(counts->pack_type,6); + if (counts->pack_type & PACK_TYPE_ZERO_FILL) + write_bits(counts->max_zero_fill,5); + else + write_bits(counts->length_bits,5); + write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits); + DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u " + "lbits: %2u tree: %2u length: %4u", + i , counts->field_type, counts->pack_type, + counts->max_zero_fill, counts->length_bits, + counts->tree->tree_number, counts->field_length)); + if (verbose >= 2) + printf("column: %3u type: 
%2u pack: %2u zero: %4u lbits: %2u " + "tree: %2u length: %4u\n", i , counts->field_type, + counts->pack_type, counts->max_zero_fill, counts->length_bits, + counts->tree->tree_number, counts->field_length); + } + flush_bits(); + return; +} + + /* Write all huff_trees to new datafile. Return tot count of + elements in all trees + Returns 0 on error */ + +static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees) +{ + uint i,int_length; + uint tree_no; + uint codes; + uint errors= 0; + uint *packed_tree,*offset,length; + my_off_t elements; + + /* Find the highest number of elements in the trees. */ + for (i=length=0 ; i < trees ; i++) + if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length) + length=huff_tree[i].elements; + /* + Allocate a buffer for packing a decode tree. Two numbers per element + (left child and right child). + */ + if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2))) + { + my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2); + return 0; + } + + DBUG_PRINT("info", (" ")); + if (verbose >= 2) + printf("\n"); + tree_no= 0; + intervall_length=0; + for (elements=0; trees-- ; huff_tree++) + { + /* Skip columns that have been joined with other columns. */ + if (huff_tree->tree_number == 0) + continue; /* Deleted tree */ + tree_no++; + DBUG_PRINT("info", (" ")); + if (verbose >= 3) + printf("\n"); + /* Count the total number of elements (byte codes or column values). */ + elements+=huff_tree->elements; + huff_tree->max_offset=2; + /* Build a tree of offsets and codes for decoding in 'packed_tree'. */ + if (huff_tree->elements <= 1) + offset=packed_tree; + else + offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree); + + /* This should be the same as 'length' above. */ + huff_tree->offset_bits=max_bit(huff_tree->max_offset); + + /* + Since we check this during collecting the distinct column values, + this should never happen. 
+ */ + if (huff_tree->max_offset >= IS_OFFSET) + { /* This should be impossible */ + fprintf(stderr, "Tree offset got too big: %d, aborted\n", + huff_tree->max_offset); + my_afree(packed_tree); + return 0; + } + + DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu " + "char_bits: %u\n", + (ulong) (file_buffer.pos - file_buffer.buffer), + huff_tree->elements, (ulong) (offset - packed_tree), + huff_tree->char_bits)); + if (!huff_tree->counts->tree_buff) + { + /* We do a uchar compression on this column. Mark with bit 0. */ + write_bits(0,1); + write_bits(huff_tree->min_chr,8); + write_bits(huff_tree->elements,9); + write_bits(huff_tree->char_bits,5); + write_bits(huff_tree->offset_bits,5); + int_length=0; + } + else + { + int_length=(uint) (huff_tree->counts->tree_pos - + huff_tree->counts->tree_buff); + /* We have distinct column values for this column. Mark with bit 1. */ + write_bits(1,1); + write_bits(huff_tree->elements,15); + write_bits(int_length,16); + write_bits(huff_tree->char_bits,5); + write_bits(huff_tree->offset_bits,5); + intervall_length+=int_length; + } + DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u " + "offset_bits: %2u %s: %5u codelen: %2u", + tree_no, huff_tree->elements, huff_tree->char_bits, + huff_tree->offset_bits, huff_tree->counts->tree_buff ? + "bufflen" : "min_chr", huff_tree->counts->tree_buff ? + int_length : huff_tree->min_chr, huff_tree->height)); + if (verbose >= 2) + printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u " + "%s: %5u codelen: %2u\n", tree_no, huff_tree->elements, + huff_tree->char_bits, huff_tree->offset_bits, + huff_tree->counts->tree_buff ? "bufflen" : "min_chr", + huff_tree->counts->tree_buff ? int_length : + huff_tree->min_chr, huff_tree->height); + + /* Check that the code tree length matches the element count. 
*/ + length=(uint) (offset-packed_tree); + if (length != huff_tree->elements*2-2) + { + fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n", + length, huff_tree->elements * 2 - 2); + errors++; + break; + } + + for (i=0 ; i < length ; i++) + { + if (packed_tree[i] & IS_OFFSET) + write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits), + huff_tree->offset_bits+1); + else + write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1); + DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x", + i, (packed_tree[i] & IS_OFFSET) ? + " -> " : "", (packed_tree[i] & IS_OFFSET) ? + packed_tree[i] - IS_OFFSET + i : packed_tree[i])); + if (verbose >= 3) + printf("tree[0x%04x]: %s0x%04x\n", + i, (packed_tree[i] & IS_OFFSET) ? " -> " : "", + (packed_tree[i] & IS_OFFSET) ? + packed_tree[i] - IS_OFFSET + i : packed_tree[i]); + } + flush_bits(); + + /* + Display coding tables and check their correctness. + */ + codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256; + for (i= 0; i < codes; i++) + { + ulonglong code; + uint bits; + uint len; + uint idx; + + if (! (len= huff_tree->code_len[i])) + continue; + DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i, + hexdigits(huff_tree->code[i]), huff_tree->code_len[i], + bindigits(huff_tree->code[i], + huff_tree->code_len[i]))); + if (verbose >= 3) + printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i, + hexdigits(huff_tree->code[i]), huff_tree->code_len[i], + bindigits(huff_tree->code[i], huff_tree->code_len[i])); + + /* Check that the encode table decodes correctly. */ + code= 0; + bits= 0; + idx= 0; + DBUG_EXECUTE_IF("forcechkerr1", len--;); + DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code);); + DBUG_EXECUTE_IF("forcechkerr3", idx= length;); + for (;;) + { + if (! 
len) + { + fflush(stdout); + fprintf(stderr, "error: code 0x%s with %u bits not found\n", + hexdigits(huff_tree->code[i]), huff_tree->code_len[i]); + errors++; + break; + } + code<<= 1; + code|= (huff_tree->code[i] >> (--len)) & 1; + bits++; + if (bits > 8 * sizeof(code)) + { + fflush(stdout); + fprintf(stderr, "error: Huffman code too long: %u/%u\n", + bits, (uint) (8 * sizeof(code))); + errors++; + break; + } + idx+= (uint) code & 1; + if (idx >= length) + { + fflush(stdout); + fprintf(stderr, "error: illegal tree offset: %u/%u\n", idx, length); + errors++; + break; + } + if (packed_tree[idx] & IS_OFFSET) + idx+= packed_tree[idx] & ~IS_OFFSET; + else + break; /* Hit a leaf. This contains the result value. */ + } + if (errors) + break; + + DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;); + if (packed_tree[idx] != i) + { + fflush(stdout); + fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n", + packed_tree[idx], i); + errors++; + break; + } + } /*end for (codes)*/ + if (errors) + break; + + /* Write column values in case of distinct column value compression. */ + if (huff_tree->counts->tree_buff) + { + for (i=0 ; i < int_length ; i++) + { + write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8); + DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x", + i, (uchar) huff_tree->counts->tree_buff[i])); + if (verbose >= 3) + printf("column_values[0x%04x]: 0x%02x\n", + i, (uchar) huff_tree->counts->tree_buff[i]); + } + } + flush_bits(); + } + DBUG_PRINT("info", (" ")); + if (verbose >= 2) + printf("\n"); + my_afree(packed_tree); + if (errors) + { + fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"); + return 0; + } + return elements; +} + + +static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element, + uint *offset) +{ + uint *prev_offset; + + prev_offset= offset; + /* + 'a.leaf.null' takes the same place as 'a.nod.left'. If this is null, + then there is no left child and, hence no right child either. 
This + is a property of a binary tree. An element is either a node with two + childs, or a leaf without childs. + + The current element is always a node with two childs. Go left first. + */ + if (!element->a.nod.left->a.leaf.null) + { + /* Store the uchar code or the index of the column value. */ + prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr; + offset+=2; + } + else + { + /* + Recursively traverse the tree to the left. Mark it as an offset to + another tree node (in contrast to a uchar code or column value index). + */ + prev_offset[0]= IS_OFFSET+2; + offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2); + } + + /* Now, check the right child. */ + if (!element->a.nod.right->a.leaf.null) + { + /* Store the uchar code or the index of the column value. */ + prev_offset[1]=element->a.nod.right->a.leaf.element_nr; + return offset; + } + else + { + /* + Recursively traverse the tree to the right. Mark it as an offset to + another tree node (in contrast to a uchar code or column value index). 
+ */ + uint temp=(uint) (offset-prev_offset-1); + prev_offset[1]= IS_OFFSET+ temp; + if (huff_tree->max_offset < temp) + huff_tree->max_offset = temp; + return make_offset_code_tree(huff_tree,element->a.nod.right,offset); + } +} + + /* Get number of bits neaded to represent value */ + +static uint max_bit(register uint value) +{ + reg2 uint power=1; + + while ((value>>=1)) + power++; + return (power); +} + + +static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts) +{ + int error; + uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length; + uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes; + my_off_t record_count; + char llbuf[32]; + ulong length,pack_length; + uchar *record,*pos,*end_pos,*record_pos,*start_pos; + HUFF_COUNTS *count,*end_count; + HUFF_TREE *tree; + MARIA_HA *isam_file=mrg->file[0]; + uint pack_version= (uint) isam_file->s->pack.version; + DBUG_ENTER("compress_maria_file"); + + /* Allocate a buffer for the records (excluding blobs). */ + if (!(record=(uchar*) my_safe_alloca(isam_file->s->base.reclength))) + return -1; + + end_count=huff_counts+isam_file->s->base.fields; + min_record_length= (uint) ~0; + max_record_length=0; + null_bytes= isam_file->s->base.null_bytes; + + /* + Calculate the maximum number of bits required to pack the records. + Remember to understand 'max_zero_fill' as 'min_zero_fill'. + The tree height determines the maximum number of bits per value. + Some fields skip leading or trailing spaces or zeroes. The skipped + number of bytes is encoded by 'length_bits' bits. + Empty blobs and varchar are encoded with a single 1 bit. Other blobs + and varchar get a leading 0 bit. 
+ */ + max_calc_length= null_bytes; + for (i= 0 ; i < isam_file->s->base.fields ; i++) + { + if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL)) + huff_counts[i].max_zero_fill=0; + if (huff_counts[i].field_type == FIELD_CONSTANT || + huff_counts[i].field_type == FIELD_ZERO || + huff_counts[i].field_type == FIELD_CHECK) + continue; + if (huff_counts[i].field_type == FIELD_INTERVALL) + max_calc_length+=huff_counts[i].tree->height; + else if (huff_counts[i].field_type == FIELD_BLOB || + huff_counts[i].field_type == FIELD_VARCHAR) + max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1; + else + max_calc_length+= + (huff_counts[i].field_length - huff_counts[i].max_zero_fill)* + huff_counts[i].tree->height+huff_counts[i].length_bits; + } + max_calc_length= (max_calc_length + 7) / 8; + pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length); + record_count=0; + /* 'max_blob_length' is the max length of all blobs of a record. */ + pack_blob_length= isam_file->s->base.blobs ? + _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0; + max_pack_length=pack_ref_length+pack_blob_length; + + DBUG_PRINT("fields", ("===")); + mrg_reset(mrg); + while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE) + { + ulong tot_blob_length=0; + if (! 
error) + { + if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length + + null_bytes)) + break; + record_pos= file_buffer.pos; + file_buffer.pos+= max_pack_length; + if (null_bytes) + { + /* Copy null bits 'as is' */ + memcpy(file_buffer.pos, record, null_bytes); + file_buffer.pos+= null_bytes; + } + for (start_pos=record+null_bytes, count= huff_counts; + count < end_count ; + count++) + { + end_pos=start_pos+(field_length=count->field_length); + tree=count->tree; + + DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u " + "lbits: %2u tree: %2u length: %4u", + (ulong) (count - huff_counts + 1), + count->field_type, + count->pack_type, count->max_zero_fill, + count->length_bits, count->tree->tree_number, + count->field_length)); + + /* Check if the column contains spaces only. */ + if (count->pack_type & PACK_TYPE_SPACE_FIELDS) + { + for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ; + if (pos == end_pos) + { + DBUG_PRINT("fields", + ("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1")); + DBUG_PRINT("fields", ("---")); + write_bits(1,1); + start_pos=end_pos; + continue; + } + DBUG_PRINT("fields", + ("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1")); + write_bits(0,1); + } + end_pos-=count->max_zero_fill; + field_length-=count->max_zero_fill; + + switch (count->field_type) { + case FIELD_SKIP_ZERO: + if (!memcmp(start_pos, zero_string, field_length)) + { + DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1")); + write_bits(1,1); + start_pos=end_pos; + break; + } + DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1")); + write_bits(0,1); + /* Fall through */ + case FIELD_NORMAL: + DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes", + (ulong) (end_pos - start_pos))); + for ( ; start_pos < end_pos ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + 
bindigits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + break; + case FIELD_SKIP_ENDSPACE: + for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ; + length= (ulong) (end_pos - pos); + if (count->pack_type & PACK_TYPE_SELECTED) + { + if (length > count->min_space) + { + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE more than min_space, bits: 1")); + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(1,1); + write_bits(length,count->length_bits); + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE not more than min_space, " + "bits: 1")); + write_bits(0,1); + pos=end_pos; + } + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(length,count->length_bits); + } + /* Encode all significant bytes. 
*/ + DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes", + (ulong) (pos - start_pos))); + for ( ; start_pos < pos ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + bindigits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + start_pos=end_pos; + break; + case FIELD_SKIP_PRESPACE: + for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ; + length= (ulong) (pos - start_pos); + if (count->pack_type & PACK_TYPE_SELECTED) + { + if (length > count->min_space) + { + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE more than min_space, bits: 1")); + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(1,1); + write_bits(length,count->length_bits); + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE not more than min_space, " + "bits: 1")); + pos=start_pos; + write_bits(0,1); + } + } + else + { + DBUG_PRINT("fields", + ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u", + length, field_length, count->length_bits)); + write_bits(length,count->length_bits); + } + /* Encode all significant bytes. 
*/ + DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes", + (ulong) (end_pos - start_pos))); + for (start_pos=pos ; start_pos < end_pos ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + bindigits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + break; + case FIELD_CONSTANT: + case FIELD_ZERO: + case FIELD_CHECK: + DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK")); + start_pos=end_pos; + break; + case FIELD_INTERVALL: + global_count=count; + pos=(uchar*) tree_search(&count->int_tree, start_pos, + count->int_tree.custom_arg); + intervall=(uint) (pos - count->tree_buff)/field_length; + DBUG_PRINT("fields", ("FIELD_INTERVALL")); + DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u", + intervall, hexdigits(tree->code[intervall]), + (uint) tree->code_len[intervall])); + write_bits(tree->code[intervall],(uint) tree->code_len[intervall]); + start_pos=end_pos; + break; + case FIELD_BLOB: + { + ulong blob_length= _ma_calc_blob_length(field_length- + portable_sizeof_char_ptr, + start_pos); + /* Empty blobs are encoded with a single 1 bit. */ + if (!blob_length) + { + DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1")); + write_bits(1,1); + } + else + { + uchar *blob,*blob_end; + DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1")); + write_bits(0,1); + /* Write the blob length. */ + DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u", + blob_length, count->length_bits)); + write_bits(blob_length,count->length_bits); + memcpy(&blob,end_pos-portable_sizeof_char_ptr, sizeof(char*)); + blob_end=blob+blob_length; + /* Encode the blob bytes. 
*/ + for ( ; blob < blob_end ; blob++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *blob, hexdigits(tree->code[(uchar) *blob]), + (uint) tree->code_len[(uchar) *blob], + bindigits(tree->code[(uchar) *start_pos], + (uint)tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *blob], + (uint) tree->code_len[(uchar) *blob]); + } + tot_blob_length+=blob_length; + } + start_pos= end_pos; + break; + } + case FIELD_VARCHAR: + { + uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1); + ulong col_length= (var_pack_length == 1 ? + (uint) *(uchar*) start_pos : + uint2korr(start_pos)); + /* Empty varchar are encoded with a single 1 bit. */ + if (!col_length) + { + DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1")); + write_bits(1,1); /* Empty varchar */ + } + else + { + uchar *end= start_pos + var_pack_length + col_length; + DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1")); + write_bits(0,1); + /* Write the varchar length. */ + DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u", + col_length, count->length_bits)); + write_bits(col_length,count->length_bits); + /* Encode the varchar bytes. 
*/ + for (start_pos+= var_pack_length ; start_pos < end ; start_pos++) + { + DBUG_PRINT("fields", + ("value: 0x%02x code: 0x%s bits: %2u bin: %s", + (uchar) *start_pos, + hexdigits(tree->code[(uchar) *start_pos]), + (uint) tree->code_len[(uchar) *start_pos], + bindigits(tree->code[(uchar) *start_pos], + (uint)tree->code_len[(uchar) *start_pos]))); + write_bits(tree->code[(uchar) *start_pos], + (uint) tree->code_len[(uchar) *start_pos]); + } + } + start_pos= end_pos; + break; + } + case FIELD_LAST: + case FIELD_enum_val_count: + abort(); /* Impossible */ + } + start_pos+=count->max_zero_fill; + DBUG_PRINT("fields", ("---")); + } + flush_bits(); + length=(ulong) (file_buffer.pos - record_pos) - max_pack_length; + pack_length= _ma_save_pack_length(pack_version, record_pos, length); + if (pack_blob_length) + pack_length+= _ma_save_pack_length(pack_version, + record_pos + pack_length, + tot_blob_length); + DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu " + "length-bytes: %lu", (ulong) record_count, length, + tot_blob_length, pack_length)); + DBUG_PRINT("fields", ("===")); + + /* Correct file buffer if the header was smaller */ + if (pack_length != max_pack_length) + { + bmove(record_pos+pack_length,record_pos+max_pack_length,length); + file_buffer.pos-= (max_pack_length-pack_length); + } + if (length < (ulong) min_record_length) + min_record_length=(uint) length; + if (length > (ulong) max_record_length) + max_record_length=(uint) length; + record_count++; + if (write_loop && record_count % WRITE_COUNT == 0) + { + printf("%lu\r", (ulong) record_count); + fflush(stdout); + } + } + else if (error != HA_ERR_RECORD_DELETED) + break; + } + if (error == HA_ERR_END_OF_FILE) + error=0; + else + { + fprintf(stderr, "%s: Got error %d reading records\n", my_progname, error); + } + if (verbose >= 2) + printf("wrote %s records.\n", llstr((longlong) record_count, llbuf)); + + my_safe_afree(record, isam_file->s->base.reclength); + mrg->ref_length=max_pack_length; + 
mrg->min_pack_length=max_record_length ? min_record_length : 0; + mrg->max_pack_length=max_record_length; + DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0)); +} + + +static char *make_new_name(char *new_name, char *old_name) +{ + return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4); +} + +static char *make_old_name(char *new_name, char *old_name) +{ + return fn_format(new_name,old_name,"",OLD_EXT,2+4); +} + + /* rutines for bit writing buffer */ + +static void init_file_buffer(File file, pbool read_buffer) +{ + file_buffer.file=file; + file_buffer.buffer= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, + ALIGN_SIZE(RECORD_CACHE_SIZE), MYF(MY_WME)); + file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8; + file_buffer.pos_in_file=0; + error_on_write=0; + if (read_buffer) + { + + file_buffer.pos=file_buffer.end; + file_buffer.bits=0; + } + else + { + file_buffer.pos=file_buffer.buffer; + file_buffer.bits=BITS_SAVED; + } + file_buffer.bitbucket= 0; +} + + +static int flush_buffer(ulong neaded_length) +{ + ulong length; + + /* + file_buffer.end is 8 bytes lower than the real end of the buffer. + This is done so that the end-of-buffer condition does not need to be + checked for every uchar (see write_bits()). Consequently, + file_buffer.pos can become greater than file_buffer.end. The + algorithms in the other functions ensure that there will never be + more than 8 bytes written to the buffer without an end-of-buffer + check. So the buffer cannot be overrun. But we need to check for the + near-to-buffer-end condition to avoid a negative result, which is + casted to unsigned and thus becomes giant. 
+ */ + if ((file_buffer.pos < file_buffer.end) && + ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length)) + return 0; + length=(ulong) (file_buffer.pos-file_buffer.buffer); + file_buffer.pos=file_buffer.buffer; + file_buffer.pos_in_file+=length; + if (test_only) + return 0; + if (error_on_write|| my_write(file_buffer.file, + (const uchar*) file_buffer.buffer, + length, + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL))) + { + error_on_write=1; + return 1; + } + + if (neaded_length != ~(ulong) 0 && + (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length) + { + uchar *tmp; + neaded_length+=256; /* some margin */ + tmp= (uchar*) my_realloc(PSI_NOT_INSTRUMENTED, file_buffer.buffer, + neaded_length,MYF(MY_WME)); + if (!tmp) + return 1; + file_buffer.pos= (tmp + (ulong) (file_buffer.pos - file_buffer.buffer)); + file_buffer.buffer= tmp; + file_buffer.end= (tmp+neaded_length-8); + } + return 0; +} + + +static void end_file_buffer(void) +{ + my_free(file_buffer.buffer); +} + + /* output `bits` low bits of `value' */ + +static void write_bits(register ulonglong value, register uint bits) +{ + DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) || + (bits == 8 * sizeof(value))); + + if ((file_buffer.bits-= (int) bits) >= 0) + { + file_buffer.bitbucket|= value << file_buffer.bits; + } + else + { + reg3 ulonglong bit_buffer; + bits= (uint) -file_buffer.bits; + bit_buffer= (file_buffer.bitbucket | + ((bits != 8 * sizeof(value)) ? 
(value >> bits) : 0)); +#if BITS_SAVED == 64 + *file_buffer.pos++= (uchar) (bit_buffer >> 56); + *file_buffer.pos++= (uchar) (bit_buffer >> 48); + *file_buffer.pos++= (uchar) (bit_buffer >> 40); + *file_buffer.pos++= (uchar) (bit_buffer >> 32); +#endif + *file_buffer.pos++= (uchar) (bit_buffer >> 24); + *file_buffer.pos++= (uchar) (bit_buffer >> 16); + *file_buffer.pos++= (uchar) (bit_buffer >> 8); + *file_buffer.pos++= (uchar) (bit_buffer); + + if (bits != 8 * sizeof(value)) + value&= (((ulonglong) 1) << bits) - 1; + if (file_buffer.pos >= file_buffer.end) + flush_buffer(~ (ulong) 0); + file_buffer.bits=(int) (BITS_SAVED - bits); + file_buffer.bitbucket= value << (BITS_SAVED - bits); + } + return; +} + + /* Flush bits in bit_buffer to buffer */ + +static void flush_bits(void) +{ + int bits; + ulonglong bit_buffer; + + bits= file_buffer.bits & ~7; + bit_buffer= file_buffer.bitbucket >> bits; + bits= BITS_SAVED - bits; + while (bits > 0) + { + bits-= 8; + *file_buffer.pos++= (uchar) (bit_buffer >> bits); + } + if (file_buffer.pos >= file_buffer.end) + flush_buffer(~ (ulong) 0); + file_buffer.bits= BITS_SAVED; + file_buffer.bitbucket= 0; +} + + +/**************************************************************************** +** functions to handle the joined files +****************************************************************************/ + +static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg, + my_off_t new_length, + ha_checksum crc) +{ + MARIA_SHARE *share=isam_file->s; + uint options=mi_uint2korr(share->state.header.options); + uint key; + DBUG_ENTER("save_state"); + + options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA; + mi_int2store(share->state.header.options,options); + /* Save the original file type of we have to undo the packing later */ + share->state.header.org_data_file_type= share->state.header.data_file_type; + share->state.header.data_file_type= COMPRESSED_RECORD; + + share->state.state.data_file_length=new_length; + 
share->state.state.del=0; + share->state.state.empty=0; + share->state.dellink= HA_OFFSET_ERROR; + share->state.split=(ha_rows) mrg->records; + share->state.version=(ulong) time((time_t*) 0); + if (share->base.born_transactional) + share->state.create_rename_lsn= share->state.is_of_horizon= + share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS; + if (! maria_is_all_keys_active(share->state.key_map, share->base.keys)) + { + /* + Some indexes are disabled, cannot use current key_file_length value + as an estimate of upper bound of index file size. Use packed data file + size instead. + */ + share->state.state.key_file_length= new_length; + } + /* + If there are no disabled indexes, keep key_file_length value from + original file so "aria_chk -rq" can use this value (this is necessary + because index size cannot be easily calculated for fulltext keys) + */ + maria_clear_all_keys_active(share->state.key_map); + for (key=0 ; key < share->base.keys ; key++) + share->state.key_root[key]= HA_OFFSET_ERROR; + share->state.key_del= HA_OFFSET_ERROR; + share->state.state.checksum= crc; /* Save crc in file */ + share->changed=1; /* Force write of header */ + share->state.open_count=0; + share->global_changed=0; + my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0)); + if (share->base.keys) + isamchk_neaded=1; + DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file, + &share->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO)); +} + + +static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length, + ha_checksum crc) +{ + MARIA_STATE_INFO state; + MARIA_HA *isam_file=mrg->file[0]; + uint options; + DBUG_ENTER("save_state_mrg"); + + state= isam_file->s->state; + options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD | + HA_OPTION_READ_ONLY_DATA); + mi_int2store(state.header.options,options); + /* Save the original file type of we have to undo the packing later */ + state.header.org_data_file_type= 
state.header.data_file_type; + state.header.data_file_type= COMPRESSED_RECORD; + + state.state.data_file_length=new_length; + state.state.del=0; + state.state.empty=0; + state.state.records=state.split=(ha_rows) mrg->records; + state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn= + LSN_NEEDS_NEW_STATE_LSNS; + + /* See comment above in save_state about key_file_length handling. */ + if (mrg->src_file_has_indexes_disabled) + { + isam_file->s->state.state.key_file_length= + MY_MAX(isam_file->s->state.state.key_file_length, new_length); + } + state.dellink= HA_OFFSET_ERROR; + state.version=(ulong) time((time_t*) 0); + maria_clear_all_keys_active(state.key_map); + state.state.checksum=crc; + if (isam_file->s->base.keys) + isamchk_neaded=1; + state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */ + DBUG_RETURN (_ma_state_info_write_sub(file, &state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO)); +} + + +/* reset for mrg_rrnd */ + +static void mrg_reset(PACK_MRG_INFO *mrg) +{ + if (mrg->current) + { + maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0); + mrg->current=0; + } +} + +static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf) +{ + int error; + MARIA_HA *isam_info; + + if (!info->current) + { + isam_info= *(info->current=info->file); + info->end=info->current+info->count; + maria_reset(isam_info); + maria_extra(isam_info, HA_EXTRA_CACHE, 0); + if ((error= maria_scan_init(isam_info))) + return(error); + } + else + isam_info= *info->current; + + for (;;) + { + if (!(error= maria_scan(isam_info, buf)) || + error != HA_ERR_END_OF_FILE) + return (error); + maria_scan_end(isam_info); + maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0); + if (info->current+1 == info->end) + return(HA_ERR_END_OF_FILE); + info->current++; + isam_info= *info->current; + maria_reset(isam_info); + maria_extra(isam_info,HA_EXTRA_CACHE, 0); + if ((error= maria_scan_init(isam_info))) + return(error); + } +} + + +static int 
mrg_close(PACK_MRG_INFO *mrg) +{ + uint i; + int error=0; + DBUG_ENTER("mrg_close"); + + for (i=0 ; i < mrg->count ; i++) + error|=maria_close(mrg->file[i]); + if (mrg->free_file) + my_free(mrg->file); + DBUG_RETURN(error); +} + + +#if !defined(DBUG_OFF) +/* + Fake the counts to get big Huffman codes. + + SYNOPSIS + fakebigcodes() + huff_counts A pointer to the counts array. + end_count A pointer past the counts array. + + DESCRIPTION + + Huffman coding works by removing the two least frequent values from + the list of values and add a new value with the sum of their + incidences in a loop until only one value is left. Every time a + value is reused for a new value, it gets one more bit for its + encoding. Hence, the least frequent values get the longest codes. + + To get a maximum code length for a value, two of the values must + have an incidence of 1. As their sum is 2, the next infrequent value + must have at least an incidence of 2, then 4, 8, 16 and so on. This + means that one needs 2**n bytes (values) for a code length of n + bits. However, using more distinct values forces the use of longer + codes, or reaching the code length with less total bytes (values). + + To get 64(32)-bit codes, I sort the counts by decreasing incidence. + I assign counts of 1 to the two most frequent values, a count of 2 + for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All + the remaining values get 1. That way every possible uchar has an + assigned code, though not all codes are used if not all uchar values + are present in the column. + + This strategy would work with distinct column values too, but + requires that at least 64(32) values are present. To make things + easier here, I cancel all distinct column values and force byte + compression for all columns. 
+ + RETURN + void +*/ + +static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count) +{ + HUFF_COUNTS *count; + my_off_t *cur_count_p; + my_off_t *end_count_p; + my_off_t **cur_sort_p; + my_off_t **end_sort_p; + my_off_t *sort_counts[256]; + my_off_t total; + DBUG_ENTER("fakebigcodes"); + + for (count= huff_counts; count < end_count; count++) + { + /* + Remove distinct column values. + */ + if (huff_counts->tree_buff) + { + my_free(huff_counts->tree_buff); + delete_tree(&huff_counts->int_tree, 0); + huff_counts->tree_buff= NULL; + DBUG_PRINT("fakebigcodes", ("freed distinct column values")); + } + + /* + Sort counts by decreasing incidence. + */ + cur_count_p= count->counts; + end_count_p= cur_count_p + 256; + cur_sort_p= sort_counts; + while (cur_count_p < end_count_p) + *(cur_sort_p++)= cur_count_p++; + (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp); + + /* + Assign faked counts. + */ + cur_sort_p= sort_counts; +#if SIZEOF_LONG_LONG > 4 + end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1; +#else + end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2; +#endif + /* Most frequent value gets a faked count of 1. */ + **(cur_sort_p++)= 1; + total= 1; + while (cur_sort_p < end_sort_p) + { + **(cur_sort_p++)= total; + total<<= 1; + } + /* Set the last value. */ + **(cur_sort_p++)= --total; + /* + Set the remaining counts. + */ + end_sort_p= sort_counts + 256; + while (cur_sort_p < end_sort_p) + **(cur_sort_p++)= 1; + } + DBUG_VOID_RETURN; +} + + +/* + Compare two counts for reverse sorting. + + SYNOPSIS + fakecmp() + count1 One count. + count2 Another count. + + RETURN + 1 count1 < count2 + 0 count1 == count2 + -1 count1 > count2 +*/ + +static int fakecmp(my_off_t **count1, my_off_t **count2) +{ + return ((**count1 < **count2) ? 1 : + (**count1 > **count2) ? 
-1 : 0); +} +#endif + +#include "ma_check_standalone.h" + diff --git a/storage/maria/aria_read_log.c b/storage/maria/aria_read_log.c new file mode 100644 index 00000000..c0c76ed5 --- /dev/null +++ b/storage/maria/aria_read_log.c @@ -0,0 +1,440 @@ +/* Copyright (C) 2007 MySQL AB + Copyright (C) 2010 Monty Program Ab + Copyright (C) 2020 MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "ma_recovery.h" +#include <my_getopt.h> + +#define LOG_FLAGS 0 + +static const char *load_default_groups[]= { "aria_read_log",0 }; +static void get_options(int *argc,char * * *argv); +#ifndef DBUG_OFF +#if defined(_WIN32) +const char *default_dbug_option= "d:t:O,\\aria_read_log.trace"; +#else +const char *default_dbug_option= "d:t:o,/tmp/aria_read_log.trace"; +#endif +#endif /* DBUG_OFF */ +static my_bool opt_display_only, opt_apply, opt_silent, opt_apply_undo; +static my_bool opt_check, opt_start_from_checkpoint; +static my_bool opt_print_aria_log_control; +static const char *opt_tmpdir; +static ulong opt_translog_buffer_size; +static ulonglong opt_page_buffer_size; +static ulonglong opt_start_from_lsn, opt_lsn_redo_end, opt_lsn_undo_end; +static char *start_from_lsn_buf, *lsn_redo_end_buf, *lsn_undo_end_buf; +static MY_TMPDIR maria_chk_tmpdir; + +/* + Get lsn from file number and offset + Format supported: + ulonglong + 
uint,0xhex +*/ + +static ulonglong get_lsn(const char *lsn_str) +{ + ulong file; + ulong pos; + if (sscanf(lsn_str, " %lu,0x%lx", &file, &pos) == 2) + return MAKE_LSN(file, pos); + if (sscanf(lsn_str, " %lu", &pos) == 1) + return (ulonglong) pos; + return ~(ulonglong) 0; /* Error */ +} + +static my_bool get_lsn_arg(const char *lsn_string, ulonglong *lsn, + const char *name) +{ + ulonglong value; + value= get_lsn(lsn_string); + if (value != ~(ulonglong) 0) + { + *lsn= value; + return 0; + } + fprintf(stderr, + "Wrong value '%s' for option %s. Value should be in format: " + "number,0xhexnumber\n", + lsn_string, name); + return 1; +} + + +int main(int argc, char **argv) +{ + LSN lsn; + char **default_argv; + uint warnings_count; + MY_INIT(argv[0]); + + maria_data_root= "."; + sf_leaking_memory=1; /* don't report memory leaks on early exits */ + load_defaults_or_exit("my", load_default_groups, &argc, &argv); + default_argv= argv; + get_options(&argc, &argv); + + maria_in_recovery= TRUE; + + if (maria_init()) + { + fprintf(stderr, "Can't init Aria engine (%d)\n", errno); + goto err; + } + maria_block_size= 0; /* Use block size from file */ + if (opt_print_aria_log_control) + { + if (print_aria_log_control()) + goto err; + goto end; + } + /* we don't want to create a control file, it MUST exist */ + if (ma_control_file_open(FALSE, TRUE, TRUE)) + { + fprintf(stderr, "Can't open control file (%d)\n", errno); + goto err; + } + if (last_logno == FILENO_IMPOSSIBLE) + { + fprintf(stderr, "Can't find any log\n"); + goto err; + } + if (init_pagecache(maria_pagecache, (size_t)opt_page_buffer_size, 0, 0, + maria_block_size, 0, MY_WME) == 0) + { + fprintf(stderr, "Got error in init_pagecache() (errno: %d)\n", errno); + goto err; + } + /* + If log handler does not find the "last_logno" log it will return error, + which is good. + But if it finds a log and this log was crashed, it will create a new log, + which is useless. TODO: start log handler in read-only mode. 
+ */ + if (init_pagecache(maria_log_pagecache, opt_translog_buffer_size, + 0, 0, TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0 || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, TRANSLOG_DEFAULT_FLAGS, + opt_display_only)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + goto err; + } + + if (opt_display_only) + printf("You are using --display-only, NOTHING will be written to disk\n"); + + lsn= translog_first_lsn_in_log(); + if (lsn == LSN_ERROR) + { + fprintf(stderr, "Opening transaction log failed\n"); + goto end; + } + if (lsn == LSN_IMPOSSIBLE) + { + fprintf(stdout, "The transaction log is empty\n"); + } + if (opt_start_from_checkpoint && !opt_start_from_lsn && + last_checkpoint_lsn != LSN_IMPOSSIBLE) + { + lsn= LSN_IMPOSSIBLE; /* LSN set in maria_apply_log() */ + fprintf(stdout, "Starting from checkpoint " LSN_FMT "\n", + LSN_IN_PARTS(last_checkpoint_lsn)); + } + else + fprintf(stdout, "The transaction log starts from lsn " LSN_FMT "\n", + LSN_IN_PARTS(lsn)); + + if (opt_start_from_lsn) + { + if (opt_start_from_lsn < (ulonglong) lsn) + { + fprintf(stderr, "start_from_lsn is too small. Aborting\n"); + maria_end(); + goto err; + } + lsn= (LSN) opt_start_from_lsn; + fprintf(stdout, "Starting reading log from lsn " LSN_FMT "\n", + LSN_IN_PARTS(lsn)); + } + + fprintf(stdout, "TRACE of the last aria_read_log\n"); + if (maria_apply_log(lsn, opt_lsn_redo_end, opt_lsn_undo_end, + opt_apply ? MARIA_LOG_APPLY : + (opt_check ? MARIA_LOG_CHECK : + MARIA_LOG_DISPLAY_HEADER), opt_silent ? 
NULL : stdout, + FALSE, FALSE, &warnings_count)) + goto err; + if (warnings_count == 0) + fprintf(stdout, "%s: SUCCESS\n", my_progname_short); + else + fprintf(stdout, "%s: DOUBTFUL (%u warnings, check previous output)\n", + my_progname_short, warnings_count); + +end: + maria_end(); + free_tmpdir(&maria_chk_tmpdir); + free_defaults(default_argv); + my_end(0); + sf_leaking_memory=0; + exit(0); + return 0; /* No compiler warning */ + +err: + /* don't touch anything more, in case we hit a bug */ + fprintf(stderr, "%s: FAILED\n", my_progname_short); + free_tmpdir(&maria_chk_tmpdir); + free_defaults(default_argv); + exit(1); +} + + +#include "ma_check_standalone.h" + +enum options_mc { + OPT_CHARSETS_DIR=256, OPT_FORCE_CRASH, OPT_TRANSLOG_BUFFER_SIZE +}; + +static struct my_option my_long_options[] = +{ + {"apply", 'a', + "Apply log to tables: modifies tables! you should make a backup first! " + " Displays a lot of information if not run with --silent", + (uchar **) &opt_apply, (uchar **) &opt_apply, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"character-sets-dir", OPT_CHARSETS_DIR, + "Directory where character sets are.", + (char**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"check", 'c', + "if --display-only, check if record is fully readable (for debugging)", + (uchar **) &opt_check, (uchar **) &opt_check, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', "Output debug log. 
Often the argument is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, + {"force-crash", OPT_FORCE_CRASH, "Force crash after # recovery events", + &maria_recovery_force_crash_counter, 0,0, GET_ULONG, REQUIRED_ARG, + 0, 0, ~(long) 0, 0, 0, 0}, +#endif + {"help", '?', "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"display-only", 'd', "display brief info read from records' header", + &opt_display_only, &opt_display_only, 0, GET_BOOL, + NO_ARG,0, 0, 0, 0, 0, 0}, + { "end-lsn", 'e', "Alias for lsn-redo-end", + &lsn_redo_end_buf, &lsn_redo_end_buf, 0, GET_STR, REQUIRED_ARG, 0, 0, + 0, 0, 0, 0 }, + { "lsn-redo-end", 'e', "Stop applying at this lsn during redo. If " + "this option is used UNDO:s will not be applied unless --lsn-undo-end is " + "given", &lsn_redo_end_buf, + &lsn_redo_end_buf, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0 }, + { "lsn-undo-end", 'E', "Stop applying undo after this lsn has been applied", + &lsn_undo_end_buf, &lsn_undo_end_buf, 0, GET_STR, REQUIRED_ARG, 0, 0, + 0, 0, 0, 0 }, + {"aria-log-dir-path", 'h', + "Path to the directory where to store transactional log", + (char **) &maria_data_root, (char **) &maria_data_root, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + { "page-buffer-size", 'P', + "The size of the buffer used for index blocks for Aria tables", + &opt_page_buffer_size, &opt_page_buffer_size, 0, + GET_ULL, REQUIRED_ARG, PAGE_BUFFER_INIT, + PAGE_BUFFER_INIT, SIZE_T_MAX, MALLOC_OVERHEAD, (long) IO_SIZE, 0}, + { "print-log-control-file", 'l', + "Print the content of the aria_log_control_file", + &opt_print_aria_log_control, &opt_print_aria_log_control, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + { "start-from-lsn", 'o', "Start reading log from this lsn", + &opt_start_from_lsn, &opt_start_from_lsn, + 0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 }, + {"start-from-checkpoint", 'C', "Start applying from last checkpoint", + &opt_start_from_checkpoint, 
&opt_start_from_checkpoint, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"silent", 's', "Print less information during apply/undo phase", + &opt_silent, &opt_silent, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"tables-to-redo", 'T', + "List of tables separated with , that we should apply REDO on. Use this if you only want to recover some tables", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"tmpdir", 't', "Path for temporary files. Multiple paths can be specified, " + "separated by " +#if defined( _WIN32) + "semicolon (;)" +#else + "colon (:)" +#endif + , (char**) &opt_tmpdir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + { "translog-buffer-size", OPT_TRANSLOG_BUFFER_SIZE, + "The size of the buffer used for transaction log for Aria tables", + &opt_translog_buffer_size, &opt_translog_buffer_size, 0, + GET_ULONG, REQUIRED_ARG, (long) TRANSLOG_PAGECACHE_SIZE, + 1024L*1024L, (long) ~(ulong) 0, (long) MALLOC_OVERHEAD, + (long) IO_SIZE, 0}, + {"undo", 'u', + "Apply UNDO records to tables. (disable with --disable-undo). " + "Will be automatically set if lsn-undo-end is used", + (uchar **) &opt_apply_undo, (uchar **) &opt_apply_undo, 0, + GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Print more information during apply/undo phase", + &maria_recovery_verbose, &maria_recovery_verbose, 0, + GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +static void print_version(void) +{ + printf("%s Ver 1.5 for %s on %s\n", + my_progname_short, SYSTEM_TYPE, MACHINE_TYPE); +} + + +static void usage(void) +{ + print_version(); + puts("Copyright (C) 2007 MySQL AB, 2009-2011 Monty Program Ab, 2020 MariaDB Corporation"); + puts("This software comes with ABSOLUTELY NO WARRANTY. 
This is free software,"); + puts("and you are welcome to modify and redistribute it under the GPL license\n"); + + puts("Display or apply log records from a Aria transaction log"); + puts("found in the current directory (for now)"); +#ifndef IDENTICAL_PAGES_AFTER_RECOVERY + puts("\nNote: Aria is compiled without -DIDENTICAL_PAGES_AFTER_RECOVERY\n" + "which means that the table files are not byte-to-byte identical to\n" + "files created during normal execution. This should be ok, except for\n" + "test scripts that tries to compare files before and after recovery."); +#endif + printf("\nUsage: %s OPTIONS [-d | -a] -h `aria_log_directory`\n", + my_progname_short); + printf("or\n"); + printf("Usage: %s OPTIONS -h `aria_log_directory` " + "--print-log-control-file\n\n", + my_progname_short); + + my_print_help(my_long_options); + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + + +static uchar* my_hash_get_string(const uchar *record, size_t *length, + my_bool first __attribute__ ((unused))) +{ + *length= (size_t) (strcend((const char*) record,',')- (const char*) record); + return (uchar*) record; +} + + +static my_bool +get_one_option(const struct my_option *opt, + const char *argument, + const char *filename __attribute__((unused))) +{ + switch (opt->id) { + case '?': + usage(); + exit(0); + case 'V': + print_version(); + exit(0); + case 'E': + opt_apply_undo= TRUE; + break; + case 'T': + { + char *pos; + if (!my_hash_inited(&tables_to_redo)) + { + my_hash_init2(PSI_INSTRUMENT_ME, &tables_to_redo, 16, &my_charset_bin, + 16, 0, 0, my_hash_get_string, 0, 0, HASH_UNIQUE); + } + do + { + pos= strcend(argument, ','); + if (pos != argument) /* Skip empty strings */ + my_hash_insert(&tables_to_redo, (uchar*) argument); + argument= pos+1; + } while (*(pos++)); + break; + } +#ifndef DBUG_OFF + case '#': + DBUG_SET_INITIAL(argument ? 
argument : default_dbug_option); + break; +#endif + } + return 0; +} + +static void get_options(int *argc,char ***argv) +{ + int ho_error; + my_bool need_help= 0, need_abort= 0; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + if (start_from_lsn_buf) + { + if (get_lsn_arg(start_from_lsn_buf, &opt_start_from_lsn, + "start-from-lsn")) + need_abort= 1; + } + if (lsn_redo_end_buf) + { + if (get_lsn_arg(lsn_redo_end_buf, &opt_lsn_redo_end, + "lsn-redo-end")) + need_abort= 1; + } + if (lsn_undo_end_buf) + { + if (get_lsn_arg(lsn_undo_end_buf, &opt_lsn_undo_end, + "lsn-undo-end")) + need_abort= 1; + } + + if (!opt_apply) + opt_apply_undo= FALSE; + if (!opt_apply_undo) + opt_lsn_undo_end= LSN_MAX; + + if (*argc > 0) + { + need_help= 1; + fprintf(stderr, "Too many arguments given\n"); + } + if ((opt_display_only + opt_apply + opt_print_aria_log_control) != 1) + { + need_abort= 1; + fprintf(stderr, + "You must use one and only one of the options 'display-only', \n" + "'print-log-control-file' and 'apply'\n"); + } + + if (need_help || need_abort) + { + fflush(stderr); + if (need_help) + usage(); + exit(1); + } + if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir)) + exit(1); + maria_tmpdir= &maria_chk_tmpdir; +} diff --git a/storage/maria/aria_s3_copy.cc b/storage/maria/aria_s3_copy.cc new file mode 100644 index 00000000..77c41ba4 --- /dev/null +++ b/storage/maria/aria_s3_copy.cc @@ -0,0 +1,348 @@ +/* Copyright (C) 2019 MariaDB corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ + +/* + Allow copying of Aria tables to and from S3 and also delete them from S3 +*/ + +#include <my_global.h> +#include <m_string.h> +#include "maria_def.h" +#include <aria_backup.h> +#include <my_getopt.h> +#include <my_check_opt.h> +#include <mysys_err.h> +#include <mysqld_error.h> +#include <zlib.h> +#include <libmarias3/marias3.h> +#include "s3_func.h" + +static const char *op_types[]= {"to_s3", "from_s3", "delete_from_s3", NullS}; +static TYPELIB op_typelib= {array_elements(op_types)-1,"", op_types, NULL}; +#define OP_IMPOSSIBLE array_elements(op_types) + +static const char *load_default_groups[]= { "aria_s3_copy", 0 }; +static const char *opt_s3_access_key, *opt_s3_secret_key; +static const char *opt_s3_region="eu-north-1"; +static const char *opt_s3_host_name= DEFAULT_AWS_HOST_NAME; +static const char *opt_database; +static const char *opt_s3_bucket="MariaDB"; +static my_bool opt_compression, opt_verbose, opt_force, opt_s3_debug; +static my_bool opt_s3_use_http; +static ulong opt_operation= OP_IMPOSSIBLE, opt_protocol_version= 1; +static ulong opt_block_size; +static ulong opt_s3_port; +static char **default_argv=0; +static ms3_st *global_s3_client= 0; + + +static struct my_option my_long_options[] = +{ + {"help", '?', "Display this help and exit.", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, + 0, 0, 0, 0, 0}, + {"s3_access_key", 'k', "AWS access key ID", + (char**) &opt_s3_access_key, (char**) &opt_s3_access_key, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"s3_region", 'r', "AWS region", + (char**) &opt_s3_region, (char**) &opt_s3_region, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"s3_secret_key", 'K', "AWS secret access key ID", + (char**) &opt_s3_secret_key, (char**) &opt_s3_secret_key, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + 
{"s3_bucket", 'b', "AWS prefix for tables", + (char**) &opt_s3_bucket, (char**) &opt_s3_bucket, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"s3_host_name", 'h', "Host name to S3 provider", + (char**) &opt_s3_host_name, (char**) &opt_s3_host_name, 0, + GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"s3_port", 'p', "Port number to connect to (0 means use default)", + (char**) &opt_s3_port, (char**) &opt_s3_port, 0, GET_ULONG, REQUIRED_ARG, + 0, 0, 65536, 0, 1, 0 }, + {"s3_use_http", 'P', "If true, force use of HTTP protocol", + (char**) &opt_s3_use_http, (char**) &opt_s3_use_http, + 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"compress", 'c', "Use compression", &opt_compression, &opt_compression, + 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"op", 'o', "Operation to execute. One of 'from_s3', 'to_s3' or " + "'delete_from_s3'", + &opt_operation, &opt_operation, &op_typelib, + GET_ENUM, REQUIRED_ARG, OP_IMPOSSIBLE, 0, 0, 0, 0, 0}, + {"database", 'd', + "Database for copied table (second prefix). " + "If not given, the directory of the table file is used", + &opt_database, &opt_database, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"s3_block_size", 'B', "Block size for data/index blocks in s3", + &opt_block_size, &opt_block_size, 0, GET_ULONG, REQUIRED_ARG, + 4*1024*1024, 64*1024, 16*1024*1024, MALLOC_OVERHEAD, 1024, 0 }, + {"s3_protocol_version", 'L', + "Protocol used to communication with S3. One of \"Auto\", \"Amazon\" or \"Original\".", + &opt_protocol_version, &opt_protocol_version, &s3_protocol_typelib, + GET_ENUM, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"force", 'f', "Force copy even if target exists", + &opt_force, &opt_force, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Write more information", &opt_verbose, &opt_verbose, + 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', "Output debug log. 
Often this is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"s3_debug",0, "Output debug log from marias3 to stdout", + &opt_s3_debug, &opt_s3_debug, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +}; + + +static bool get_database_from_path(char *to, size_t to_length, const char *path); + + +static void print_version(void) +{ + printf("%s Ver 1.0 for %s on %s\n", my_progname, SYSTEM_TYPE, + MACHINE_TYPE); +} + +static void usage(void) +{ + print_version(); + puts("\nThis software comes with NO WARRANTY: " + " see the PUBLIC for details.\n"); + puts("Copy an Aria table to and from s3"); + printf("Usage: %s --aws-access-key=# --aws-secret-access-key=# --aws-region=# " + "--op=(from_s3 | to_s3 | delete_from_s3) [OPTIONS] tables[.MAI]\n", + my_progname_short); + print_defaults("my", load_default_groups); + puts(""); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} + + +ATTRIBUTE_NORETURN static void my_exit(int exit_code) +{ + if (global_s3_client) + { + ms3_deinit(global_s3_client); + global_s3_client= 0; + } + free_defaults(default_argv); + s3_deinit_library(); + my_end(MY_CHECK_ERROR); + exit(exit_code); +} + +extern "C" my_bool get_one_option(const struct my_option *opt + __attribute__((unused)), + const char *argument, const char *filename) +{ + switch (opt->id) { + case 'V': + print_version(); + my_exit(0); + case '?': + usage(); + my_exit(0); + case '#': + DBUG_SET_INITIAL(argument ? 
argument : "d:t:o,/tmp/aria_s3_copy.trace"); + break; + } + return 0; +} + + +static void get_options(int *argc, char ***argv) +{ + int ho_error; + + load_defaults_or_exit("my", load_default_groups, argc, argv); + default_argv= *argv; + + if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option))) + my_exit(ho_error); + + if (*argc == 0) + { + usage(); + my_exit(-1); + } + + if (!opt_s3_access_key) + { + fprintf(stderr, "--aws-access-key was not given\n"); + my_exit(-1); + } + if (!opt_s3_secret_key) + { + fprintf(stderr, "--aws-secret-access-key was not given\n"); + my_exit(-1); + } + if (opt_operation == OP_IMPOSSIBLE) + { + fprintf(stderr, "You must specify an operation with --op=[from_s3|to_s3|delete_from_s3]\n"); + my_exit(-1); + } + if (opt_s3_debug) + ms3_debug(); + +} /* get_options */ + + +int main(int argc, char** argv) +{ + MY_INIT(argv[0]); + get_options(&argc,(char***) &argv); + size_t block_size= opt_block_size; + + s3_init_library(); + if (!(global_s3_client= ms3_init(opt_s3_access_key, + opt_s3_secret_key, + opt_s3_region, opt_s3_host_name))) + { + fprintf(stderr, "Can't open connection to S3, error: %d %s", errno, + ms3_error(errno)); + my_exit(1); + } + + ms3_set_option(global_s3_client, MS3_OPT_BUFFER_CHUNK_SIZE, &block_size); + + if (opt_protocol_version) + { + uint8_t protocol_version= (uint8_t) opt_protocol_version; + ms3_set_option(global_s3_client, MS3_OPT_FORCE_PROTOCOL_VERSION, + &protocol_version); + } + if (opt_s3_port) + { + int port= (int) opt_s3_port; + ms3_set_option(global_s3_client, MS3_OPT_PORT_NUMBER, &port); + } + if (opt_s3_use_http) + ms3_set_option(global_s3_client, MS3_OPT_USE_HTTP, NULL); + + + for (; *argv ; argv++) + { + char database[FN_REFLEN], table_name[FN_REFLEN], *path; + const char *db; + + path= *argv; + + fn_format(table_name, path, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT); + + /* Get database from option, path or current directory */ + if (!(db= opt_database)) + { + if 
(get_database_from_path(database, sizeof(database), path)) + { + fprintf(stderr, "Aborting copying of %s\n", path); + my_exit(-1); + } + db= database; + } + + switch (opt_operation) { + case 0: + /* Don't copy .frm file for partioned table */ + if (aria_copy_to_s3(global_s3_client, opt_s3_bucket, path, + db, table_name, opt_block_size, opt_compression, + opt_force, opt_verbose, !strstr(table_name, "#P#"))) + { + fprintf(stderr, "Aborting copying of %s\n", path); + my_exit(-1); + } + break; + case 1: + if (aria_copy_from_s3(global_s3_client, opt_s3_bucket, path, + db, opt_compression, opt_force, opt_verbose)) + { + fprintf(stderr, "Aborting copying of %s\n", path); + my_exit(-1); + } + break; + case 2: + if (aria_delete_from_s3(global_s3_client, opt_s3_bucket, db, + table_name, opt_verbose)) + { + fprintf(stderr, "Aborting copying of %s\n", path); + my_exit(-1); + } + break; + } + } + my_exit(0); + return 0; +} + + +/** + Calculate database name base on path of Aria file + + @return 0 ok + @return 1 error +*/ + +static bool get_database_from_path(char *to, size_t to_length, + const char *path) +{ + S3_INFO s3; + if (!set_database_and_table_from_path(&s3, path)) + { + strmake(to, s3.database.str, MY_MIN(s3.database.length, to_length-1)); + return 0; + } + + if (my_getwd(to, to_length-1, MYF(MY_WME))) + return 1; + return get_database_from_path(to, to_length, to); +} + + +#include "ma_check_standalone.h" + +/* + Declare all symbols from libmyisam.a, to ensure that we don't have + to include the library as it pulls in ha_myisam.cc +*/ + +const char *ft_boolean_syntax= 0; +ulong ft_min_word_len=0, ft_max_word_len=0; +const HA_KEYSEG ft_keysegs[FT_SEGS]= { +{ + 0, /* charset */ + HA_FT_WLEN, /* start */ + 0, /* null_pos */ + 0, /* Bit pos */ + HA_VAR_LENGTH_PART | HA_PACK_KEY, /* flag */ + HA_FT_MAXBYTELEN, /* length */ + 63, /* language (will be overwritten +) */ + HA_KEYTYPE_VARTEXT2, /* type */ + 0, /* null_bit */ + 2, 0 /* bit_start, bit_length */ +}, +{ + 0, 0, 0, 
0, HA_NO_SORT, HA_FT_WLEN, 63, HA_FT_WTYPE, 0, 0, 0 +} +}; + +struct st_mysql_ftparser ft_default_parser= +{ + MYSQL_FTPARSER_INTERFACE_VERSION, 0, 0, 0 +}; + +C_MODE_START +int is_stopword(const char *word, size_t len) { return 0; } +C_MODE_END diff --git a/storage/maria/file_formats.txt b/storage/maria/file_formats.txt new file mode 100644 index 00000000..927e8ad9 --- /dev/null +++ b/storage/maria/file_formats.txt @@ -0,0 +1,71 @@ +# +# This should contain a description of the file format for most Maria files +# + +# Description of the header in the index file + +Header, 24 bytes + +Pos Length + +0 4 file_version +4 2 options +6 2 header_length +8 2 state_info_length +10 2 base_info_length +12 2 base_pos +14 2 key_parts +16 2 unique_key_parts +18 1 keys +19 1 uniques +20 1 language +21 1 fulltext_keys +22 1 data_file_type +23 1 org_data_file_type + + +Status part + +24 2 open_count +26 2 state_changed +28 7 create_rename_lsn + 7 is_of_horizon + 7 skip_redo_lsn + 8 state.records + 8 state->state.del + 8 state->split + 8 state->dellink + 8 state->first_bitmap_with_space + 8 state->state.key_file_length + 8 state->state.data_file_length + 8 state->state.empty + 8 state->state.key_empty + 8 state->auto_increment + 8 state->state.checksum + 4 state->process + 4 state->unique + 4 state->status + 4 state->update_count + + 1 state->sortkey + 1 reserved + +for each key + 8 state->key_root[i] + + 8 state->key_del + 4 state->sec_index_changed + 4 state->sec_index_used + 4 state->version + 8 state->key_map + 8 state->create_time + 8 state->recover_time + 8 state->check_time + 8 state->records_at_analyze + +for each key + 4 reserved + +for each key part + 8 state->rec_per_key_part[i] + 4 state->nulls_per_key_part[i] diff --git a/storage/maria/ft_maria.c b/storage/maria/ft_maria.c new file mode 100644 index 00000000..1b07a8d5 --- /dev/null +++ b/storage/maria/ft_maria.c @@ -0,0 +1,48 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is 
free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* + This function is for interface functions between fulltext and maria +*/ + +#include "ma_ftdefs.h" + +FT_INFO *maria_ft_init_search(uint flags, void *info, uint keynr, + uchar *query, size_t query_len, + CHARSET_INFO *cs, uchar *record) +{ + FT_INFO *res; + if (flags & FT_BOOL) + res= maria_ft_init_boolean_search((MARIA_HA *) info, keynr, query, + (uint)query_len, cs); + else + res= maria_ft_init_nlq_search((MARIA_HA *) info, keynr, query, (uint)query_len, + flags, record); + return res; +} + +const struct _ft_vft _ma_ft_vft_nlq = { + maria_ft_nlq_read_next, maria_ft_nlq_find_relevance, + maria_ft_nlq_close_search, maria_ft_nlq_get_relevance, + maria_ft_nlq_reinit_search +}; +const struct _ft_vft _ma_ft_vft_boolean = { + maria_ft_boolean_read_next, maria_ft_boolean_find_relevance, + maria_ft_boolean_close_search, maria_ft_boolean_get_relevance, + maria_ft_boolean_reinit_search +}; + diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc new file mode 100644 index 00000000..66dd9867 --- /dev/null +++ b/storage/maria/ha_maria.cc @@ -0,0 +1,4336 @@ +/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (C) 2008-2009 Sun Microsystems, Inc. 
+ Copyright (c) 2009, 2021, MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + + +#ifdef USE_PRAGMA_IMPLEMENTATION +#pragma implementation // gcc: Class implementation +#endif + +#define MYSQL_SERVER 1 +#include <my_global.h> +#include <m_ctype.h> +#include <my_dir.h> +#include <myisampack.h> +#include <my_bit.h> +#include "ha_maria.h" +#include "trnman_public.h" +#include "trnman.h" + +C_MODE_START +#include "maria_def.h" +#include "ma_rt_index.h" +#include "ma_blockrec.h" +#include "ma_checkpoint.h" +#include "ma_recovery.h" +C_MODE_END +#include "ma_trnman.h" + +//#include "sql_priv.h" +#include "protocol.h" +#include "sql_class.h" +#include "key.h" +#include "log.h" +#include "sql_parse.h" +#include "debug.h" + +/* + Note that in future versions, only *transactional* Maria tables can + rollback, so this flag should be up or down conditionally. 
+*/ +#ifdef ARIA_HAS_TRANSACTIONS +#define TRANSACTION_STATE +#else +#define TRANSACTION_STATE HA_NO_TRANSACTIONS +#endif + +#define THD_TRN (TRN*) thd_get_ha_data(thd, maria_hton) + +ulong pagecache_division_limit, pagecache_age_threshold, pagecache_file_hash_size; +ulonglong pagecache_buffer_size; +const char *zerofill_error_msg= + "Table is probably from another system and must be zerofilled or repaired ('REPAIR TABLE table_name') to be usable on this system"; + +/** + As the auto-repair is initiated when opened from the SQL layer + (open_unireg_entry(), check_and_repair()), it does not happen when Maria's + Recovery internally opens the table to apply log records to it, which is + good. It would happen only after Recovery, if the table is still + corrupted. +*/ +ulonglong maria_recover_options= HA_RECOVER_NONE; +handlerton *maria_hton; + +/* bits in maria_recover_options */ +const char *maria_recover_names[]= +{ + /* + Compared to MyISAM, "default" was renamed to "normal" as it collided with + SET var=default which sets to the var's default i.e. what happens when the + var is not set i.e. HA_RECOVER_NONE. + OFF flag is ignored. 
+ */ + "NORMAL", "BACKUP", "FORCE", "QUICK", "OFF", NullS +}; +TYPELIB maria_recover_typelib= +{ + array_elements(maria_recover_names) - 1, "", + maria_recover_names, NULL +}; + +const char *maria_stats_method_names[]= +{ + "nulls_unequal", "nulls_equal", + "nulls_ignored", NullS +}; +TYPELIB maria_stats_method_typelib= +{ + array_elements(maria_stats_method_names) - 1, "", + maria_stats_method_names, NULL +}; + +/* transactions log purge mode */ +const char *maria_translog_purge_type_names[]= +{ + "immediate", "external", "at_flush", NullS +}; +TYPELIB maria_translog_purge_type_typelib= +{ + array_elements(maria_translog_purge_type_names) - 1, "", + maria_translog_purge_type_names, NULL +}; + +/* transactional log directory sync */ +const char *maria_sync_log_dir_names[]= +{ + "NEVER", "NEWFILE", "ALWAYS", NullS +}; +TYPELIB maria_sync_log_dir_typelib= +{ + array_elements(maria_sync_log_dir_names) - 1, "", + maria_sync_log_dir_names, NULL +}; + +/* transactional log group commit */ +const char *maria_group_commit_names[]= +{ + "none", "hard", "soft", NullS +}; +TYPELIB maria_group_commit_typelib= +{ + array_elements(maria_group_commit_names) - 1, "", + maria_group_commit_names, NULL +}; + +/** Interval between background checkpoints in seconds */ +static ulong checkpoint_interval; +static void update_checkpoint_interval(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save); +static void update_maria_group_commit(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save); +static void update_maria_group_commit_interval(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save); +/** After that many consecutive recovery failures, remove logs */ +static ulong force_start_after_recovery_failures; +static void update_log_file_size(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save); + +/* The 4096 is there because of MariaDB privilege tables */ +static 
MYSQL_SYSVAR_ULONG(block_size, maria_block_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Block size to be used for Aria index pages.", 0, 0, + MARIA_KEY_BLOCK_LENGTH, 4096, + MARIA_MAX_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH); + +static MYSQL_SYSVAR_ULONG(checkpoint_interval, checkpoint_interval, + PLUGIN_VAR_RQCMDARG, + "Interval between tries to do an automatic checkpoints. In seconds; 0 means" + " 'no automatic checkpoints' which makes sense only for testing.", + NULL, update_checkpoint_interval, 30, 0, UINT_MAX, 1); + +static MYSQL_SYSVAR_ULONG(checkpoint_log_activity, maria_checkpoint_min_log_activity, + PLUGIN_VAR_RQCMDARG, + "Number of bytes that the transaction log has to grow between checkpoints before a new " + "checkpoint is written to the log.", + NULL, NULL, 1024*1024, 0, UINT_MAX, 1); + +static MYSQL_SYSVAR_ULONG(force_start_after_recovery_failures, + force_start_after_recovery_failures, + /* + Read-only because setting it on the fly has no useful effect, + should be set on command-line. 
+ */ + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of consecutive log recovery failures after which logs will be" + " automatically deleted to cure the problem; 0 (the default) disables" + " the feature.", NULL, NULL, 0, 0, UINT_MAX8, 1); + +static MYSQL_SYSVAR_BOOL(page_checksum, maria_page_checksums, 0, + "Maintain page checksums (can be overridden per table " + "with PAGE_CHECKSUM clause in CREATE TABLE)", 0, 0, 1); + +/* It is only command line argument */ +static MYSQL_SYSVAR_CONST_STR(log_dir_path, maria_data_root, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to the directory where to store transactional log", + NULL, NULL, mysql_real_data_home); + +static MYSQL_SYSVAR_ULONG(log_file_size, log_file_size, + PLUGIN_VAR_RQCMDARG, + "Limit for transaction log size", + NULL, update_log_file_size, TRANSLOG_FILE_SIZE, + TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE); + +static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit, + PLUGIN_VAR_RQCMDARG, + "Specifies Aria group commit mode. " + "Possible values are \"none\" (no group commit), " + "\"hard\" (with waiting to actual commit), " + "\"soft\" (no wait for commit (DANGEROUS!!!))", + NULL, update_maria_group_commit, + TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib); + +static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval, + PLUGIN_VAR_RQCMDARG, + "Interval between commits in microseconds (1/1000000 sec)." + " 0 stands for no waiting" + " for other threads to come and do a commit in \"hard\" mode and no" + " sync()/commit at all in \"soft\" mode. 
Option has only an effect" + " if aria_group_commit is used", + NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1); + +static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type, + PLUGIN_VAR_RQCMDARG, + "Specifies how Aria transactional log will be purged", + NULL, NULL, TRANSLOG_PURGE_IMMIDIATE, + &maria_translog_purge_type_typelib); + +static MYSQL_SYSVAR_ULONGLONG(max_sort_file_size, + maria_max_temp_length, PLUGIN_VAR_RQCMDARG, + "Don't use the fast sort index method to created index if the " + "temporary file would get bigger than this.", + 0, 0, MAX_FILE_SIZE & ~((ulonglong) (1*MB-1)), + 0, MAX_FILE_SIZE, 1*MB); + +static MYSQL_SYSVAR_ULONG(pagecache_age_threshold, + pagecache_age_threshold, PLUGIN_VAR_RQCMDARG, + "This characterizes the number of hits a hot block has to be untouched " + "until it is considered aged enough to be downgraded to a warm block. " + "This specifies the percentage ratio of that number of hits to the " + "total number of blocks in the page cache.", 0, 0, + 300, 100, ~ (ulong) 0L, 100); + +static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, pagecache_buffer_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The size of the buffer used for index blocks for Aria tables. " + "Increase this to get better index handling (for all reads and " + "multiple writes) to as much as you can afford.", 0, 0, + KEY_CACHE_SIZE, 8192*16L, ~(ulonglong) 0, 1); + +static MYSQL_SYSVAR_ULONG(pagecache_division_limit, pagecache_division_limit, + PLUGIN_VAR_RQCMDARG, + "The minimum percentage of warm blocks in key cache", 0, 0, + 100, 1, 100, 1); + +static MYSQL_SYSVAR_ULONG(pagecache_file_hash_size, pagecache_file_hash_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of hash buckets for open and changed files. If you have a lot of Aria " + "files open you should increase this for faster flush of changes. 
A good " + "value is probably 1/10 of number of possible open Aria files.", 0,0, + 512, 128, 16384, 1); + +static MYSQL_SYSVAR_SET(recover_options, maria_recover_options, PLUGIN_VAR_OPCMDARG, + "Specifies how corrupted tables should be automatically repaired", + NULL, NULL, HA_RECOVER_BACKUP|HA_RECOVER_QUICK, &maria_recover_typelib); + +static MYSQL_THDVAR_ULONG(repair_threads, PLUGIN_VAR_RQCMDARG, + "Number of threads to use when repairing Aria tables. The value of 1 " + "disables parallel repair.", + 0, 0, 1, 1, 128, 1); + +static MYSQL_THDVAR_ULONGLONG(sort_buffer_size, PLUGIN_VAR_RQCMDARG, + "The buffer that is allocated when sorting the index when doing a " + "REPAIR or when creating indexes with CREATE INDEX or ALTER TABLE.", + NULL, NULL, + SORT_BUFFER_INIT, MARIA_MIN_SORT_MEMORY, SIZE_T_MAX/16, 1); + +static MYSQL_THDVAR_ENUM(stats_method, PLUGIN_VAR_RQCMDARG, + "Specifies how Aria index statistics collection code should treat " + "NULLs", 0, 0, 0, &maria_stats_method_typelib); + +static MYSQL_SYSVAR_ENUM(sync_log_dir, sync_log_dir, PLUGIN_VAR_RQCMDARG, + "Controls syncing directory after log file growth and new file " + "creation", NULL, NULL, TRANSLOG_SYNC_DIR_NEWFILE, + &maria_sync_log_dir_typelib); + +#ifdef USE_ARIA_FOR_TMP_TABLES +#define USE_ARIA_FOR_TMP_TABLES_VAL 1 +#else +#define USE_ARIA_FOR_TMP_TABLES_VAL 0 +#endif +my_bool use_maria_for_temp_tables= USE_ARIA_FOR_TMP_TABLES_VAL; + +static MYSQL_SYSVAR_BOOL(used_for_temp_tables, + use_maria_for_temp_tables, PLUGIN_VAR_READONLY | PLUGIN_VAR_NOCMDOPT, + "Whether temporary tables should be MyISAM or Aria", 0, 0, + 1); + +static MYSQL_SYSVAR_BOOL(encrypt_tables, maria_encrypt_tables, + PLUGIN_VAR_OPCMDARG, + "Encrypt tables (only for tables with ROW_FORMAT=PAGE (default) " + "and not FIXED/DYNAMIC)", + 0, 0, 0); + +#if defined HAVE_PSI_INTERFACE && !defined EMBEDDED_LIBRARY + +static PSI_mutex_info all_aria_mutexes[]= +{ + { &key_THR_LOCK_maria, "THR_LOCK_maria", PSI_FLAG_GLOBAL}, + { 
&key_LOCK_soft_sync, "LOCK_soft_sync", PSI_FLAG_GLOBAL}, + { &key_LOCK_trn_list, "LOCK_trn_list", PSI_FLAG_GLOBAL}, + { &key_SHARE_BITMAP_lock, "SHARE::bitmap::bitmap_lock", 0}, + { &key_SORT_INFO_mutex, "SORT_INFO::mutex", 0}, + { &key_TRANSLOG_BUFFER_mutex, "TRANSLOG_BUFFER::mutex", 0}, + { &key_TRANSLOG_DESCRIPTOR_dirty_buffer_mask_lock, "TRANSLOG_DESCRIPTOR::dirty_buffer_mask_lock", 0}, + { &key_TRANSLOG_DESCRIPTOR_sent_to_disk_lock, "TRANSLOG_DESCRIPTOR::sent_to_disk_lock", 0}, + { &key_TRANSLOG_DESCRIPTOR_log_flush_lock, "TRANSLOG_DESCRIPTOR::log_flush_lock", 0}, + { &key_TRANSLOG_DESCRIPTOR_file_header_lock, "TRANSLOG_DESCRIPTOR::file_header_lock", 0}, + { &key_TRANSLOG_DESCRIPTOR_unfinished_files_lock, "TRANSLOG_DESCRIPTOR::unfinished_files_lock", 0}, + { &key_TRANSLOG_DESCRIPTOR_purger_lock, "TRANSLOG_DESCRIPTOR::purger_lock", 0}, + { &key_SHARE_intern_lock, "SHARE::intern_lock", 0}, + { &key_SHARE_key_del_lock, "SHARE::key_del_lock", 0}, + { &key_SHARE_close_lock, "SHARE::close_lock", 0}, + { &key_SERVICE_THREAD_CONTROL_lock, "SERVICE_THREAD_CONTROL::LOCK_control", 0}, + { &key_TRN_state_lock, "TRN::state_lock", 0}, + { &key_PAGECACHE_cache_lock, "PAGECACHE::cache_lock", 0} +}; + +static PSI_cond_info all_aria_conds[]= +{ + { &key_COND_soft_sync, "COND_soft_sync", PSI_FLAG_GLOBAL}, + { &key_SHARE_key_del_cond, "SHARE::key_del_cond", 0}, + { &key_SERVICE_THREAD_CONTROL_cond, "SERVICE_THREAD_CONTROL::COND_control", 0}, + { &key_SORT_INFO_cond, "SORT_INFO::cond", 0}, + { &key_SHARE_BITMAP_cond, "BITMAP::bitmap_cond", 0}, + { &key_TRANSLOG_BUFFER_waiting_filling_buffer, "TRANSLOG_BUFFER::waiting_filling_buffer", 0}, + { &key_TRANSLOG_BUFFER_prev_sent_to_disk_cond, "TRANSLOG_BUFFER::prev_sent_to_disk_cond", 0}, + { &key_TRANSLOG_DESCRIPTOR_log_flush_cond, "TRANSLOG_DESCRIPTOR::log_flush_cond", 0}, + { &key_TRANSLOG_DESCRIPTOR_new_goal_cond, "TRANSLOG_DESCRIPTOR::new_goal_cond", 0} +}; + +static PSI_rwlock_info all_aria_rwlocks[]= +{ + { &key_KEYINFO_root_lock, 
"KEYINFO::root_lock", 0}, + { &key_SHARE_mmap_lock, "SHARE::mmap_lock", 0}, + { &key_TRANSLOG_DESCRIPTOR_open_files_lock, "TRANSLOG_DESCRIPTOR::open_files_lock", 0} +}; + +static PSI_thread_info all_aria_threads[]= +{ + { &key_thread_checkpoint, "checkpoint_background", PSI_FLAG_GLOBAL}, + { &key_thread_soft_sync, "soft_sync_background", PSI_FLAG_GLOBAL}, + { &key_thread_find_all_keys, "thr_find_all_keys", 0} +}; + +static PSI_file_info all_aria_files[]= +{ + { &key_file_translog, "translog", 0}, + { &key_file_kfile, "MAI", 0}, + { &key_file_dfile, "MAD", 0}, + { &key_file_control, "control", PSI_FLAG_GLOBAL} +}; + +# ifdef HAVE_PSI_STAGE_INTERFACE +static PSI_stage_info *all_aria_stages[]= +{ + & stage_waiting_for_a_resource +}; +# endif /* HAVE_PSI_STAGE_INTERFACE */ + +static void init_aria_psi_keys(void) +{ + const char* category= "aria"; + int count; + + count= array_elements(all_aria_mutexes); + mysql_mutex_register(category, all_aria_mutexes, count); + + count= array_elements(all_aria_rwlocks); + mysql_rwlock_register(category, all_aria_rwlocks, count); + + count= array_elements(all_aria_conds); + mysql_cond_register(category, all_aria_conds, count); + + count= array_elements(all_aria_threads); + mysql_thread_register(category, all_aria_threads, count); + + count= array_elements(all_aria_files); + mysql_file_register(category, all_aria_files, count); +# ifdef HAVE_PSI_STAGE_INTERFACE + count= array_elements(all_aria_stages); + mysql_stage_register(category, all_aria_stages, count); +# endif /* HAVE_PSI_STAGE_INTERFACE */ +} +#else +#define init_aria_psi_keys() /* no-op */ +#endif /* HAVE_PSI_INTERFACE */ + +const LEX_CSTRING MA_CHECK_INFO= { STRING_WITH_LEN("info") }; +const LEX_CSTRING MA_CHECK_WARNING= { STRING_WITH_LEN("warning") }; +const LEX_CSTRING MA_CHECK_ERROR= { STRING_WITH_LEN("error") }; + +/***************************************************************************** +** MARIA tables 
+*****************************************************************************/ + +static handler *maria_create_handler(handlerton *hton, + TABLE_SHARE * table, + MEM_ROOT *mem_root) +{ + return new (mem_root) ha_maria(hton, table); +} + + +static void _ma_check_print(HA_CHECK *param, const LEX_CSTRING *msg_type, + const char *msgbuf) +{ + if (msg_type == &MA_CHECK_INFO) + sql_print_information("%s.%s: %s", param->db_name, param->table_name, + msgbuf); + else if (msg_type == &MA_CHECK_WARNING) + sql_print_warning("%s.%s: %s", param->db_name, param->table_name, + msgbuf); + else + sql_print_error("%s.%s: %s", param->db_name, param->table_name, msgbuf); +} + + +// collect errors printed by maria_check routines + +static void _ma_check_print_msg(HA_CHECK *param, const LEX_CSTRING *msg_type, + const char *fmt, va_list args) +{ + THD *thd= (THD *) param->thd; + Protocol *protocol= thd->protocol; + size_t length, msg_length; + char msgbuf[MYSQL_ERRMSG_SIZE]; + char name[NAME_LEN * 2 + 2]; + + if (param->testflag & T_SUPPRESS_ERR_HANDLING) + return; + + msg_length= my_vsnprintf(msgbuf, sizeof(msgbuf), fmt, args); + msgbuf[sizeof(msgbuf) - 1]= 0; // healthy paranoia + + DBUG_PRINT(msg_type->str, ("message: %s", msgbuf)); + + if (!thd->vio_ok()) + { + _ma_check_print(param, msg_type, msgbuf); + return; + } + + if (param->testflag & + (T_CREATE_MISSING_KEYS | T_SAFE_REPAIR | T_AUTO_REPAIR)) + { + myf flag= 0; + if (msg_type == &MA_CHECK_INFO) + flag= ME_NOTE; + else if (msg_type == &MA_CHECK_WARNING) + flag= ME_WARNING; + my_message(ER_NOT_KEYFILE, msgbuf, MYF(flag)); + if (thd->variables.log_warnings > 2) + _ma_check_print(param, msg_type, msgbuf); + return; + } + length= (uint) (strxmov(name, param->db_name, ".", param->table_name, + NullS) - name); + /* + TODO: switch from protocol to push_warning here. The main reason we didn't + it yet is parallel repair, which threads have no THD object accessible via + current_thd. 
+ + Also we likely need to lock mutex here (in both cases with protocol and + push_warning). + */ + protocol->prepare_for_resend(); + protocol->store(name, (uint)length, system_charset_info); + protocol->store(param->op_name, strlen(param->op_name), system_charset_info); + protocol->store(msg_type, system_charset_info); + protocol->store(msgbuf, msg_length, system_charset_info); + if (protocol->write()) + sql_print_error("Failed on my_net_write, writing to stderr instead: %s.%s: %s\n", + param->db_name, param->table_name, msgbuf); + else if (thd->variables.log_warnings > 2) + _ma_check_print(param, msg_type, msgbuf); + + return; +} + + +/* + Convert TABLE object to Maria key and column definition + + SYNOPSIS + table2maria() + table_arg in TABLE object. + keydef_out out Maria key definition. + recinfo_out out Maria column definition. + records_out out Number of fields. + + DESCRIPTION + This function will allocate and initialize Maria key and column + definition for further use in ma_create or for a check for underlying + table conformance in merge engine. + + The caller needs to free *recinfo_out after use. Since *recinfo_out + and *keydef_out are allocated with a my_multi_malloc, *keydef_out + is freed automatically when *recinfo_out is freed. 
+ + RETURN VALUE + 0 OK + # error code +*/ + +static int table2maria(TABLE *table_arg, data_file_type row_type, + MARIA_KEYDEF **keydef_out, + MARIA_COLUMNDEF **recinfo_out, uint *records_out, + MARIA_CREATE_INFO *create_info) +{ + uint i, j, recpos, minpos, fieldpos, temp_length, length; + enum ha_base_keytype type= HA_KEYTYPE_BINARY; + uchar *record; + KEY *pos; + MARIA_KEYDEF *keydef; + MARIA_COLUMNDEF *recinfo, *recinfo_pos; + HA_KEYSEG *keyseg; + TABLE_SHARE *share= table_arg->s; + uint options= share->db_options_in_use; + DBUG_ENTER("table2maria"); + + if (row_type == BLOCK_RECORD) + options|= HA_OPTION_PACK_RECORD; + + if (!(my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME), + recinfo_out, (share->fields * 2 + 2) * sizeof(MARIA_COLUMNDEF), + keydef_out, share->keys * sizeof(MARIA_KEYDEF), + &keyseg, + (share->key_parts + share->keys) * sizeof(HA_KEYSEG), + NullS))) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); /* purecov: inspected */ + keydef= *keydef_out; + recinfo= *recinfo_out; + pos= table_arg->key_info; + for (i= 0; i < share->keys; i++, pos++) + { + keydef[i].flag= (uint16) (pos->flags & (HA_NOSAME | HA_FULLTEXT | + HA_SPATIAL)); + keydef[i].key_alg= pos->algorithm == HA_KEY_ALG_UNDEF ? + (pos->flags & HA_SPATIAL ? 
HA_KEY_ALG_RTREE : HA_KEY_ALG_BTREE) : + pos->algorithm; + keydef[i].block_length= pos->block_size; + keydef[i].seg= keyseg; + keydef[i].keysegs= pos->user_defined_key_parts; + for (j= 0; j < pos->user_defined_key_parts; j++) + { + Field *field= pos->key_part[j].field; + + if (!table_arg->field[field->field_index]->stored_in_db()) + { + my_free(*recinfo_out); + if (table_arg->s->long_unique_table) + { + my_error(ER_TOO_LONG_KEY, MYF(0), table_arg->file->max_key_length()); + DBUG_RETURN(HA_ERR_INDEX_COL_TOO_LONG); + } + my_error(ER_KEY_BASED_ON_GENERATED_VIRTUAL_COLUMN, MYF(0)); + DBUG_RETURN(HA_ERR_UNSUPPORTED); + } + + type= field->key_type(); + keydef[i].seg[j].flag= pos->key_part[j].key_part_flag; + + if (options & HA_OPTION_PACK_KEYS || + (pos->flags & (HA_PACK_KEY | HA_BINARY_PACK_KEY | + HA_SPACE_PACK_USED))) + { + if (pos->key_part[j].length > 8 && + (type == HA_KEYTYPE_TEXT || + type == HA_KEYTYPE_NUM || + (type == HA_KEYTYPE_BINARY && !field->zero_pack()))) + { + /* No blobs here */ + if (j == 0) + keydef[i].flag|= HA_PACK_KEY; + if (!(field->flags & ZEROFILL_FLAG) && + (field->type() == MYSQL_TYPE_STRING || + field->type() == MYSQL_TYPE_VAR_STRING || + ((int) (pos->key_part[j].length - field->decimals())) >= 4)) + keydef[i].seg[j].flag|= HA_SPACE_PACK; + } + else if (j == 0 && (!(pos->flags & HA_NOSAME) || pos->key_length > 16)) + keydef[i].flag|= HA_BINARY_PACK_KEY; + } + keydef[i].seg[j].type= (int) type; + keydef[i].seg[j].start= pos->key_part[j].offset; + keydef[i].seg[j].length= pos->key_part[j].length; + keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_length= 0; + keydef[i].seg[j].bit_pos= 0; + keydef[i].seg[j].language= field->charset()->number; + + if (field->null_ptr) + { + keydef[i].seg[j].null_bit= field->null_bit; + keydef[i].seg[j].null_pos= (uint) (field->null_ptr- + (uchar*) table_arg->record[0]); + } + else + { + keydef[i].seg[j].null_bit= 0; + keydef[i].seg[j].null_pos= 0; + } + if (field->type() == MYSQL_TYPE_BLOB || + field->type() == 
MYSQL_TYPE_GEOMETRY) + { + keydef[i].seg[j].flag|= HA_BLOB_PART; + /* save number of bytes used to pack length */ + keydef[i].seg[j].bit_start= (uint) (field->pack_length() - + portable_sizeof_char_ptr); + } + else if (field->type() == MYSQL_TYPE_BIT) + { + keydef[i].seg[j].bit_length= ((Field_bit *) field)->bit_len; + keydef[i].seg[j].bit_start= ((Field_bit *) field)->bit_ofs; + keydef[i].seg[j].bit_pos= (uint) (((Field_bit *) field)->bit_ptr - + (uchar*) table_arg->record[0]); + } + } + keyseg+= pos->user_defined_key_parts; + } + if (table_arg->found_next_number_field) + keydef[share->next_number_index].flag|= HA_AUTO_KEY; + record= table_arg->record[0]; + recpos= 0; + recinfo_pos= recinfo; + create_info->null_bytes= table_arg->s->null_bytes; + + while (recpos < (uint) share->stored_rec_length) + { + Field **field, *found= 0; + minpos= share->reclength; + length= 0; + + for (field= table_arg->field; *field; field++) + { + if ((fieldpos= (*field)->offset(record)) >= recpos && + fieldpos <= minpos) + { + /* skip null fields */ + if (!(temp_length= (*field)->pack_length_in_rec())) + continue; /* Skip null-fields */ + if (! found || fieldpos < minpos || + (fieldpos == minpos && temp_length < length)) + { + minpos= fieldpos; + found= *field; + length= temp_length; + } + } + } + DBUG_PRINT("loop", ("found: %p recpos: %d minpos: %d length: %d", + found, recpos, minpos, length)); + if (!found) + break; + + if (found->flags & BLOB_FLAG) + recinfo_pos->type= FIELD_BLOB; + else if (found->type() == MYSQL_TYPE_TIMESTAMP) + recinfo_pos->type= FIELD_NORMAL; + else if (found->type() == MYSQL_TYPE_VARCHAR) + recinfo_pos->type= FIELD_VARCHAR; + else if (!(options & HA_OPTION_PACK_RECORD) || + (found->zero_pack() && (found->flags & PRI_KEY_FLAG))) + recinfo_pos->type= FIELD_NORMAL; + else if (found->zero_pack()) + recinfo_pos->type= FIELD_SKIP_ZERO; + else + recinfo_pos->type= ((length <= 3 || + (found->flags & ZEROFILL_FLAG)) ? 
+ FIELD_NORMAL : + found->type() == MYSQL_TYPE_STRING || + found->type() == MYSQL_TYPE_VAR_STRING ? + FIELD_SKIP_ENDSPACE : + FIELD_SKIP_PRESPACE); + if (found->null_ptr) + { + recinfo_pos->null_bit= found->null_bit; + recinfo_pos->null_pos= (uint) (found->null_ptr - + (uchar*) table_arg->record[0]); + } + else + { + recinfo_pos->null_bit= 0; + recinfo_pos->null_pos= 0; + } + (recinfo_pos++)->length= (uint16) length; + recpos= minpos + length; + DBUG_PRINT("loop", ("length: %d type: %d", + recinfo_pos[-1].length,recinfo_pos[-1].type)); + } + *records_out= (uint) (recinfo_pos - recinfo); + DBUG_RETURN(0); +} + + +/* + Check for underlying table conformance + + SYNOPSIS + maria_check_definition() + t1_keyinfo in First table key definition + t1_recinfo in First table record definition + t1_keys in Number of keys in first table + t1_recs in Number of records in first table + t2_keyinfo in Second table key definition + t2_recinfo in Second table record definition + t2_keys in Number of keys in second table + t2_recs in Number of records in second table + strict in Strict check switch + + DESCRIPTION + This function compares two Maria definitions. By intention it was done + to compare merge table definition against underlying table definition. + It may also be used to compare dot-frm and MAI definitions of Maria + table as well to compare different Maria table definitions. + + For merge table it is not required that number of keys in merge table + must exactly match number of keys in underlying table. When calling this + function for underlying table conformance check, 'strict' flag must be + set to false, and converted merge definition must be passed as t1_*. + + Otherwise 'strict' flag must be set to 1 and it is not required to pass + converted dot-frm definition as t1_*. + + RETURN VALUE + 0 - Equal definitions. + 1 - Different definitions. 
+ + TODO + - compare FULLTEXT keys; + - compare SPATIAL keys; + - compare FIELD_SKIP_ZERO which is converted to FIELD_NORMAL correctly + (should be correctly detected in table2maria). + + FIXME: + maria_check_definition() is never used! CHECK TABLE does not detect the + corruption! Do maria_check_definition() like check_definition() is done + by MyISAM (related to MDEV-25803). +*/ + +int maria_check_definition(MARIA_KEYDEF *t1_keyinfo, + MARIA_COLUMNDEF *t1_recinfo, + uint t1_keys, uint t1_recs, + MARIA_KEYDEF *t2_keyinfo, + MARIA_COLUMNDEF *t2_recinfo, + uint t2_keys, uint t2_recs, bool strict) +{ + uint i, j; + DBUG_ENTER("maria_check_definition"); + if ((strict ? t1_keys != t2_keys : t1_keys > t2_keys)) + { + DBUG_PRINT("error", ("Number of keys differs: t1_keys=%u, t2_keys=%u", + t1_keys, t2_keys)); + DBUG_RETURN(1); + } + if (t1_recs != t2_recs) + { + DBUG_PRINT("error", ("Number of recs differs: t1_recs=%u, t2_recs=%u", + t1_recs, t2_recs)); + DBUG_RETURN(1); + } + for (i= 0; i < t1_keys; i++) + { + HA_KEYSEG *t1_keysegs= t1_keyinfo[i].seg; + HA_KEYSEG *t2_keysegs= t2_keyinfo[i].seg; + if (t1_keyinfo[i].flag & HA_FULLTEXT && t2_keyinfo[i].flag & HA_FULLTEXT) + continue; + else if (t1_keyinfo[i].flag & HA_FULLTEXT || + t2_keyinfo[i].flag & HA_FULLTEXT) + { + DBUG_PRINT("error", ("Key %d has different definition", i)); + DBUG_PRINT("error", ("t1_fulltext= %d, t2_fulltext=%d", + MY_TEST(t1_keyinfo[i].flag & HA_FULLTEXT), + MY_TEST(t2_keyinfo[i].flag & HA_FULLTEXT))); + DBUG_RETURN(1); + } + if (t1_keyinfo[i].flag & HA_SPATIAL && t2_keyinfo[i].flag & HA_SPATIAL) + continue; + else if (t1_keyinfo[i].flag & HA_SPATIAL || + t2_keyinfo[i].flag & HA_SPATIAL) + { + DBUG_PRINT("error", ("Key %d has different definition", i)); + DBUG_PRINT("error", ("t1_spatial= %d, t2_spatial=%d", + MY_TEST(t1_keyinfo[i].flag & HA_SPATIAL), + MY_TEST(t2_keyinfo[i].flag & HA_SPATIAL))); + DBUG_RETURN(1); + } + if (t1_keyinfo[i].keysegs != t2_keyinfo[i].keysegs || + t1_keyinfo[i].key_alg 
!= t2_keyinfo[i].key_alg) + { + DBUG_PRINT("error", ("Key %d has different definition", i)); + DBUG_PRINT("error", ("t1_keysegs=%d, t1_key_alg=%d", + t1_keyinfo[i].keysegs, t1_keyinfo[i].key_alg)); + DBUG_PRINT("error", ("t2_keysegs=%d, t2_key_alg=%d", + t2_keyinfo[i].keysegs, t2_keyinfo[i].key_alg)); + DBUG_RETURN(1); + } + for (j= t1_keyinfo[i].keysegs; j--;) + { + uint8 t1_keysegs_j__type= t1_keysegs[j].type; + /* + Table migration from 4.1 to 5.1. In 5.1 a *TEXT key part is + always HA_KEYTYPE_VARTEXT2. In 4.1 we had only the equivalent of + HA_KEYTYPE_VARTEXT1. Since we treat both the same on MyISAM + level, we can ignore a mismatch between these types. + */ + if ((t1_keysegs[j].flag & HA_BLOB_PART) && + (t2_keysegs[j].flag & HA_BLOB_PART)) + { + if ((t1_keysegs_j__type == HA_KEYTYPE_VARTEXT2) && + (t2_keysegs[j].type == HA_KEYTYPE_VARTEXT1)) + t1_keysegs_j__type= HA_KEYTYPE_VARTEXT1; /* purecov: tested */ + else if ((t1_keysegs_j__type == HA_KEYTYPE_VARBINARY2) && + (t2_keysegs[j].type == HA_KEYTYPE_VARBINARY1)) + t1_keysegs_j__type= HA_KEYTYPE_VARBINARY1; /* purecov: inspected */ + } + + if (t1_keysegs_j__type != t2_keysegs[j].type || + t1_keysegs[j].language != t2_keysegs[j].language || + t1_keysegs[j].null_bit != t2_keysegs[j].null_bit || + t1_keysegs[j].length != t2_keysegs[j].length) + { + DBUG_PRINT("error", ("Key segment %d (key %d) has different " + "definition", j, i)); + DBUG_PRINT("error", ("t1_type=%d, t1_language=%d, t1_null_bit=%d, " + "t1_length=%d", + t1_keysegs[j].type, t1_keysegs[j].language, + t1_keysegs[j].null_bit, t1_keysegs[j].length)); + DBUG_PRINT("error", ("t2_type=%d, t2_language=%d, t2_null_bit=%d, " + "t2_length=%d", + t2_keysegs[j].type, t2_keysegs[j].language, + t2_keysegs[j].null_bit, t2_keysegs[j].length)); + + DBUG_RETURN(1); + } + } + } + + for (i= 0; i < t1_recs; i++) + { + MARIA_COLUMNDEF *t1_rec= &t1_recinfo[i]; + MARIA_COLUMNDEF *t2_rec= &t2_recinfo[i]; + /* + FIELD_SKIP_ZERO can be changed to FIELD_NORMAL in 
maria_create, + see NOTE1 in ma_create.c + */ + if ((t1_rec->type != t2_rec->type && + !(t1_rec->type == (int) FIELD_SKIP_ZERO && + t1_rec->length == 1 && + t2_rec->type == (int) FIELD_NORMAL)) || + t1_rec->length != t2_rec->length || + t1_rec->null_bit != t2_rec->null_bit) + { + DBUG_PRINT("error", ("Field %d has different definition", i)); + DBUG_PRINT("error", ("t1_type=%d, t1_length=%d, t1_null_bit=%d", + t1_rec->type, t1_rec->length, t1_rec->null_bit)); + DBUG_PRINT("error", ("t2_type=%d, t2_length=%d, t2_null_bit=%d", + t2_rec->type, t2_rec->length, t2_rec->null_bit)); + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +extern "C" { + +int _ma_killed_ptr(HA_CHECK *param) +{ + if (!param->thd || likely(thd_killed((THD*)param->thd)) == 0) + return 0; + my_errno= HA_ERR_ABORTED_BY_USER; + return 1; +} + + +/* + Report progress to mysqld + + This is a bit more complex than what a normal progress report + function normally is. + + The reason is that this is called by enable_index/repair which + is one stage in ALTER TABLE and we can't use the external + stage/max_stage for this. + + thd_progress_init/thd_progress_next_stage is to be called by + high level commands like CHECK TABLE or REPAIR TABLE, not + by sub commands like enable_index(). + + In ma_check.c it's easier to work with stages than with a total + progress, so we use internal stage/max_stage here to keep the + code simple. +*/ + +void _ma_report_progress(HA_CHECK *param, ulonglong progress, + ulonglong max_progress) +{ + if (param->thd) + thd_progress_report((THD*)param->thd, + progress + max_progress * param->stage, + max_progress * param->max_stage); +} + + +void _ma_check_print_error(HA_CHECK *param, const char *fmt, ...) 
+{ + va_list args; + DBUG_ENTER("_ma_check_print_error"); + param->error_printed++; + param->out_flag |= O_DATA_LOST; + if (param->testflag & T_SUPPRESS_ERR_HANDLING) + DBUG_VOID_RETURN; + va_start(args, fmt); + _ma_check_print_msg(param, &MA_CHECK_ERROR, fmt, args); + va_end(args); + DBUG_VOID_RETURN; +} + + +void _ma_check_print_info(HA_CHECK *param, const char *fmt, ...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_info"); + va_start(args, fmt); + _ma_check_print_msg(param, &MA_CHECK_INFO, fmt, args); + va_end(args); + DBUG_VOID_RETURN; +} + + +void _ma_check_print_warning(HA_CHECK *param, const char *fmt, ...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_warning"); + param->warning_printed++; + param->out_flag |= O_DATA_LOST; + va_start(args, fmt); + _ma_check_print_msg(param, &MA_CHECK_WARNING, fmt, args); + va_end(args); + DBUG_VOID_RETURN; +} + +/* + Create a transaction object + + SYNOPSIS + info Maria handler + + RETURN + 0 ok + # Error number (HA_ERR_OUT_OF_MEM) +*/ + +static int maria_create_trn_for_mysql(MARIA_HA *info) +{ + THD *thd= ((TABLE*) info->external_ref)->in_use; + TRN *trn= THD_TRN; + DBUG_ENTER("maria_create_trn_for_mysql"); + + if (!trn) /* no transaction yet - open it now */ + { + trn= trnman_new_trn(& thd->transaction->wt); + if (unlikely(!trn)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + thd_set_ha_data(thd, maria_hton, trn); + if (thd->variables.option_bits & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) + trans_register_ha(thd, TRUE, maria_hton, trn->trid); + } + _ma_set_trn_for_table(info, trn); + if (!trnman_increment_locked_tables(trn)) + { + trans_register_ha(thd, FALSE, maria_hton, trn->trid); + trnman_new_statement(trn); + } +#ifdef EXTRA_DEBUG + if (info->lock_type == F_WRLCK && + ! 
(trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED)) + { + trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED | + TRN_STATE_TABLES_CAN_CHANGE); + (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), + thd->query_length()); + } + else + { + DBUG_PRINT("info", ("lock_type: %d trnman_flags: %u", + info->lock_type, trnman_get_flags(trn))); + } + +#endif + DBUG_RETURN(0); +} + +my_bool ma_killed_in_mariadb(MARIA_HA *info) +{ + return (((TABLE*) (info->external_ref))->in_use->killed != 0); +} + +void maria_debug_crash_here(const char *keyword) +{ +#ifndef DBUG_OFF + debug_crash_here(keyword); +#endif /* DBUG_OFF */ +} + +} /* extern "C" */ + +/** + Transactional table doing bulk insert with one single UNDO + (UNDO_BULK_INSERT) and with repair. +*/ +#define BULK_INSERT_SINGLE_UNDO_AND_REPAIR 1 +/** + Transactional table doing bulk insert with one single UNDO + (UNDO_BULK_INSERT) and without repair. +*/ +#define BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR 2 +/** + None of BULK_INSERT_SINGLE_UNDO_AND_REPAIR and + BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR. 
+*/ +#define BULK_INSERT_NONE 0 + +ha_maria::ha_maria(handlerton *hton, TABLE_SHARE *table_arg): +handler(hton, table_arg), file(0), +int_table_flags(HA_NULL_IN_KEY | HA_CAN_FULLTEXT | HA_CAN_SQL_HANDLER | + HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE | + HA_DUPLICATE_POS | HA_CAN_INDEX_BLOBS | HA_AUTO_PART_KEY | + HA_FILE_BASED | HA_CAN_GEOMETRY | TRANSACTION_STATE | + HA_CAN_BIT_FIELD | HA_CAN_RTREEKEYS | HA_CAN_REPAIR | + HA_CAN_VIRTUAL_COLUMNS | HA_CAN_EXPORT | + HA_HAS_RECORDS | HA_STATS_RECORDS_IS_EXACT | + HA_CAN_TABLES_WITHOUT_ROLLBACK), +can_enable_indexes(0), bulk_insert_single_undo(BULK_INSERT_NONE) +{} + + +handler *ha_maria::clone(const char *name __attribute__((unused)), + MEM_ROOT *mem_root) +{ + ha_maria *new_handler= + static_cast <ha_maria *>(handler::clone(file->s->open_file_name.str, + mem_root)); + if (new_handler) + { + new_handler->file->state= file->state; + /* maria_create_trn_for_mysql() is never called for clone() tables */ + new_handler->file->trn= file->trn; + DBUG_ASSERT(new_handler->file->trn_prev == 0 && + new_handler->file->trn_next == 0); + } + return new_handler; +} + + +static const char *ha_maria_exts[]= +{ + MARIA_NAME_IEXT, + MARIA_NAME_DEXT, + NullS +}; + + +const char *ha_maria::index_type(uint key_number) +{ + return ((table->key_info[key_number].flags & HA_FULLTEXT) ? + "FULLTEXT" : + (table->key_info[key_number].flags & HA_SPATIAL) ? + "SPATIAL" : + (table->key_info[key_number].algorithm == HA_KEY_ALG_RTREE) ? + "RTREE" : "BTREE"); +} + + +ulong ha_maria::index_flags(uint inx, uint part, bool all_parts) const +{ + ulong flags; + if (table_share->key_info[inx].algorithm == HA_KEY_ALG_FULLTEXT) + flags= 0; + else + if ((table_share->key_info[inx].flags & HA_SPATIAL || + table_share->key_info[inx].algorithm == HA_KEY_ALG_RTREE)) + { + /* All GIS scans are non-ROR scans. 
We also disable IndexConditionPushdown */ + flags= HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE | + HA_READ_ORDER | HA_KEYREAD_ONLY | HA_KEY_SCAN_NOT_ROR; + } + else + { + flags= HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE | + HA_READ_ORDER | HA_KEYREAD_ONLY | HA_DO_INDEX_COND_PUSHDOWN; + } + return flags; +} + + +double ha_maria::scan_time() +{ + if (file->s->data_file_type == BLOCK_RECORD) + return (ulonglong2double(stats.data_file_length - file->s->block_size) / + file->s->block_size) + 2; + return handler::scan_time(); +} + +/* + We need to be able to store at least 2 keys on an index page as the + splitting algorithms depends on this. (With only one key on a page + we also can't use any compression, which may make the index file much + larger) + We use MARIA_MAX_KEY_LENGTH to limit the key size as we don't want to use + too much stack when searching in the b_tree. + + We also need to reserve place for a record pointer (8) and 3 bytes + per key segment to store the length of the segment + possible null bytes. + These extra bytes are required here so that maria_create() will surely + accept any keys created which the returned key data storage length. +*/ + +uint ha_maria::max_supported_key_length() const +{ + return maria_max_key_length(); +} + +/* Name is here without an extension */ + +int ha_maria::open(const char *name, int mode, uint test_if_locked) +{ + uint i; + +#ifdef NOT_USED + /* + If the user wants to have memory mapped data files, add an + open_flag. Do not memory map temporary tables because they are + expected to be inserted and thus extended a lot. Memory mapping is + efficient for files that keep their size, but very inefficient for + growing files. Using an open_flag instead of calling ma_extra(... + HA_EXTRA_MMAP ...) after maxs_open() has the advantage that the + mapping is not repeated for every open, but just done on the initial + open, when the MyISAM share is created. 
Every time the server + requires to open a new instance of a table it calls this method. We + will always supply HA_OPEN_MMAP for a permanent table. However, the + Maria storage engine will ignore this flag if this is a secondary + open of a table that is in use by other threads already (if the + Maria share exists already). + */ + if (!(test_if_locked & HA_OPEN_TMP_TABLE) && opt_maria_use_mmap) + test_if_locked|= HA_OPEN_MMAP; +#endif + + if (maria_recover_options & HA_RECOVER_ANY) + { + /* user asked to trigger a repair if table was not properly closed */ + test_if_locked|= HA_OPEN_ABORT_IF_CRASHED; + } + + if (aria_readonly) + test_if_locked|= HA_OPEN_IGNORE_MOVED_STATE; + + if (!(file= maria_open(name, mode, test_if_locked | HA_OPEN_FROM_SQL_LAYER, + s3_open_args()))) + { + if (my_errno == HA_ERR_OLD_FILE) + { + push_warning(current_thd, Sql_condition::WARN_LEVEL_NOTE, + ER_CRASHED_ON_USAGE, + zerofill_error_msg); + } + return (my_errno ? my_errno : -1); + } + if (aria_readonly) + file->s->options|= HA_OPTION_READ_ONLY_DATA; + + file->s->chst_invalidator= query_cache_invalidate_by_MyISAM_filename_ref; + /* Set external_ref, mainly for temporary tables */ + file->external_ref= (void*) table; // For ma_killed() + + if (test_if_locked & (HA_OPEN_IGNORE_IF_LOCKED | HA_OPEN_TMP_TABLE)) + maria_extra(file, HA_EXTRA_NO_WAIT_LOCK, 0); + + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + if (!(test_if_locked & HA_OPEN_WAIT_IF_LOCKED)) + maria_extra(file, HA_EXTRA_WAIT_LOCK, 0); + if ((data_file_type= file->s->data_file_type) != STATIC_RECORD) + int_table_flags |= HA_REC_NOT_IN_SEQ; + if (!file->s->base.born_transactional) + { + /* + INSERT DELAYED cannot work with transactional tables (because it cannot + stand up to "when client gets ok the data is safe on disk": the record + may not even be inserted). 
In the future, we could enable it back (as a + client doing INSERT DELAYED knows the specificities; but we then should + make sure to regularly commit in the delayed_insert thread). + */ + int_table_flags|= HA_CAN_INSERT_DELAYED | HA_NO_TRANSACTIONS; + } + else + int_table_flags|= HA_CRASH_SAFE; + + if (file->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + int_table_flags |= HA_HAS_NEW_CHECKSUM; + + /* + We can only do online backup on transactional tables with checksum. + Checksums are needed to avoid half writes. + */ + if (file->s->options & HA_OPTION_PAGE_CHECKSUM && + file->s->base.born_transactional) + int_table_flags |= HA_CAN_ONLINE_BACKUPS; + + /* + For static size rows, tell MariaDB that we will access all bytes + in the record when writing it. This signals MariaDB to initialize + the full row to ensure we don't get any errors from valgrind and + that all bytes in the row is properly reset. + */ + if (file->s->data_file_type == STATIC_RECORD && + (file->s->has_varchar_fields || file->s->has_null_fields)) + int_table_flags|= HA_RECORD_MUST_BE_CLEAN_ON_WRITE; + + for (i= 0; i < table->s->keys; i++) + { + plugin_ref parser= table->key_info[i].parser; + if (table->key_info[i].flags & HA_USES_PARSER) + file->s->keyinfo[i].parser= + (struct st_mysql_ftparser *)plugin_decl(parser)->info; + table->key_info[i].block_size= file->s->keyinfo[i].block_length; + } + my_errno= 0; + + /* Count statistics of usage for newly open normal files */ + if (file->s->reopen == 1 && ! 
(test_if_locked & HA_OPEN_TMP_TABLE)) + { + if (file->s->delay_key_write) + feature_files_opened_with_delayed_keys++; + } + + return my_errno; +} + + +int ha_maria::close(void) +{ + MARIA_HA *tmp= file; + if (!tmp) + return 0; + DBUG_ASSERT(file->trn == 0 || file->trn == &dummy_transaction_object); + DBUG_ASSERT(file->trn_next == 0 && file->trn_prev == 0); + file= 0; + return maria_close(tmp); +} + + +int ha_maria::write_row(const uchar * buf) +{ + /* + If we have an auto_increment column and we are writing a changed row + or a new row, then update the auto_increment value in the record. + */ + if (table->next_number_field && buf == table->record[0]) + { + int error; + if ((error= update_auto_increment())) + return error; + } + return maria_write(file, buf); +} + + +int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt) +{ + int error, fatal_error; + HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof *param); + MARIA_SHARE *share= file->s; + const char *old_proc_info; + TRN *old_trn= file->trn; + + if (!file || !param) return HA_ADMIN_INTERNAL_ERROR; + + unmap_file(file); + register_handler(file); + maria_chk_init(param); + param->thd= thd; + param->op_name= "check"; + param->db_name= table->s->db.str; + param->table_name= table->alias.c_ptr(); + param->testflag= check_opt->flags | T_CHECK | T_SILENT; + param->stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method); + + if (!(table->db_stat & HA_READ_ONLY)) + param->testflag |= T_STATISTICS; + param->using_global_keycache= 1; + + if (!maria_is_crashed(file) && + (((param->testflag & T_CHECK_ONLY_CHANGED) && + !(share->state.changed & (STATE_CHANGED | STATE_CRASHED_FLAGS | + STATE_IN_REPAIR)) && + share->state.open_count == 0) || + ((param->testflag & T_FAST) && (share->state.open_count == + (uint) (share->global_changed ? 
1 : + 0))))) + return HA_ADMIN_ALREADY_DONE; + + maria_chk_init_for_check(param, file); + param->max_allowed_lsn= translog_get_horizon(); + + if ((file->s->state.changed & (STATE_CRASHED_FLAGS | STATE_MOVED)) == + STATE_MOVED) + { + _ma_check_print_error(param, "%s", zerofill_error_msg); + return HA_ADMIN_CORRUPT; + } + + old_proc_info= thd_proc_info(thd, "Checking status"); + thd_progress_init(thd, 3); + error= maria_chk_status(param, file); // Not fatal + /* maria_chk_size() will flush the page cache for this file */ + if (maria_chk_size(param, file)) + error= 1; + if (!error) + error|= maria_chk_del(param, file, param->testflag); + thd_proc_info(thd, "Checking keys"); + thd_progress_next_stage(thd); + if (!error) + error= maria_chk_key(param, file); + thd_proc_info(thd, "Checking data"); + thd_progress_next_stage(thd); + if (!error) + { + if ((!(param->testflag & T_QUICK) && + ((share->options & + (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) || + (param->testflag & (T_EXTEND | T_MEDIUM)))) || maria_is_crashed(file)) + { + ulonglong old_testflag= param->testflag; + param->testflag |= T_MEDIUM; + + /* BLOCK_RECORD does not need a cache as it is using the page cache */ + if (file->s->data_file_type != BLOCK_RECORD) + error= init_io_cache(¶m->read_cache, file->dfile.file, + my_default_record_cache_size, READ_CACHE, + share->pack.header_length, 1, MYF(MY_WME)); + if (!error) + error= maria_chk_data_link(param, file, + MY_TEST(param->testflag & T_EXTEND)); + + if (file->s->data_file_type != BLOCK_RECORD) + end_io_cache(¶m->read_cache); + param->testflag= old_testflag; + } + } + fatal_error= error; + if (param->error_printed && + param->error_printed == (param->skip_lsn_error_count + + param->not_visible_rows_found) && + !(share->state.changed & (STATE_CRASHED_FLAGS | STATE_IN_REPAIR))) + { + _ma_check_print_error(param, "%s", zerofill_error_msg); + /* This ensures that a future REPAIR TABLE will only do a zerofill */ + file->update|= STATE_MOVED; + 
share->state.changed|= STATE_MOVED; + fatal_error= 0; + } + if (!fatal_error) + { + if ((share->state.changed & (STATE_CHANGED | STATE_MOVED | + STATE_CRASHED_FLAGS | + STATE_IN_REPAIR | STATE_NOT_ANALYZED)) || + (param->testflag & T_STATISTICS) || maria_is_crashed(file)) + { + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + mysql_mutex_lock(&share->intern_lock); + DBUG_PRINT("info", ("Resetting crashed state")); + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED_FLAGS | + STATE_IN_REPAIR); + if (!(table->db_stat & HA_READ_ONLY)) + { + int tmp; + if ((tmp= maria_update_state_info(param, file, + UPDATE_TIME | UPDATE_OPEN_COUNT | + UPDATE_STAT))) + error= tmp; + } + mysql_mutex_unlock(&share->intern_lock); + info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE | + HA_STATUS_CONST); + + /* + Write a 'table is ok' message to error log if table is ok and + we have written to error log that table was getting checked + */ + if (!error && !(table->db_stat & HA_READ_ONLY) && + !maria_is_crashed(file) && thd->error_printed_to_log && + (param->warning_printed || param->error_printed || + param->note_printed)) + _ma_check_print_info(param, "Table is fixed"); + } + } + else if (!maria_is_crashed(file) && !thd->killed) + { + maria_mark_crashed(file); + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + } + + /* Reset trn, that may have been set by repair */ + if (old_trn && old_trn != file->trn) + { + DBUG_ASSERT(old_trn->used_instances == 0); + _ma_set_trn_for_table(file, old_trn); + } + thd_proc_info(thd, old_proc_info); + thd_progress_end(thd); + return error ? HA_ADMIN_CORRUPT : HA_ADMIN_OK; +} + + +/* + Analyze the key distribution in the table + As the table may be only locked for read, we have to take into account that + two threads may do an analyze at the same time! 
+*/ + +int ha_maria::analyze(THD *thd, HA_CHECK_OPT * check_opt) +{ + int error= 0; + HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof *param); + MARIA_SHARE *share= file->s; + const char *old_proc_info; + + if (!param) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(param); + param->thd= thd; + param->op_name= "analyze"; + param->db_name= table->s->db.str; + param->table_name= table->alias.c_ptr(); + param->testflag= (T_FAST | T_CHECK | T_SILENT | T_STATISTICS | + T_DONT_CHECK_CHECKSUM); + param->using_global_keycache= 1; + param->stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method); + + if (!(share->state.changed & STATE_NOT_ANALYZED)) + return HA_ADMIN_ALREADY_DONE; + + old_proc_info= thd_proc_info(thd, "Scanning"); + thd_progress_init(thd, 1); + error= maria_chk_key(param, file); + if (!error) + { + mysql_mutex_lock(&share->intern_lock); + error= maria_update_state_info(param, file, UPDATE_STAT); + mysql_mutex_unlock(&share->intern_lock); + } + else if (!maria_is_crashed(file) && !thd->killed) + maria_mark_crashed(file); + thd_proc_info(thd, old_proc_info); + thd_progress_end(thd); + return error ? 
HA_ADMIN_CORRUPT : HA_ADMIN_OK; +} + +int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt) +{ + int error; + HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof *param); + ha_rows start_records; + const char *old_proc_info; + + if (!file || !param) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(param); + param->thd= thd; + param->op_name= "repair"; + file->error_count=0; + + /* + The following can only be true if the table was marked as STATE_MOVED + during a CHECK TABLE and the table has not been used since then + */ + if ((file->s->state.changed & STATE_MOVED) && + !(file->s->state.changed & STATE_CRASHED_FLAGS)) + { + param->db_name= table->s->db.str; + param->table_name= table->alias.c_ptr(); + param->testflag= check_opt->flags; + _ma_check_print_info(param, "Running zerofill on moved table"); + return zerofill(thd, check_opt); + } + + param->testflag= ((check_opt->flags & ~(T_EXTEND)) | + T_SILENT | T_FORCE_CREATE | T_CALC_CHECKSUM | + (check_opt->flags & T_EXTEND ? T_REP : T_REP_BY_SORT)); + param->orig_sort_buffer_length= THDVAR(thd, sort_buffer_size); + param->backup_time= check_opt->start_time; + start_records= file->state->records; + old_proc_info= thd_proc_info(thd, "Checking table"); + thd_progress_init(thd, 1); + while ((error= repair(thd, param, 0)) && param->retry_repair) + { + param->retry_repair= 0; + file->state->records= start_records; + if (test_all_bits(param->testflag, + (uint) (T_RETRY_WITHOUT_QUICK | T_QUICK))) + { + param->testflag&= ~(T_RETRY_WITHOUT_QUICK | T_QUICK); + /* Ensure we don't loose any rows when retrying without quick */ + param->testflag|= T_SAFE_REPAIR; + if (thd->vio_ok()) + _ma_check_print_info(param, "Retrying repair without quick"); + else + sql_print_information("Retrying repair of: '%s' without quick", + table->s->path.str); + continue; + } + param->testflag &= ~T_QUICK; + if (param->testflag & T_REP_BY_SORT) + { + param->testflag= (param->testflag & ~T_REP_BY_SORT) | T_REP; + if (thd->vio_ok()) + 
_ma_check_print_info(param, "Retrying repair with keycache"); + sql_print_information("Retrying repair of: '%s' with keycache", + table->s->path.str); + continue; + } + break; + } + /* + Commit is needed in the case of tables are locked to ensure that repair + is registered in the recovery log + */ + if (implicit_commit(thd, TRUE)) + error= HA_ADMIN_COMMIT_ERROR; + + if (!error && start_records != file->state->records && + !(check_opt->flags & T_VERY_SILENT)) + { + char llbuff[22], llbuff2[22]; + sql_print_information("Found %s of %s rows when repairing '%s'", + llstr(file->state->records, llbuff), + llstr(start_records, llbuff2), + table->s->path.str); + } + thd_proc_info(thd, old_proc_info); + thd_progress_end(thd); + return error; +} + +int ha_maria::zerofill(THD * thd, HA_CHECK_OPT *check_opt) +{ + int error; + HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof *param); + TRN *old_trn; + MARIA_SHARE *share= file->s; + + if (!file || !param) + return HA_ADMIN_INTERNAL_ERROR; + + unmap_file(file); + old_trn= file->trn; + maria_chk_init(param); + param->thd= thd; + param->op_name= "zerofill"; + param->testflag= check_opt->flags | T_SILENT | T_ZEROFILL; + param->orig_sort_buffer_length= THDVAR(thd, sort_buffer_size); + param->db_name= table->s->db.str; + param->table_name= table->alias.c_ptr(); + + error=maria_zerofill(param, file, share->open_file_name.str); + + /* Reset trn, that may have been set by repair */ + if (old_trn && old_trn != file->trn) + _ma_set_trn_for_table(file, old_trn); + + if (!error) + { + TrID create_trid= trnman_get_min_safe_trid(); + mysql_mutex_lock(&share->intern_lock); + share->state.changed|= STATE_NOT_MOVABLE; + maria_update_state_info(param, file, UPDATE_TIME | UPDATE_OPEN_COUNT); + _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE, create_trid, + TRUE, TRUE); + mysql_mutex_unlock(&share->intern_lock); + } + return error; +} + +int ha_maria::optimize(THD * thd, HA_CHECK_OPT *check_opt) +{ + int error; + HA_CHECK *param= (HA_CHECK*) 
thd->alloc(sizeof *param); + + if (!file || !param) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(param); + param->thd= thd; + param->op_name= "optimize"; + param->testflag= (check_opt->flags | T_SILENT | T_FORCE_CREATE | + T_REP_BY_SORT | T_STATISTICS | T_SORT_INDEX); + param->orig_sort_buffer_length= THDVAR(thd, sort_buffer_size); + thd_progress_init(thd, 1); + if ((error= repair(thd, param, 1)) && param->retry_repair) + { + sql_print_warning("Warning: Optimize table got errno %d on %s.%s, retrying", + my_errno, param->db_name, param->table_name); + param->testflag &= ~T_REP_BY_SORT; + error= repair(thd, param, 0); + } + thd_progress_end(thd); + return error; +} + + +int ha_maria::repair(THD *thd, HA_CHECK *param, bool do_optimize) +{ + int error= 0; + ulonglong local_testflag= param->testflag; + bool optimize_done= !do_optimize, statistics_done= 0, full_repair_done= 0; + const char *old_proc_info= thd->proc_info; + char fixed_name[FN_REFLEN]; + MARIA_SHARE *share= file->s; + ha_rows rows= file->state->records; + TRN *old_trn= file->trn; + my_bool locking= 0; + DBUG_ENTER("ha_maria::repair"); + + /* + Normally this method is entered with a properly opened table. If the + repair fails, it can be repeated with more elaborate options. Under + special circumstances it can happen that a repair fails so that it + closed the data file and cannot re-open it. In this case file->dfile + is set to -1. We must not try another repair without an open data + file. (Bug #25289) + */ + if (file->dfile.file == -1) + { + sql_print_information("Retrying repair of: '%s' failed. " + "Please try REPAIR EXTENDED or aria_chk", + table->s->path.str); + DBUG_RETURN(HA_ADMIN_FAILED); + } + + /* + If transactions was not enabled for a transactional table then + file->s->status is not up to date. 
This is needed for repair_by_sort + to work + */ + if (share->base.born_transactional && !share->now_transactional) + _ma_copy_nontrans_state_information(file); + + param->db_name= table->s->db.str; + param->table_name= table->alias.c_ptr(); + param->tmpfile_createflag= O_RDWR | O_TRUNC; + param->using_global_keycache= 1; + param->thd= thd; + param->tmpdir= &mysql_tmpdir_list; + param->out_flag= 0; + share->state.dupp_key= MI_MAX_KEY; + strmov(fixed_name, share->open_file_name.str); + unmap_file(file); + + /* + Don't lock tables if we have used LOCK TABLE or if we come from + enable_index() + */ + if (!thd->locked_tables_mode && ! (param->testflag & T_NO_LOCKS)) + { + locking= 1; + if (maria_lock_database(file, table->s->tmp_table ? F_EXTRA_LCK : F_WRLCK)) + { + _ma_check_print_error(param, ER_THD(thd, ER_CANT_LOCK), my_errno); + DBUG_RETURN(HA_ADMIN_FAILED); + } + } + + if (!do_optimize || + (((share->data_file_type == BLOCK_RECORD) ? + (share->state.changed & STATE_NOT_OPTIMIZED_ROWS) : + (file->state->del || + share->state.split != file->state->records)) && + (!(param->testflag & T_QUICK) || + (share->state.changed & (STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_OPTIMIZED_ROWS))))) + { + ulonglong key_map= ((local_testflag & T_CREATE_MISSING_KEYS) ? 
+ maria_get_mask_all_keys_active(share->base.keys) : + share->state.key_map); + ulonglong save_testflag= param->testflag; + if (maria_test_if_sort_rep(file, file->state->records, key_map, 0) && + (local_testflag & T_REP_BY_SORT)) + { + local_testflag |= T_STATISTICS; + param->testflag |= T_STATISTICS; // We get this for free + statistics_done= 1; + /* TODO: Remove BLOCK_RECORD test when parallel works with blocks */ + if (THDVAR(thd,repair_threads) > 1 && + share->data_file_type != BLOCK_RECORD) + { + char buf[40]; + /* TODO: respect maria_repair_threads variable */ + my_snprintf(buf, 40, "Repair with %d threads", my_count_bits(key_map)); + thd_proc_info(thd, buf); + param->testflag|= T_REP_PARALLEL; + error= maria_repair_parallel(param, file, fixed_name, + MY_TEST(param->testflag & T_QUICK)); + /* to reset proc_info, as it was pointing to local buffer */ + thd_proc_info(thd, "Repair done"); + } + else + { + thd_proc_info(thd, "Repair by sorting"); + param->testflag|= T_REP_BY_SORT; + error= maria_repair_by_sort(param, file, fixed_name, + MY_TEST(param->testflag & T_QUICK)); + } + if (error && file->create_unique_index_by_sort && + share->state.dupp_key != MAX_KEY) + { + my_errno= HA_ERR_FOUND_DUPP_KEY; + print_keydup_error(table, &table->key_info[share->state.dupp_key], + MYF(0)); + } + } + else + { + thd_proc_info(thd, "Repair with keycache"); + param->testflag &= ~(T_REP_BY_SORT | T_REP_PARALLEL); + error= maria_repair(param, file, fixed_name, + MY_TEST(param->testflag & T_QUICK)); + } + param->testflag= save_testflag | (param->testflag & T_RETRY_WITHOUT_QUICK); + optimize_done= 1; + /* + set full_repair_done if we re-wrote all rows and all keys + (and thus removed all transid's from the table + */ + full_repair_done= !MY_TEST(param->testflag & T_QUICK); + } + if (!error) + { + if ((local_testflag & T_SORT_INDEX) && + (share->state.changed & STATE_NOT_SORTED_PAGES)) + { + optimize_done= 1; + thd_proc_info(thd, "Sorting index"); + error= maria_sort_index(param, 
file, fixed_name); + } + if (!error && !statistics_done && (local_testflag & T_STATISTICS)) + { + if (share->state.changed & STATE_NOT_ANALYZED) + { + optimize_done= 1; + thd_proc_info(thd, "Analyzing"); + error= maria_chk_key(param, file); + } + else + local_testflag &= ~T_STATISTICS; // Don't update statistics + } + } + thd_proc_info(thd, "Saving state"); + if (full_repair_done && !error && + !(param->testflag & T_NO_CREATE_RENAME_LSN)) + { + /* Set trid (needed if the table was moved from another system) */ + share->state.create_trid= trnman_get_min_safe_trid(); + } + mysql_mutex_lock(&share->intern_lock); + if (!error) + { + if ((share->state.changed & STATE_CHANGED) || maria_is_crashed(file)) + { + DBUG_PRINT("info", ("Resetting crashed state")); + share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED_FLAGS | + STATE_IN_REPAIR | STATE_MOVED); + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + } + /* + repair updates share->state.state. Ensure that file->state is up to date + */ + if (file->state != &share->state.state) + *file->state= share->state.state; + + if (share->base.auto_key) + _ma_update_auto_increment_key(param, file, 1); + if (optimize_done) + error= maria_update_state_info(param, file, + UPDATE_TIME | UPDATE_OPEN_COUNT | + (local_testflag & + T_STATISTICS ? 
UPDATE_STAT : 0)); + /* File is repaired; Mark the file as moved to this system */ + (void) _ma_set_uuid(share, 0); + + info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE | + HA_STATUS_CONST); + if (rows != file->state->records && !(param->testflag & T_VERY_SILENT)) + { + char llbuff[22], llbuff2[22]; + _ma_check_print_warning(param, "Number of rows changed from %s to %s", + llstr(rows, llbuff), + llstr(file->state->records, llbuff2)); + } + } + else + { + maria_mark_crashed_on_repair(file); + file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED; + maria_update_state_info(param, file, 0); + } + mysql_mutex_unlock(&share->intern_lock); + thd_proc_info(thd, old_proc_info); + thd_progress_end(thd); // Mark done + if (locking) + maria_lock_database(file, F_UNLCK); + + /* Reset trn, that may have been set by repair */ + if (old_trn && old_trn != file->trn) + _ma_set_trn_for_table(file, old_trn); + error= error ? HA_ADMIN_FAILED : + (optimize_done ? + (write_log_record_for_repair(param, file) ? HA_ADMIN_FAILED : + HA_ADMIN_OK) : HA_ADMIN_ALREADY_DONE); + DBUG_RETURN(error); +} + + +/* + Assign table indexes to a specific key cache. 
+*/ + +int ha_maria::assign_to_keycache(THD * thd, HA_CHECK_OPT *check_opt) +{ +#if 0 && NOT_IMPLEMENTED + PAGECACHE *new_pagecache= check_opt->pagecache; + const char *errmsg= 0; + int error= HA_ADMIN_OK; + ulonglong map; + TABLE_LIST *table_list= table->pos_in_table_list; + DBUG_ENTER("ha_maria::assign_to_keycache"); + + table->keys_in_use_for_query.clear_all(); + + if (table_list->process_index_hints(table)) + DBUG_RETURN(HA_ADMIN_FAILED); + map= ~(ulonglong) 0; + if (!table->keys_in_use_for_query.is_clear_all()) + /* use all keys if there's no list specified by the user through hints */ + map= table->keys_in_use_for_query.to_ulonglong(); + + if ((error= maria_assign_to_pagecache(file, map, new_pagecache))) + { + char buf[STRING_BUFFER_USUAL_SIZE]; + my_snprintf(buf, sizeof(buf), + "Failed to flush to index file (errno: %d)", error); + errmsg= buf; + error= HA_ADMIN_CORRUPT; + } + + if (error != HA_ADMIN_OK) + { + /* Send error to user */ + HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof *param); + if (!param) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(param); + param->thd= thd; + param->op_name= "assign_to_keycache"; + param->db_name= table->s->db.str; + param->table_name= table->s->table_name.str; + param->testflag= 0; + _ma_check_print_error(param, errmsg); + } + DBUG_RETURN(error); +#else + return HA_ADMIN_NOT_IMPLEMENTED; +#endif +} + + +/* + Preload pages of the index file for a table into the key cache. 
+*/ + +int ha_maria::preload_keys(THD * thd, HA_CHECK_OPT *check_opt) +{ + ulonglong map; + TABLE_LIST *table_list= table->pos_in_table_list; + + DBUG_ENTER("ha_maria::preload_keys"); + + table->keys_in_use_for_query.clear_all(); + + if (table_list->process_index_hints(table)) + DBUG_RETURN(HA_ADMIN_FAILED); + + map= ~(ulonglong) 0; + /* Check validity of the index references */ + if (!table->keys_in_use_for_query.is_clear_all()) + /* use all keys if there's no list specified by the user through hints */ + map= table->keys_in_use_for_query.to_ulonglong(); + + maria_extra(file, HA_EXTRA_PRELOAD_BUFFER_SIZE, + (void*) &thd->variables.preload_buff_size); + + int error; + + if ((error= maria_preload(file, map, table_list->ignore_leaves))) + { + char buf[MYSQL_ERRMSG_SIZE+20]; + const char *errmsg; + + switch (error) { + case HA_ERR_NON_UNIQUE_BLOCK_SIZE: + errmsg= "Indexes use different block sizes"; + break; + case HA_ERR_OUT_OF_MEM: + errmsg= "Failed to allocate buffer"; + break; + default: + my_snprintf(buf, sizeof(buf), + "Failed to read from index file (errno: %d)", my_errno); + errmsg= buf; + } + + HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof *param); + if (!param) + return HA_ADMIN_INTERNAL_ERROR; + + maria_chk_init(param); + param->thd= thd; + param->op_name= "preload_keys"; + param->db_name= table->s->db.str; + param->table_name= table->s->table_name.str; + param->testflag= 0; + _ma_check_print_error(param, "%s", errmsg); + DBUG_RETURN(HA_ADMIN_FAILED); + } + DBUG_RETURN(HA_ADMIN_OK); +} + + +/* + Disable indexes, making it persistent if requested. + + SYNOPSIS + disable_indexes() + mode mode of operation: + HA_KEY_SWITCH_NONUNIQ disable all non-unique keys + HA_KEY_SWITCH_ALL disable all keys + HA_KEY_SWITCH_NONUNIQ_SAVE dis. non-uni. and make persistent + HA_KEY_SWITCH_ALL_SAVE dis. all keys and make persistent + + IMPLEMENTATION + HA_KEY_SWITCH_NONUNIQ is not implemented. + HA_KEY_SWITCH_ALL_SAVE is not implemented. 

  RETURN
    0  ok
    HA_ERR_WRONG_COMMAND  mode not implemented.
    For HA_KEY_SWITCH_ALL the result of maria_disable_indexes() is returned.
*/

int ha_maria::disable_indexes(uint mode)
{
  int error;

  if (mode == HA_KEY_SWITCH_ALL)
  {
    /* call a storage engine function to switch the key map */
    error= maria_disable_indexes(file);
  }
  else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE)
  {
    maria_extra(file, HA_EXTRA_NO_KEYS, 0);
    info(HA_STATUS_CONST);                      // Read new key info
    error= 0;
  }
  else
  {
    /* mode not implemented */
    error= HA_ERR_WRONG_COMMAND;
  }
  return error;
}


/*
  Enable indexes, making it persistent if requested.

  SYNOPSIS
    enable_indexes()
    mode        mode of operation:
                HA_KEY_SWITCH_NONUNIQ      enable all non-unique keys
                HA_KEY_SWITCH_ALL          enable all keys
                HA_KEY_SWITCH_NONUNIQ_SAVE en. non-uni. and make persistent
                HA_KEY_SWITCH_ALL_SAVE     en. all keys and make persistent

  DESCRIPTION
    Enable indexes, which might have been disabled by disable_indexes() before.
    The modes without _SAVE work only if both data and indexes are empty,
    since the MARIA repair would enable them persistently.
    To be sure in these cases, call handler::delete_all_rows() before.

  IMPLEMENTATION
    HA_KEY_SWITCH_NONUNIQ       is not implemented.
    HA_KEY_SWITCH_ALL_SAVE      is not implemented.

  RETURN
    0  ok (also when all indexes were already enabled)
    !=0  Error, among others:
    HA_ERR_CRASHED  data or index is non-empty. Delete all rows and retry.
    HA_ERR_WRONG_COMMAND  mode not implemented.
    HA_ADMIN_INTERNAL_ERROR  the HA_CHECK allocation failed.
    1  repair() did not return HA_ADMIN_OK (HA_KEY_SWITCH_NONUNIQ_SAVE).
*/

int ha_maria::enable_indexes(uint mode)
{
  int error;
  /* Row count before repair; restored before the second repair attempt */
  ha_rows start_rows= file->state->records;
  DBUG_PRINT("info", ("ha_maria::enable_indexes mode: %d", mode));
  if (maria_is_all_keys_active(file->s->state.key_map, file->s->base.keys))
  {
    /* All indexes are enabled already. */
    return 0;
  }

  if (mode == HA_KEY_SWITCH_ALL)
  {
    error= maria_enable_indexes(file);
    /*
       Do not try to repair on error,
       as this could make the enabled state persistent,
       but mode==HA_KEY_SWITCH_ALL forbids it.
    */
  }
  else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE)
  {
    /* Rebuild the missing keys with a quick repair-by-sort */
    THD *thd= table->in_use;
    HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof *param);
    if (!param)
      return HA_ADMIN_INTERNAL_ERROR;

    const char *save_proc_info= thd_proc_info(thd, "Creating index");

    maria_chk_init(param);
    param->op_name= "recreating_index";
    param->testflag= (T_SILENT | T_REP_BY_SORT | T_QUICK |
                     T_CREATE_MISSING_KEYS | T_SAFE_REPAIR);
    /*
      Don't lock and unlock table if it's locked.
      Normally table should be locked.  This test is mostly for safety.
    */
    if (likely(file->lock_type != F_UNLCK))
      param->testflag|= T_NO_LOCKS;

    if (file->create_unique_index_by_sort)
      param->testflag|= T_CREATE_UNIQUE_BY_SORT;

    if (bulk_insert_single_undo == BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR)
    {
      /* Record that a repair is done (read back by end_bulk_insert()) */
      bulk_insert_single_undo= BULK_INSERT_SINGLE_UNDO_AND_REPAIR;
      /*
        Don't bump create_rename_lsn, because UNDO_BULK_INSERT
        should not be skipped in case of crash during repair.
      */
      param->testflag|= T_NO_CREATE_RENAME_LSN;
    }

    param->myf_rw &= ~MY_WAIT_IF_FULL;
    param->orig_sort_buffer_length= THDVAR(thd,sort_buffer_size);
    param->stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method);
    param->tmpdir= &mysql_tmpdir_list;

    /*
      Don't retry repair if we get duplicate key error if
      create_unique_index_by_sort is enabled
      This can be set when doing an ALTER TABLE and enabling unique keys
    */
    if ((error= (repair(thd, param, 0) != HA_ADMIN_OK)) && param->retry_repair &&
        (my_errno != HA_ERR_FOUND_DUPP_KEY ||
         !file->create_unique_index_by_sort))
    {
      sql_print_warning("Warning: Enabling keys got errno %d on %s.%s, "
                        "retrying",
                        my_errno, param->db_name, param->table_name);
      /* Repairing by sort failed. Now try standard repair method. */
      param->testflag &= ~T_REP_BY_SORT;
      file->state->records= start_rows;
      error= (repair(thd, param, 0) != HA_ADMIN_OK);
      /*
        If the standard repair succeeded, clear all error messages which
        might have been set by the first repair. They can still be seen
        with SHOW WARNINGS then.
      */
      if (!error)
        thd->clear_error();
    }
    info(HA_STATUS_CONST);
    thd_proc_info(thd, save_proc_info);
  }
  else
  {
    /* mode not implemented */
    error= HA_ERR_WRONG_COMMAND;
  }
  /* Debug-only hooks: flush the whole log / crash after enabling indexes */
  DBUG_EXECUTE_IF("maria_flush_whole_log",
                  {
                    DBUG_PRINT("maria_flush_whole_log", ("now"));
                    translog_flush(translog_get_horizon());
                  });
  DBUG_EXECUTE_IF("maria_crash_enable_index",
                  {
                    DBUG_PRINT("maria_crash_enable_index", ("now"));
                    DBUG_SUICIDE();
                  });
  return error;
}


/*
  Test if indexes are disabled.


  SYNOPSIS
    indexes_are_disabled()
      no parameters


  RETURN
    0  indexes are not disabled
    1  all indexes are disabled
   [2  non-unique indexes are disabled - NOT YET IMPLEMENTED]
*/

int ha_maria::indexes_are_disabled(void)
{
  /* Thin wrapper; the value comes from maria_indexes_are_disabled() */
  return maria_indexes_are_disabled(file);
}


/*
  prepare for a many-rows insert operation
  e.g. - disable indexes (if they can be recreated fast) or
  activate special bulk-insert optimizations

  SYNOPSIS
    start_bulk_insert(rows, flags)
    rows        Rows to be inserted
                0 if we don't know
    flags       Flags to control index creation

  NOTICE
    Do not forget to call end_bulk_insert() later!
*/

void ha_maria::start_bulk_insert(ha_rows rows, uint flags)
{
  DBUG_ENTER("ha_maria::start_bulk_insert");
  THD *thd= table->in_use;
  MARIA_SHARE *share= file->s;
  /* Set when at least one index is switched off below */
  bool index_disabled= 0;
  DBUG_PRINT("info", ("start_bulk_insert: rows %lu", (ulong) rows));

  /* don't enable row cache if too few rows */
  if ((!rows || rows > MARIA_MIN_ROWS_TO_USE_WRITE_CACHE) && !has_long_unique())
  {
    /* Cache size: read_buff_size, capped by the estimated size of the rows */
    ulonglong size= thd->variables.read_buff_size, tmp;
    if (rows)
    {
      if (file->state->records)
      {
        MARIA_INFO maria_info;
        maria_status(file, &maria_info, HA_STATUS_NO_LOCK |HA_STATUS_VARIABLE);
        set_if_smaller(size, maria_info.mean_reclength * rows);
      }
      else if (table->s->avg_row_length)
        set_if_smaller(size, (size_t) (table->s->avg_row_length * rows));
    }
    tmp= (ulong) size;                          // Safe because of limits
    maria_extra(file, HA_EXTRA_WRITE_CACHE, (void*) &tmp);
  }

  /*
    Here can_enable_indexes means "all keys are currently active"; it is
    reassigned at the end of the function to "an index was disabled here".
  */
  can_enable_indexes= (maria_is_all_keys_active(share->state.key_map,
                                                share->base.keys));
  bulk_insert_single_undo= BULK_INSERT_NONE;

  if (!(specialflag & SPECIAL_SAFE_MODE))
  {
    /*
       Only disable old index if the table was empty and we are inserting
       a lot of rows.
       We should not do this for only a few rows as this is slower and
       we don't want to update the key statistics based on only a few rows.
       Index file rebuild requires an exclusive lock, so if versioning is on
       don't do it (see how ha_maria::store_lock() tries to predict repair).
       We can repair index only if we have an exclusive (TL_WRITE) lock or
       if this is inside an ALTER TABLE, in which case lock_type == TL_UNLOCK.

       To see if table is empty, we shouldn't rely on the old record
       count from our transaction's start (if that old count is 0 but
       now there are records in the table, we would wrongly destroy
       them).  So we need to look at share->state.state.records.  As a
       safety net for now, we don't remove the test of
       file->state->records, because there is uncertainty on what will
       happen during repair if the two states disagree.

       We also have to check in case of transactional tables that the
       user has not used LOCK TABLE on the table twice.
    */
    if ((file->state->records == 0) &&
        (share->state.state.records == 0) && can_enable_indexes &&
        (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES) &&
        (file->lock.type == TL_WRITE || file->lock.type == TL_UNLOCK) &&
        (!share->have_versioning || !share->now_transactional ||
         file->used_tables->use_count == 1))
    {
      /**
         @todo for a single-row INSERT SELECT, we will go into repair, which
         is more costly (flushes, syncs) than a row write.
      */
      if (file->open_flags & HA_OPEN_INTERNAL_TABLE)
      {
        /* Internal table; If we get a duplicate something is very wrong */
        file->update|= HA_STATE_CHANGED;
        index_disabled= share->base.keys > 0;
        maria_clear_all_keys_active(file->s->state.key_map);
      }
      else
      {
        /* With HA_CREATE_UNIQUE_INDEX_BY_SORT unique keys are disabled too */
        my_bool all_keys= MY_TEST(flags & HA_CREATE_UNIQUE_INDEX_BY_SORT);
        /*
          Deactivate all indexes that can be recreated fast.
          These include packed keys on which sorting will use more temporary
          space than the max allowed file length or for which the unpacked keys
          will take much more space than packed keys.
          Note that 'rows' may be zero for the case when we don't know how many
          rows we will put into the file.
        */
        /* Shadows the outer 'share'; both are initialised from file->s */
        MARIA_SHARE *share= file->s;
        MARIA_KEYDEF    *key=share->keyinfo;
        uint          i;

        DBUG_ASSERT(share->state.state.records == 0 &&
                    (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES));
        for (i=0 ; i < share->base.keys ; i++,key++)
        {
          if (!(key->flag & (HA_SPATIAL | HA_AUTO_KEY | HA_RTREE_INDEX)) &&
              ! maria_too_big_key_for_sort(key,rows) && share->base.auto_key != i+1 &&
              (all_keys || !(key->flag & HA_NOSAME)) &&
              table->key_info[i].algorithm != HA_KEY_ALG_LONG_HASH)
          {
            maria_clear_key_active(share->state.key_map, i);
            index_disabled= 1;
            file->update|= HA_STATE_CHANGED;
            file->create_unique_index_by_sort= all_keys;
          }
        }
      }
      if (share->now_transactional)
      {
        bulk_insert_single_undo= BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR;
        write_log_record_for_bulk_insert(file);
        /*
          Pages currently in the page cache have type PAGECACHE_LSN_PAGE, we
          are not allowed to overwrite them with PAGECACHE_PLAIN_PAGE, so
          throw them away. It is not losing data, because we just wrote and
          forced an UNDO which will for sure empty the table if we crash. The
          upcoming unique-key insertions however need a proper index, so we
          cannot leave the corrupted on-disk index file, thus we truncate it.

          The following call will log the truncate and update the lsn for the table
          to ensure that all redo's before this will be ignored.
        */
        maria_delete_all_rows(file);
        _ma_tmp_disable_logging_for_table(file, TRUE);
      }
    }
    else if (!file->bulk_insert &&
             (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT))
    {
      /* Indexes stay enabled: use the bulk-insert buffer instead */
      maria_init_bulk_insert(file,
                             (size_t) thd->variables.bulk_insert_buff_size,
                             rows);
    }
  }
  /* end_bulk_insert() re-enables indexes only if one was disabled here */
  can_enable_indexes= index_disabled;
  DBUG_VOID_RETURN;
}


/*
  end special bulk-insert optimizations,
  which have been activated by start_bulk_insert().

  SYNOPSIS
    end_bulk_insert()
    no arguments

  RETURN
    0     OK
    != 0  Error (the first error that occurred; my_errno is set to the
          errno saved for that first error)
*/

int ha_maria::end_bulk_insert()
{
  int first_error, first_errno= 0, error;
  my_bool abort= file->s->deleting, empty_table= 0;
  uint enable_index_mode= HA_KEY_SWITCH_NONUNIQ_SAVE;
  DBUG_ENTER("ha_maria::end_bulk_insert");

  if ((first_error= maria_end_bulk_insert(file, abort)))
  {
    first_errno= my_errno;
    abort= 1;
  }

  if ((error= maria_extra(file, HA_EXTRA_NO_CACHE, 0)))
  {
    if (!first_error)
    {
      first_error= error;
      first_errno= my_errno;
    }
    abort= 1;
  }

  if (bulk_insert_single_undo != BULK_INSERT_NONE)
  {
    if (log_not_redoable_operation("BULK_INSERT"))
    {
      /* Got lock timeout. revert back to empty file and give error */
      if (!first_error)
      {
        first_error= 1;
        first_errno= my_errno;
      }
      enable_index_mode= HA_KEY_SWITCH_ALL;
      empty_table= 1;
      /*
        Ignore all changed pages, required by _ma_reenable_logging_for_table()
      */
      _ma_flush_table_files(file, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX,
                            FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED);
    }
  }

  /* Indexes are re-enabled only if nothing failed in the two calls above */
  if (!abort && can_enable_indexes)
  {
    if ((error= enable_indexes(enable_index_mode)))
    {
      if (!first_error)
      {
        first_error= 1;
        first_errno= my_errno;
      }
    }
  }
  if (bulk_insert_single_undo != BULK_INSERT_NONE)
  {
    /*
      Table was transactional just before start_bulk_insert().
      No need to flush pages if we did a repair (which already flushed).
    */
    if ((error= _ma_reenable_logging_for_table(file,
                                               bulk_insert_single_undo ==
                                               BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR)) &&
        !empty_table)
    {
      if (!first_error)
      {
        first_error= 1;
        first_errno= my_errno;
      }
    }
    bulk_insert_single_undo= BULK_INSERT_NONE;  // Safety if called again
  }
  if (empty_table)
    maria_delete_all_rows(file);

  can_enable_indexes= 0;
  if (first_error)
    my_errno= first_errno;
  DBUG_RETURN(first_error);
}


/*
  Check the table and repair it if needed.

  A table that is only marked STATE_MOVED is zerofilled (unless
  aria_readonly is set). Otherwise, if maria_recover_options is set, the
  table is checked (when not already marked crashed) and repaired if the
  check fails or it was marked crashed.

  RETURN
    0   ok
    1   zerofill or repair failed, or the table needs repair but
        maria_recover_options is not set
*/

bool ha_maria::check_and_repair(THD *thd)
{
  int error, crashed;
  HA_CHECK_OPT check_opt;
  /* thd->set_query() is called below; the original query is restored at exit */
  const CSET_STRING query_backup= thd->query_string;
  DBUG_ENTER("ha_maria::check_and_repair");

  check_opt.init();
  check_opt.flags= T_MEDIUM | T_AUTO_REPAIR;

  error= 1;
  if (!aria_readonly &&
      (file->s->state.changed & (STATE_CRASHED_FLAGS | STATE_MOVED)) ==
      STATE_MOVED)
  {
    /* Remove error about crashed table */
    thd->get_stmt_da()->clear_warning_info(thd->query_id);
    push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE,
                        ER_CRASHED_ON_USAGE,
                        "Zerofilling moved table %s", table->s->path.str);
    sql_print_information("Zerofilling moved table: '%s'",
                          table->s->path.str);
    if (!(error= zerofill(thd, &check_opt)))
      DBUG_RETURN(0);
  }

  /*
    if we got this far - the table is crashed.
    but don't auto-repair if maria_recover_options is not set
  */
  if (!maria_recover_options)
    DBUG_RETURN(error);

  error= 0;
  // Don't use quick if deleted rows
  if (!file->state->del && (maria_recover_options & HA_RECOVER_QUICK))
    check_opt.flags |= T_QUICK;

  thd->set_query((char*) table->s->table_name.str,
                 (uint) table->s->table_name.length, system_charset_info);

  if (!(crashed= maria_is_crashed(file)))
  {
    sql_print_warning("Checking table: '%s'", table->s->path.str);
    crashed= check(thd, &check_opt);
  }

  if (crashed)
  {
    bool save_log_all_errors;
    sql_print_warning("Recovering table: '%s'", table->s->path.str);
    save_log_all_errors= thd->log_all_errors;
    thd->log_all_errors|= (thd->variables.log_warnings > 2);
    check_opt.flags=
      ((maria_recover_options & HA_RECOVER_BACKUP ? T_BACKUP_DATA : 0) |
       (maria_recover_options & HA_RECOVER_FORCE ? 0 : T_SAFE_REPAIR) |
       T_AUTO_REPAIR);
    if (repair(thd, &check_opt))
      error= 1;
    thd->log_all_errors= save_log_all_errors;
  }
  thd->set_query(query_backup);
  DBUG_RETURN(error);
}


/*
  Return true if the table is marked crashed or moved, or if locking is
  disabled (my_disable_locking) and the stored open_count is non-zero.
*/

bool ha_maria::is_crashed() const
{
  return (file->s->state.changed & (STATE_CRASHED_FLAGS | STATE_MOVED) ||
          (my_disable_locking && file->s->state.open_count));
}

/*
  Make the calling method report ER_CHECK_NOT_IMPLEMENTED and return 1 when
  the table is locked with TL_WRITE_CONCURRENT_INSERT and is not a sequence.
  Expands to a 'return', so it can only be used in methods returning int.
*/
#define CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING(msg) \
  do { \
    if (file->lock.type == TL_WRITE_CONCURRENT_INSERT && !table->s->sequence) \
    { \
      my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), msg); \
      return 1; \
    } \
  } while(0)

/* Update a row; refused (returns 1) under a concurrent-insert write lock */

int ha_maria::update_row(const uchar * old_data, const uchar * new_data)
{
  CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING("UPDATE in WRITE CONCURRENT");
  return maria_update(file, old_data, new_data);
}


/* Delete a row; refused (returns 1) under a concurrent-insert write lock */

int ha_maria::delete_row(const uchar * buf)
{
  CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING("DELETE in WRITE CONCURRENT");
  return maria_delete(file, buf);
}

/* Read a row by key on the active index; returns the maria_rkey() result */

int ha_maria::index_read_map(uchar * buf, const uchar * key,
                             key_part_map keypart_map,
                             enum ha_rkey_function find_flag)
{
  DBUG_ASSERT(inited == INDEX);
  register_handler(file);
  int error= maria_rkey(file, buf, active_index, key, keypart_map, find_flag);
  return error;
}


/*
  Read a row by key on the given index, which does not have to be the
  active one; returns the maria_rkey() result.
*/

int ha_maria::index_read_idx_map(uchar * buf, uint index, const uchar * key,
                                 key_part_map keypart_map,
                                 enum ha_rkey_function find_flag)
{
  int error;
  register_handler(file);

  /* Use the pushed index condition if it matches the index we're scanning */
  end_range= NULL;
  if (index == pushed_idx_cond_keyno)
    ma_set_index_cond_func(file, handler_index_cond_check, this);

  error= maria_rkey(file, buf, index, key, keypart_map, find_flag);

  /* Always unset the condition callback again after the single lookup */
  ma_set_index_cond_func(file, NULL, 0);
  return error;
}


/* maria_rkey() on the active index with HA_READ_PREFIX_LAST */

int ha_maria::index_read_last_map(uchar * buf, const uchar * key,
                                  key_part_map keypart_map)
{
  DBUG_ENTER("ha_maria::index_read_last_map");
  DBUG_ASSERT(inited == INDEX);
  register_handler(file);
  int error= maria_rkey(file, buf, active_index, key, keypart_map,
                        HA_READ_PREFIX_LAST);
  DBUG_RETURN(error);
}


/* Returns the result of maria_rnext() on the active index */

int ha_maria::index_next(uchar * buf)
{
  DBUG_ASSERT(inited == INDEX);
  register_handler(file);
  int error= maria_rnext(file, buf, active_index);
  return error;
}


/* Returns the result of maria_rprev() on the active index */

int ha_maria::index_prev(uchar * buf)
{
  DBUG_ASSERT(inited == INDEX);
  register_handler(file);
  int error= maria_rprev(file, buf, active_index);
  return error;
}


/* Returns the result of maria_rfirst() on the active index */

int ha_maria::index_first(uchar * buf)
{
  DBUG_ASSERT(inited == INDEX);
  register_handler(file);
  int error= maria_rfirst(file, buf, active_index);
  return error;
}


/* Returns the result of maria_rlast() on the active index */

int ha_maria::index_last(uchar * buf)
{
  DBUG_ASSERT(inited == INDEX);
  register_handler(file);
  int error= maria_rlast(file, buf, active_index);
  return error;
}


int ha_maria::index_next_same(uchar * buf,
                              const uchar *key __attribute__ ((unused)),
                              uint length __attribute__ ((unused)))
{
  int error;
  DBUG_ASSERT(inited == INDEX);
  register_handler(file);
  /*
    TODO: Delete this loop in Maria 1.5 as versioning will ensure this never
    happens
  */
do + { + error= maria_rnext_same(file,buf); + } while (error == HA_ERR_RECORD_DELETED); + return error; +} + + +int ha_maria::index_init(uint idx, bool sorted) +{ + active_index=idx; + if (pushed_idx_cond_keyno == idx) + ma_set_index_cond_func(file, handler_index_cond_check, this); + return 0; +} + + +int ha_maria::index_end() +{ + active_index=MAX_KEY; + ma_set_index_cond_func(file, NULL, 0); + in_range_check_pushed_down= FALSE; + ds_mrr.dsmrr_close(); + return 0; +} + + +int ha_maria::rnd_init(bool scan) +{ + if (scan) + return maria_scan_init(file); + return maria_reset(file); // Free buffers +} + + +int ha_maria::rnd_end() +{ + ds_mrr.dsmrr_close(); + /* Safe to call even if we don't have started a scan */ + maria_scan_end(file); + return 0; +} + + +int ha_maria::rnd_next(uchar *buf) +{ + register_handler(file); + return maria_scan(file, buf); +} + + +int ha_maria::remember_rnd_pos() +{ + register_handler(file); + return (*file->s->scan_remember_pos)(file, &remember_pos); +} + + +int ha_maria::restart_rnd_next(uchar *buf) +{ + int error; + register_handler(file); + if ((error= (*file->s->scan_restore_pos)(file, remember_pos))) + return error; + return rnd_next(buf); +} + + +int ha_maria::rnd_pos(uchar *buf, uchar *pos) +{ + register_handler(file); + int error= maria_rrnd(file, buf, my_get_ptr(pos, ref_length)); + return error; +} + + +void ha_maria::position(const uchar *record) +{ + my_off_t row_position= maria_position(file); + my_store_ptr(ref, ref_length, row_position); +} + + +int ha_maria::info(uint flag) +{ + MARIA_INFO maria_info; + char name_buff[FN_REFLEN]; + + (void) maria_status(file, &maria_info, flag); + if (flag & HA_STATUS_VARIABLE) + { + stats.records= maria_info.records; + stats.deleted= maria_info.deleted; + stats.data_file_length= maria_info.data_file_length; + stats.index_file_length= maria_info.index_file_length; + stats.delete_length= maria_info.delete_length; + stats.check_time= maria_info.check_time; + stats.mean_rec_length= 
maria_info.mean_reclength; + stats.checksum= file->state->checksum; + } + if (flag & HA_STATUS_CONST) + { + TABLE_SHARE *share= table->s; + stats.max_data_file_length= maria_info.max_data_file_length; + stats.max_index_file_length= maria_info.max_index_file_length; + stats.create_time= maria_info.create_time; + ref_length= maria_info.reflength; + share->db_options_in_use= maria_info.options; + stats.block_size= maria_block_size; + stats.mrr_length_per_rec= maria_info.reflength + 8; // 8 = MY_MAX(sizeof(void *)) + + /* Update share */ + share->keys_in_use.set_prefix(share->keys); + share->keys_in_use.intersect_extended(maria_info.key_map); + share->keys_for_keyread.intersect(share->keys_in_use); + share->db_record_offset= maria_info.record_offset; + if (share->key_parts) + { + double *from= maria_info.rec_per_key; + KEY *key, *key_end; + for (key= table->key_info, key_end= key + share->keys; + key < key_end ; key++) + { + ulong *to= key->rec_per_key; + /* Some temporary tables does not allocate rec_per_key */ + if (to) + { + for (ulong *end= to+ key->user_defined_key_parts ; + to < end ; + to++, from++) + *to= (ulong) (*from + 0.5); + } + } + } + /* + Set data_file_name and index_file_name to point at the symlink value + if table is symlinked (Ie; Real name is not same as generated name) + */ + data_file_name= index_file_name= 0; + fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_DEXT, + MY_APPEND_EXT | MY_UNPACK_FILENAME); + if (strcmp(name_buff, maria_info.data_file_name) && + maria_info.data_file_name[0]) + data_file_name= maria_info.data_file_name; + fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_IEXT, + MY_APPEND_EXT | MY_UNPACK_FILENAME); + if (strcmp(name_buff, maria_info.index_file_name) && + maria_info.index_file_name[0]) + index_file_name=maria_info.index_file_name; + } + if (flag & HA_STATUS_ERRKEY) + { + errkey= maria_info.errkey; + my_store_ptr(dup_ref, ref_length, maria_info.dup_key_pos); + } + if (flag & 
HA_STATUS_TIME) + stats.update_time= maria_info.update_time; + if (flag & HA_STATUS_AUTO) + stats.auto_increment_value= maria_info.auto_increment; + + return 0; +} + + +int ha_maria::extra(enum ha_extra_function operation) +{ + int tmp; + TRN *old_trn= file->trn; + if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_KEYREAD) + return 0; +#ifdef NOT_USED + if (operation == HA_EXTRA_MMAP && !opt_maria_use_mmap) + return 0; +#endif + if (operation == HA_EXTRA_WRITE_CACHE && has_long_unique()) + return 0; + + /* + We have to set file->trn here because in some cases we call + extern_lock(F_UNLOCK) (which resets file->trn) followed by maria_close() + without calling commit/rollback in between. If file->trn is not set + we can't remove file->share from the transaction list in the extra() call. + + In current code we don't have to do this for HA_EXTRA_PREPARE_FOR_RENAME + as this is only used the intermediate table used by ALTER TABLE which + is not part of the transaction (it's not in the TRN list). Better to + keep this for now, to not break anything in a stable release. + When HA_EXTRA_PREPARE_FOR_RENAME is not handled below, we can change + the warnings in _ma_remove_table_from_trnman() to asserts. + + table->in_use is not set in the case this is a done as part of closefrm() + as part of drop table. + */ + + if (file->s->now_transactional && table->in_use && + (operation == HA_EXTRA_PREPARE_FOR_DROP || + operation == HA_EXTRA_PREPARE_FOR_RENAME || + operation == HA_EXTRA_PREPARE_FOR_FORCED_CLOSE)) + { + THD *thd= table->in_use; + file->trn= THD_TRN; + } + DBUG_ASSERT(file->s->base.born_transactional || file->trn == 0 || + file->trn == &dummy_transaction_object); + + tmp= maria_extra(file, operation, 0); + /* + Restore trn if it was changed above. + Note that table could be removed from trn->used_tables and + trn->used_instances if trn was set and some of the above operations + was used. 
This is ok as the table should not be part of any transaction + after this and thus doesn't need to be part of any of the above lists. + */ + file->trn= old_trn; + return tmp; +} + +int ha_maria::reset(void) +{ + ma_set_index_cond_func(file, NULL, 0); + ds_mrr.dsmrr_close(); + if (file->trn) + { + /* Next statement is a new statement. Ensure it's logged */ + trnman_set_flags(file->trn, + trnman_get_flags(file->trn) & ~TRN_STATE_INFO_LOGGED); + } + return maria_reset(file); +} + +/* To be used with WRITE_CACHE and EXTRA_CACHE */ + +int ha_maria::extra_opt(enum ha_extra_function operation, ulong cache_size) +{ + if ((specialflag & SPECIAL_SAFE_MODE) && operation == HA_EXTRA_WRITE_CACHE) + return 0; + return maria_extra(file, operation, (void*) &cache_size); +} + + +bool ha_maria::auto_repair(int error) const +{ + /* Always auto-repair moved tables (error == HA_ERR_OLD_FILE) */ + return ((MY_TEST(maria_recover_options & HA_RECOVER_ANY) && + error == HA_ERR_CRASHED_ON_USAGE) || + error == HA_ERR_OLD_FILE); + +} + + +int ha_maria::delete_all_rows() +{ + THD *thd= table->in_use; + TRN *trn= file->trn; + CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING("TRUNCATE in WRITE CONCURRENT"); +#ifdef EXTRA_DEBUG + if (trn && ! 
(trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED)) + { + trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED | + TRN_STATE_TABLES_CAN_CHANGE); + (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), thd->query_length()); + } +#endif + /* + If we are under LOCK TABLES, we have to do a commit as + delete_all_rows() can't be rolled back + */ + if (table->in_use->locked_tables_mode && trn && + trnman_has_locked_tables(trn)) + { + int error; + if ((error= implicit_commit(thd, 1))) + return error; + } + + /* Note that this can't be rolled back */ + return maria_delete_all_rows(file); +} + + +int ha_maria::delete_table(const char *name) +{ + THD *thd= current_thd; + (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), thd->query_length()); + return maria_delete_table(name); +} + + +/* This is mainly for temporary tables, so no logging necessary */ + +void ha_maria::drop_table(const char *name) +{ + DBUG_ASSERT(!file || file->s->temporary); + (void) ha_close(); + (void) maria_delete_table_files(name, 1, MY_WME); +} + + +void ha_maria::change_table_ptr(TABLE *table_arg, TABLE_SHARE *share) +{ + handler::change_table_ptr(table_arg, share); + if (file) + file->external_ref= table_arg; +} + + +int ha_maria::external_lock(THD *thd, int lock_type) +{ + int result= 0, result2; + DBUG_ENTER("ha_maria::external_lock"); + file->external_ref= (void*) table; // For ma_killed() + /* + We don't test now_transactional because it may vary between lock/unlock + and thus confuse our reference counting. + It is critical to skip non-transactional tables: user-visible temporary + tables get an external_lock() when read/written for the first time, but no + corresponding unlock (they just stay locked and are later dropped while + locked); if a tmp table was transactional, "SELECT FROM non_tmp, tmp" + would never commit as its "locked_tables" count would stay 1. 
+ When Maria has has_transactions()==TRUE, open_temporary_table() + (sql_base.cc) will use TRANSACTIONAL_TMP_TABLE and thus the + external_lock(F_UNLCK) will happen and we can then allow the user to + create transactional temporary tables. + */ + if (file->s->base.born_transactional) + { + /* Transactional table */ + if (lock_type != F_UNLCK) + { + if (file->trn) + { + /* This can only happen with tables created with clone() */ + DBUG_PRINT("info",("file->trn: %p", file->trn)); + trnman_increment_locked_tables(file->trn); + } + + if (!thd->transaction->on) + { + /* + No need to log REDOs/UNDOs. If this is an internal temporary table + which will be renamed to a permanent table (like in ALTER TABLE), + the rename happens after unlocking so will be durable (and the table + will get its create_rename_lsn). + Note: if we wanted to enable users to have an old backup and apply + tons of archived logs to roll-forward, we could then not disable + REDOs/UNDOs in this case. + */ + DBUG_PRINT("info", ("Disabling logging for table")); + _ma_tmp_disable_logging_for_table(file, TRUE); + file->autocommit= 0; + } + else + file->autocommit= !(thd->variables.option_bits & + (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)); +#ifndef ARIA_HAS_TRANSACTIONS + /* + Until Aria has full transactions support, including MVCC support for + delete and update and purging of old states, we have to commit for + every statement. + */ + file->autocommit=1; +#endif + } + else + { + /* We have to test for THD_TRN to protect against implicit commits */ + TRN *trn= (file->trn != &dummy_transaction_object && THD_TRN ? file->trn : 0); + /* End of transaction */ + + /* + We always re-enable, don't rely on thd->transaction.on as it is + sometimes reset to true after unlocking (see mysql_truncate() for a + partitioned table based on Maria). 
+ Note that we can come here without having an exclusive lock on the + table, for example in this case: + external_lock(F_(WR|RD)LCK); thr_lock() which fails due to lock + abortion; external_lock(F_UNLCK). Fortunately, the re-enabling happens + only if we were the thread which disabled logging. + */ + if (_ma_reenable_logging_for_table(file, TRUE)) + DBUG_RETURN(1); + _ma_reset_trn_for_table(file); + /* + Ensure that file->state points to the current number of rows. This + is needed if someone calls maria_info() without first doing an + external lock of the table + */ + file->state= &file->s->state.state; + if (trn) + { + DBUG_PRINT("info", + ("locked_tables: %u", trnman_has_locked_tables(trn))); + DBUG_ASSERT(trnman_has_locked_tables(trn) > 0); + if (trnman_has_locked_tables(trn) && + !trnman_decrement_locked_tables(trn)) + { + /* + OK should not have been sent to client yet (ACID). + This is a bit excessive, ACID requires this only if there are some + changes to commit (rollback shouldn't be tested). + */ + DBUG_ASSERT(!thd->get_stmt_da()->is_sent() || + thd->killed); + /* + If autocommit, commit transaction. This can happen when open and + lock tables as part of creating triggers, in which case commit + is not called. + Until ARIA_HAS_TRANSACTIONS is not defined, always commit. + */ + if (file->autocommit) + { + if (ma_commit(trn)) + result= HA_ERR_COMMIT_ERROR; + thd_set_ha_data(thd, maria_hton, 0); + } + } + trnman_set_flags(trn, trnman_get_flags(trn) & ~ TRN_STATE_INFO_LOGGED); + } + } + } /* if transactional table */ + if ((result2= maria_lock_database(file, !table->s->tmp_table ? + lock_type : ((lock_type == F_UNLCK) ? 
+ F_UNLCK : F_EXTRA_LCK)))) + result= result2; + if (!file->s->base.born_transactional) + file->state= &file->s->state.state; // Restore state if clone + + /* Remember stack end for this thread */ + file->stack_end_ptr= &ha_thd()->mysys_var->stack_ends_here; + DBUG_RETURN(result); +} + +int ha_maria::start_stmt(THD *thd, thr_lock_type lock_type) +{ + TRN *trn; + if (file->s->base.born_transactional) + { + trn= THD_TRN; + DBUG_ASSERT(trn); // this may be called only after external_lock() + DBUG_ASSERT(trnman_has_locked_tables(trn)); + DBUG_ASSERT(lock_type != TL_UNLOCK); + DBUG_ASSERT(file->trn == trn); + + /* + As external_lock() was already called, don't increment locked_tables. + Note that we call the function below possibly several times when + statement starts (once per table). This is ok as long as that function + does cheap operations. Otherwise, we will need to do it only on first + call to start_stmt(). + */ + trnman_new_statement(trn); + +#ifdef EXTRA_DEBUG + if (!(trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED) && + trnman_get_flags(trn) & TRN_STATE_TABLES_CAN_CHANGE) + { + trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED); + (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), + thd->query_length()); + } +#endif + } + return 0; +} + + +/* + Reset THD_TRN and all file->trn related to the transaction + This is needed as some calls, like extra() or external_lock() may access + it before next transaction is started +*/ + +static void reset_thd_trn(THD *thd, MARIA_HA *first_table) +{ + DBUG_ENTER("reset_thd_trn"); + thd_set_ha_data(thd, maria_hton, 0); + MARIA_HA *next; + for (MARIA_HA *table= first_table; table ; table= next) + { + next= table->trn_next; + _ma_reset_trn_for_table(table); + + /* + If table has changed by this statement, invalidate it from the query + cache + */ + if (table->row_changes != table->start_row_changes) + { + table->start_row_changes= table->row_changes; + 
DBUG_ASSERT(table->s->chst_invalidator != NULL); + (*table->s->chst_invalidator)(table->s->data_file_name.str); + } + } + DBUG_VOID_RETURN; +} + +bool ha_maria::has_active_transaction(THD *thd) +{ + return (maria_hton && THD_TRN); +} + +/** + Performs an implicit commit of the Maria transaction and creates a new + one. + + This can be considered a hack. When Maria loses HA_NO_TRANSACTIONS it will + be participant in the connection's transaction and so the implicit commits + (ha_commit()) (like in end_active_trans()) will do the implicit commit + without need to call this function which can then be removed. + + @param thd THD object + @param new_trn if a new transaction should be created; a new + transaction is not needed when we know that the + tables will be unlocked very soon. +*/ + +int ha_maria::implicit_commit(THD *thd, bool new_trn) +{ +#ifndef MARIA_CANNOT_ROLLBACK +#error this method should be removed +#endif + TRN *trn; + int error; + uint locked_tables; + extern my_bool plugins_are_initialized; + MARIA_HA *used_tables, *trn_next; + DBUG_ENTER("ha_maria::implicit_commit"); + + if (!maria_hton || !plugins_are_initialized || !(trn= THD_TRN)) + DBUG_RETURN(0); + if (!new_trn && (thd->locked_tables_mode == LTM_LOCK_TABLES || + thd->locked_tables_mode == LTM_PRELOCKED_UNDER_LOCK_TABLES)) + { + /* + No commit inside LOCK TABLES. + + Note that we come here only at the end of the top statement + (dispatch_command()), we are never committing inside a sub-statement./ + */ + DBUG_PRINT("info", ("locked_tables, skipping")); + DBUG_RETURN(0); + } + + /* Prepare to move used_instances and locked tables to new TRN object */ + locked_tables= trnman_has_locked_tables(trn); + trnman_reset_locked_tables(trn, 0); + relink_trn_used_instances(&used_tables, trn); + + error= 0; + if (unlikely(ma_commit(trn))) + error= HA_ERR_COMMIT_ERROR; + if (!new_trn) + { + reset_thd_trn(thd, used_tables); + goto end; + } + + /* + We need to create a new transaction and put it in THD_TRN. 
Indeed, + tables may be under LOCK TABLES, and so they will start the next + statement assuming they have a trn (see ha_maria::start_stmt()). + */ + trn= trnman_new_trn(& thd->transaction->wt); + thd_set_ha_data(thd, maria_hton, trn); + if (unlikely(trn == NULL)) + { + reset_thd_trn(thd, used_tables); + error= HA_ERR_OUT_OF_MEM; + goto end; + } + /* + Move all locked tables to the new transaction + We must do it here as otherwise file->thd and file->state may be + stale pointers. We can't do this in start_stmt() as we don't know + when we should call _ma_setup_live_state() and in some cases, like + in check table, we use the table without calling start_stmt(). + */ + + for (MARIA_HA *handler= used_tables; handler ; + handler= trn_next) + { + trn_next= handler->trn_next; + DBUG_ASSERT(handler->s->base.born_transactional); + + /* If handler uses versioning */ + if (handler->s->lock_key_trees) + { + /* _ma_set_trn_for_table() will be called indirectly */ + if (_ma_setup_live_state(handler)) + error= HA_ERR_OUT_OF_MEM; + } + else + _ma_set_trn_for_table(handler, trn); + } + /* This is just a commit, tables stay locked if they were: */ + trnman_reset_locked_tables(trn, locked_tables); + +end: + DBUG_RETURN(error); +} + + +THR_LOCK_DATA **ha_maria::store_lock(THD *thd, + THR_LOCK_DATA **to, + enum thr_lock_type lock_type) +{ + /* Test if we can fix test below */ + DBUG_ASSERT(lock_type != TL_UNLOCK && + (lock_type == TL_IGNORE || file->lock.type == TL_UNLOCK)); + if (lock_type != TL_IGNORE && file->lock.type == TL_UNLOCK) + { + const enum enum_sql_command sql_command= thd->lex->sql_command; + /* + We have to disable concurrent inserts for INSERT ... SELECT or + INSERT/UPDATE/DELETE with sub queries if we are using statement based + logging. We take the safe route here and disable this for all commands + that only does reading that are not SELECT. 
+ */ + if (lock_type <= TL_READ_HIGH_PRIORITY && + !thd->is_current_stmt_binlog_format_row() && + (sql_command != SQLCOM_SELECT && + sql_command != SQLCOM_LOCK_TABLES) && + (thd->variables.option_bits & OPTION_BIN_LOG) && + mysql_bin_log.is_open()) + lock_type= TL_READ_NO_INSERT; + else if (lock_type == TL_WRITE_CONCURRENT_INSERT) + { + const enum enum_duplicates duplicates= thd->lex->duplicates; + /* + Explanation for the 3 conditions below, in order: + + - Bulk insert may use repair, which will cause problems if other + threads try to read/insert to the table: disable versioning. + Note that our read of file->state->records is incorrect, as such + variable may have changed when we come to start_bulk_insert() (worse + case: we see != 0 so allow versioning, start_bulk_insert() sees 0 and + uses repair). This is prevented because start_bulk_insert() will not + try repair if we enabled versioning. + - INSERT SELECT ON DUPLICATE KEY UPDATE comes here with + TL_WRITE_CONCURRENT_INSERT but shouldn't because it can do + update/delete of a row and versioning doesn't support that + - same for LOAD DATA CONCURRENT REPLACE. 
+ */ + if ((file->state->records == 0) || + (sql_command == SQLCOM_INSERT_SELECT && duplicates == DUP_UPDATE) || + (sql_command == SQLCOM_LOAD && duplicates == DUP_REPLACE)) + lock_type= TL_WRITE; + } + file->lock.type= lock_type; + } + *to++= &file->lock; + return to; +} + + +void ha_maria::update_create_info(HA_CREATE_INFO *create_info) +{ + ha_maria::info(HA_STATUS_AUTO | HA_STATUS_CONST); + if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) + { + create_info->auto_increment_value= stats.auto_increment_value; + } + create_info->data_file_name= data_file_name; + create_info->index_file_name= index_file_name; + /* + Keep user-specified row_type for ALTER, + but show the actually used one in SHOW + */ + if (create_info->row_type != ROW_TYPE_DEFAULT && + !(thd_sql_command(ha_thd()) == SQLCOM_ALTER_TABLE)) + create_info->row_type= get_row_type(); + /* + Show always page checksums, as this can be forced with + maria_page_checksums variable + */ + if (create_info->page_checksum == HA_CHOICE_UNDEF) + create_info->page_checksum= + (file->s->options & HA_OPTION_PAGE_CHECKSUM) ? 
HA_CHOICE_YES : + HA_CHOICE_NO; +} + + +enum row_type ha_maria::get_row_type() const +{ + switch (file->s->data_file_type) { + case STATIC_RECORD: return ROW_TYPE_FIXED; + case DYNAMIC_RECORD: return ROW_TYPE_DYNAMIC; + case BLOCK_RECORD: return ROW_TYPE_PAGE; + case COMPRESSED_RECORD: return ROW_TYPE_COMPRESSED; + default: return ROW_TYPE_NOT_USED; + } +} + + +static enum data_file_type maria_row_type(HA_CREATE_INFO *info) +{ + if (info->transactional == HA_CHOICE_YES) + return BLOCK_RECORD; + switch (info->row_type) { + case ROW_TYPE_FIXED: return STATIC_RECORD; + case ROW_TYPE_DYNAMIC: return DYNAMIC_RECORD; + default: return BLOCK_RECORD; + } +} + + +int ha_maria::create(const char *name, TABLE *table_arg, + HA_CREATE_INFO *ha_create_info) +{ + int error; + uint create_flags= 0, record_count= 0, i; + char buff[FN_REFLEN]; + MARIA_KEYDEF *keydef; + MARIA_COLUMNDEF *recinfo; + MARIA_CREATE_INFO create_info; + TABLE_SHARE *share= table_arg->s; + uint options= share->db_options_in_use; + ha_table_option_struct *table_options= table_arg->s->option_struct; + enum data_file_type row_type; + THD *thd= current_thd; + DBUG_ENTER("ha_maria::create"); + + for (i= 0; i < share->keys; i++) + { + if (table_arg->key_info[i].flags & HA_USES_PARSER) + { + create_flags|= HA_CREATE_RELIES_ON_SQL_LAYER; + break; + } + } + /* Note: BLOCK_RECORD is used if table is transactional */ + row_type= maria_row_type(ha_create_info); + if (ha_create_info->transactional == HA_CHOICE_YES && + ha_create_info->row_type != ROW_TYPE_PAGE && + ha_create_info->row_type != ROW_TYPE_NOT_USED && + ha_create_info->row_type != ROW_TYPE_DEFAULT) + push_warning(thd, Sql_condition::WARN_LEVEL_NOTE, + ER_ILLEGAL_HA_CREATE_OPTION, + "Row format set to PAGE because of TRANSACTIONAL=1 option"); + + if (share->table_type == TABLE_TYPE_SEQUENCE) + { + /* For sequences, the simples record type is appropriate */ + row_type= STATIC_RECORD; + ha_create_info->transactional= HA_CHOICE_NO; + } + + bzero((char*) 
&create_info, sizeof(create_info)); + if ((error= table2maria(table_arg, row_type, &keydef, &recinfo, + &record_count, &create_info))) + DBUG_RETURN(error); /* purecov: inspected */ + create_info.max_rows= share->max_rows; + create_info.reloc_rows= share->min_rows; + create_info.with_auto_increment= share->next_number_key_offset == 0; + create_info.auto_increment= (ha_create_info->auto_increment_value ? + ha_create_info->auto_increment_value -1 : + (ulonglong) 0); + create_info.data_file_length= ((ulonglong) share->max_rows * + share->avg_row_length); + create_info.data_file_name= ha_create_info->data_file_name; + create_info.index_file_name= ha_create_info->index_file_name; + create_info.language= share->table_charset->number; + if (ht != maria_hton) + { + /* S3 engine */ + create_info.s3_block_size= (ulong) table_options->s3_block_size; + create_info.compression_algorithm= table_options->compression_algorithm; + } + + /* + Table is transactional: + - If the user specify that table is transactional (in this case + row type is forced to BLOCK_RECORD) + - If they specify BLOCK_RECORD without specifying transactional behaviour + + Shouldn't this test be pushed down to maria_create()? Because currently, + ma_test1 -T crashes: it creates a table with DYNAMIC_RECORD but has + born_transactional==1, which confuses some recovery-related code. 
+ */ + create_info.transactional= (row_type == BLOCK_RECORD && + ha_create_info->transactional != HA_CHOICE_NO); + + if (ha_create_info->tmp_table()) + { + create_flags|= HA_CREATE_TMP_TABLE | HA_CREATE_DELAY_KEY_WRITE; + create_info.transactional= 0; + } + if (ha_create_info->options & HA_CREATE_KEEP_FILES) + create_flags|= HA_CREATE_KEEP_FILES; + if (options & HA_OPTION_PACK_RECORD) + create_flags|= HA_PACK_RECORD; + if (options & HA_OPTION_CHECKSUM) + create_flags|= HA_CREATE_CHECKSUM; + if (options & HA_OPTION_DELAY_KEY_WRITE) + create_flags|= HA_CREATE_DELAY_KEY_WRITE; + if ((ha_create_info->page_checksum == HA_CHOICE_UNDEF && + maria_page_checksums) || + ha_create_info->page_checksum == HA_CHOICE_YES) + create_flags|= HA_CREATE_PAGE_CHECKSUM; + + (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), thd->query_length()); + + create_info.encrypted= maria_encrypt_tables && ht == maria_hton; + /* TODO: Check that the following fn_format is really needed */ + error= + maria_create(fn_format(buff, name, "", "", + MY_UNPACK_FILENAME | MY_APPEND_EXT), + row_type, share->keys, keydef, + record_count, recinfo, + 0, (MARIA_UNIQUEDEF *) 0, + &create_info, create_flags); + + my_free(recinfo); + DBUG_RETURN(error); +} + + +int ha_maria::rename_table(const char *from, const char *to) +{ + THD *thd= current_thd; + (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY, + (uchar*) thd->query(), thd->query_length()); + return maria_rename(from, to); +} + + +void ha_maria::get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values) +{ + ulonglong nr; + int error; + uchar key[MARIA_MAX_KEY_BUFF]; + enum ha_rkey_function search_flag= HA_READ_PREFIX_LAST; + + if (!table->s->next_number_key_offset) + { // Autoincrement at key-start + ha_maria::info(HA_STATUS_AUTO); + *first_value= stats.auto_increment_value; + /* Maria has only table-level lock for now, so 
reserves to +inf */ + *nb_reserved_values= ULONGLONG_MAX; + return; + } + + /* it's safe to call the following if bulk_insert isn't on */ + maria_flush_bulk_insert(file, table->s->next_number_index); + + if (unlikely(table->key_info[table->s->next_number_index]. + key_part[table->s->next_number_keypart].key_part_flag & + HA_REVERSE_SORT)) + search_flag= HA_READ_KEY_EXACT; + + (void) extra(HA_EXTRA_KEYREAD); + key_copy(key, table->record[0], + table->key_info + table->s->next_number_index, + table->s->next_number_key_offset); + error= maria_rkey(file, table->record[1], (int) table->s->next_number_index, + key, make_prev_keypart_map(table->s->next_number_keypart), + search_flag); + if (error) + nr= 1; + else + { + /* Get data from record[1] */ + nr= ((ulonglong) table->next_number_field-> + val_int_offset(table->s->rec_buff_length) + 1); + } + extra(HA_EXTRA_NO_KEYREAD); + *first_value= nr; + /* + MySQL needs to call us for next row: assume we are inserting ("a",null) + here, we return 3, and next this statement will want to insert ("b",null): + there is no reason why ("b",3+1) would be the good row to insert: maybe it + already exists, maybe 3+1 is too large... + */ + *nb_reserved_values= 1; +} + + +/* + Find out how many rows there is in the given range + + SYNOPSIS + records_in_range() + inx Index to use + min_key Start of range. Null pointer if from first key + max_key End of range. Null pointer if to last key + pages Store first and last page for the range in case of + b-trees. In other cases it's not touched. + + NOTES + min_key.flag can have one of the following values: + HA_READ_KEY_EXACT Include the key in the range + HA_READ_AFTER_KEY Don't include key in range + + max_key.flag can have one of the following values: + HA_READ_BEFORE_KEY Don't include key in range + HA_READ_AFTER_KEY Include all 'end_key' values in the range + + RETURN + HA_POS_ERROR Something is wrong with the index tree. 
+ 0 There is no matching keys in the given range + number > 0 There is approximately 'number' matching rows in + the range. +*/ + +ha_rows ha_maria::records_in_range(uint inx, const key_range *min_key, + const key_range *max_key, page_range *pages) +{ + register_handler(file); + return (ha_rows) maria_records_in_range(file, (int) inx, min_key, max_key, + pages); +} + + +FT_INFO *ha_maria::ft_init_ext(uint flags, uint inx, String * key) +{ + return maria_ft_init_search(flags, file, inx, + (uchar *) key->ptr(), key->length(), + key->charset(), table->record[0]); +} + + +int ha_maria::ft_read(uchar * buf) +{ + int error; + + if (!ft_handler) + return -1; + + register_handler(file); + + thread_safe_increment(table->in_use->status_var.ha_read_next_count, + &LOCK_status); // why ? + + error= ft_handler->please->read_next(ft_handler, (char*) buf); + + return error; +} + + +bool ha_maria::check_if_incompatible_data(HA_CREATE_INFO *create_info, + uint table_changes) +{ + DBUG_ENTER("check_if_incompatible_data"); + uint options= table->s->db_options_in_use; + enum ha_choice page_checksum= table->s->page_checksum; + + if (page_checksum == HA_CHOICE_UNDEF) + page_checksum= file->s->options & HA_OPTION_PAGE_CHECKSUM ? 
HA_CHOICE_YES + : HA_CHOICE_NO; + + if (create_info->auto_increment_value != stats.auto_increment_value || + create_info->data_file_name != data_file_name || + create_info->index_file_name != index_file_name || + create_info->page_checksum != page_checksum || + create_info->transactional != table->s->transactional || + (maria_row_type(create_info) != data_file_type && + create_info->row_type != ROW_TYPE_DEFAULT) || + table_changes == IS_EQUAL_NO || + (table_changes & IS_EQUAL_PACK_LENGTH)) // Not implemented yet + DBUG_RETURN(COMPATIBLE_DATA_NO); + + if ((options & (HA_OPTION_CHECKSUM | + HA_OPTION_DELAY_KEY_WRITE)) != + (create_info->table_options & (HA_OPTION_CHECKSUM | + HA_OPTION_DELAY_KEY_WRITE))) + DBUG_RETURN(COMPATIBLE_DATA_NO); + DBUG_RETURN(COMPATIBLE_DATA_YES); +} + + +static int maria_hton_panic(handlerton *hton, ha_panic_function flag) +{ + /* If no background checkpoints, we need to do one now */ + int ret=0; + + if (!checkpoint_interval && !aria_readonly) + ret= ma_checkpoint_execute(CHECKPOINT_FULL, FALSE); + + ret|= maria_panic(flag); + + maria_hton= 0; + return ret; +} + + +static int maria_commit(handlerton *hton __attribute__ ((unused)), + THD *thd, bool all) +{ + TRN *trn= THD_TRN; + int res= 0; + MARIA_HA *used_instances; + DBUG_ENTER("maria_commit"); + + /* No commit inside lock_tables() */ + if ((!trn || + thd->locked_tables_mode == LTM_LOCK_TABLES || + thd->locked_tables_mode == LTM_PRELOCKED_UNDER_LOCK_TABLES)) + DBUG_RETURN(0); + + /* statement or transaction ? 
*/ + if ((thd->variables.option_bits & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && + !all) + DBUG_RETURN(0); // end of statement + + used_instances= (MARIA_HA*) trn->used_instances; + trnman_reset_locked_tables(trn, 0); + trnman_set_flags(trn, trnman_get_flags(trn) & ~TRN_STATE_INFO_LOGGED); + trn->used_instances= 0; + if (ma_commit(trn)) + res= HA_ERR_COMMIT_ERROR; + reset_thd_trn(thd, used_instances); + thd_set_ha_data(thd, maria_hton, 0); + DBUG_RETURN(res); +} + +#ifdef MARIA_CANNOT_ROLLBACK +static int maria_rollback(handlerton *hton, THD *thd, bool all) +{ + TRN *trn= THD_TRN; + DBUG_ENTER("maria_rollback"); + if (!trn) + DBUG_RETURN(0); + if (trn->undo_lsn) + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, + ER_DATA_WAS_COMMITED_UNDER_ROLLBACK, + ER_THD(thd, ER_DATA_WAS_COMMITED_UNDER_ROLLBACK), + "Aria"); + if (all) + DBUG_RETURN(maria_commit(hton, thd, all)); + /* Statement rollbacks are ignored. Commit will happen in external_lock */ + DBUG_RETURN(0); +} + +#else + +static int maria_rollback(handlerton *hton __attribute__ ((unused)), + THD *thd, bool all) +{ + TRN *trn= THD_TRN; + DBUG_ENTER("maria_rollback"); + + DBUG_ASSERT(trnman_has_locked_tables(trn) == 0); + trnman_reset_locked_tables(trn, 0); + /* statement or transaction ? */ + if ((thd->variables.option_bits & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) && + !all) + { + trnman_rollback_statement(trn); + DBUG_RETURN(0); // end of statement + } + reset_thd_trn(thd, (MARIA_HA*) trn->used_instances); + DBUG_RETURN(trnman_rollback_trn(trn) ? 
+ HA_ERR_OUT_OF_MEM : 0); // end of transaction +} +#endif /* MARIA_CANNOT_ROLLBACK */ + + +/** + @brief flush log handler + + @param hton maria handlerton (unused) + + @retval FALSE OK + @retval TRUE Error +*/ + +bool maria_flush_logs(handlerton *hton) +{ + return MY_TEST(translog_purge_at_flush()); +} + + +int maria_checkpoint_state(handlerton *hton, bool disabled) +{ + maria_checkpoint_disabled= (my_bool) disabled; + return 0; +} + + +/* + Handle backup calls +*/ + +void maria_prepare_for_backup() +{ + translog_disable_purge(); +} + +void maria_end_backup() +{ + translog_enable_purge(); +} + + + +#define SHOW_MSG_LEN (FN_REFLEN + 20) +/** + @brief show status handler + + @param hton maria handlerton + @param thd thread handler + @param print print function + @param stat type of status +*/ + +bool maria_show_status(handlerton *hton, + THD *thd, + stat_print_fn *print, + enum ha_stat_type stat) +{ + const LEX_CSTRING *engine_name= hton_name(hton); + switch (stat) { + case HA_ENGINE_LOGS: + { + TRANSLOG_ADDRESS horizon= translog_get_horizon(); + uint32 last_file= LSN_FILE_NO(horizon); + uint32 first_needed= translog_get_first_needed_file(); + uint32 first_file= translog_get_first_file(horizon); + uint32 i; + const char unknown[]= "unknown"; + const char needed[]= "in use"; + const char unneeded[]= "free"; + char path[FN_REFLEN]; + + if (first_file == 0) + { + const char error[]= "error"; + print(thd, engine_name->str, engine_name->length, + STRING_WITH_LEN(""), error, sizeof(error) - 1); + break; + } + + for (i= first_file; i <= last_file; i++) + { + char *file; + const char *status; + size_t length, status_len; + MY_STAT stat_buff, *stat; + const char error[]= "can't stat"; + char object[SHOW_MSG_LEN]; + file= translog_filename_by_fileno(i, path); + if (!(stat= mysql_file_stat(key_file_translog, file, &stat_buff, MYF(0)))) + { + status= error; + status_len= sizeof(error) - 1; + length= my_snprintf(object, SHOW_MSG_LEN, "Size unknown ; %s", file); + } + else + { + 
if (first_needed == 0) + { + status= unknown; + status_len= sizeof(unknown) - 1; + } + else if (i < first_needed) + { + status= unneeded; + status_len= sizeof(unneeded) - 1; + } + else + { + status= needed; + status_len= sizeof(needed) - 1; + } + length= my_snprintf(object, SHOW_MSG_LEN, "Size %12llu ; %s", + (ulonglong) stat->st_size, file); + } + + print(thd, engine_name->str, engine_name->length, + object, length, status, status_len); + } + break; + } + case HA_ENGINE_STATUS: + case HA_ENGINE_MUTEX: + default: + break; + } + return 0; +} + + +/** + Callback to delete all logs in directory. This is lower-level than other + functions in ma_loghandler.c which delete logs, as it does not rely on + translog_init() having been called first. + + @param directory directory where file is + @param filename base name of the file to delete +*/ + +static my_bool translog_callback_delete_all(const char *directory, + const char *filename) +{ + char complete_name[FN_REFLEN]; + fn_format(complete_name, filename, directory, "", MYF(MY_UNPACK_FILENAME)); + return mysql_file_delete(key_file_translog, complete_name, MYF(MY_WME)); +} + + +/** + Helper function for option aria-force-start-after-recovery-failures. + Deletes logs if too many failures. Otherwise, increments the counter of + failures in the control file. + Notice how this has to be called _before_ translog_init() (if log is + corrupted, translog_init() might crash the server, so we need to remove logs + before). + + @param log_dir directory where logs to be deleted are +*/ + +static int mark_recovery_start(const char* log_dir) +{ + int res; + DBUG_ENTER("mark_recovery_start"); + if (!(maria_recover_options & HA_RECOVER_ANY)) + ma_message_no_user(ME_WARNING, "Please consider using option" + " --aria-recover-options[=...] 
to automatically check and" + " repair tables when logs are removed by option" + " --aria-force-start-after-recovery-failures=#"); + if (recovery_failures >= force_start_after_recovery_failures) + { + /* + Remove logs which cause the problem; keep control file which has + critical info like uuid, max_trid (removing control file may make + correct tables look corrupted!). + */ + char msg[100]; + res= translog_walk_filenames(log_dir, &translog_callback_delete_all); + my_snprintf(msg, sizeof(msg), + "%s logs after %u consecutive failures of" + " recovery from logs", + (res ? "failed to remove some" : "removed all"), + recovery_failures); + ma_message_no_user((res ? 0 : ME_WARNING), msg); + } + else + res= ma_control_file_write_and_force(last_checkpoint_lsn, last_logno, + max_trid_in_control_file, + recovery_failures + 1); + DBUG_RETURN(res); +} + + +/** + Helper function for option aria-force-start-after-recovery-failures. + Records in the control file that recovery was a success, so that it's not + counted for aria-force-start-after-recovery-failures. +*/ + +static int mark_recovery_success(void) +{ + /* success of recovery, reset recovery_failures: */ + int res; + DBUG_ENTER("mark_recovery_success"); + res= ma_control_file_write_and_force(last_checkpoint_lsn, last_logno, + max_trid_in_control_file, 0); + DBUG_RETURN(res); +} + + +/* + Return 1 if table has changed during the current transaction +*/ + +bool ha_maria::is_changed() const +{ + return file->state->changed; +} + + +static int ha_maria_init(void *p) +{ + int res= 0, tmp; + const char *log_dir= maria_data_root; + + /* + If aria_readonly is set, then we don't run recovery and we don't allow + opening of tables that are crashed. 
Used by mysqld --help + */ + if ((aria_readonly= opt_help != 0)) + { + maria_recover_options= 0; + checkpoint_interval= 0; + } + +#ifdef HAVE_PSI_INTERFACE + init_aria_psi_keys(); +#endif + + maria_hton= (handlerton *)p; + maria_hton->db_type= DB_TYPE_ARIA; + maria_hton->create= maria_create_handler; + maria_hton->panic= maria_hton_panic; + maria_hton->tablefile_extensions= ha_maria_exts; + maria_hton->commit= maria_commit; + maria_hton->rollback= maria_rollback; + maria_hton->checkpoint_state= maria_checkpoint_state; + maria_hton->flush_logs= maria_flush_logs; + maria_hton->show_status= maria_show_status; + maria_hton->prepare_for_backup= maria_prepare_for_backup; + maria_hton->end_backup= maria_end_backup; + + /* TODO: decide if we support Maria being used for log tables */ + maria_hton->flags= (HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES | + HTON_NO_ROLLBACK | + HTON_TRANSACTIONAL_AND_NON_TRANSACTIONAL); + bzero(maria_log_pagecache, sizeof(*maria_log_pagecache)); + maria_tmpdir= &mysql_tmpdir_list; /* For REDO */ + ma_debug_crash_here= maria_debug_crash_here; + + if (!aria_readonly) + res= maria_upgrade(); + res= res || maria_init(); + tmp= ma_control_file_open(!aria_readonly, !aria_readonly, !aria_readonly); + res= res || aria_readonly ? 
tmp == CONTROL_FILE_LOCKED : tmp != 0; + res= res || + ((force_start_after_recovery_failures != 0 && !aria_readonly) && + mark_recovery_start(log_dir)) || + !init_pagecache(maria_pagecache, + (size_t) pagecache_buffer_size, pagecache_division_limit, + pagecache_age_threshold, maria_block_size, pagecache_file_hash_size, + 0) || + !init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, 0) || + (!aria_readonly && + translog_init(maria_data_root, log_file_size, + MYSQL_VERSION_ID, server_id, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0)) || + (!aria_readonly && + (maria_recovery_from_log() || + ((force_start_after_recovery_failures != 0 || + maria_recovery_changed_data || recovery_failures) && + mark_recovery_success()))) || + (aria_readonly && trnman_init(MAX_INTERNAL_TRID-16)) || + ma_checkpoint_init(checkpoint_interval); + maria_multi_threaded= maria_in_ha_maria= TRUE; + maria_create_trn_hook= maria_create_trn_for_mysql; + maria_pagecache->extra_debug= 1; + maria_assert_if_crashed_table= debug_assert_if_crashed_table; + + if (res) + { + maria_hton= 0; + maria_panic(HA_PANIC_CLOSE); + } + + ma_killed= ma_killed_in_mariadb; + if (res) + maria_panic(HA_PANIC_CLOSE); + + return res ? HA_ERR_INITIALIZATION : 0; +} + + +#ifdef HAVE_QUERY_CACHE +/** + @brief Register a named table with a call back function to the query cache. + + @param thd The thread handle + @param table_key A pointer to the table name in the table cache + @param key_length The length of the table name + @param[out] engine_callback The pointer to the storage engine call back + function, currently 0 + @param[out] engine_data Engine data will be set to 0. + + @note Despite the name of this function, it is used to check each statement + before it is cached and not to register a table or callback function. + + @see handler::register_query_cache_table + + @return The error code. The engine_data and engine_callback will be set to 0. 
+ @retval TRUE Success + @retval FALSE An error occurred +*/ + +my_bool ha_maria::register_query_cache_table(THD *thd, const char *table_name, + uint table_name_len, + qc_engine_callback + *engine_callback, + ulonglong *engine_data) +{ + ulonglong actual_data_file_length; + ulonglong current_data_file_length; + DBUG_ENTER("ha_maria::register_query_cache_table"); + + /* + No call back function is needed to determine if a cached statement + is valid or not. + */ + *engine_callback= 0; + + /* + No engine data is needed. + */ + *engine_data= 0; + + if (file->s->now_transactional && file->s->have_versioning) + DBUG_RETURN(file->trn->trid >= file->s->state.last_change_trn); + + /* + If a concurrent INSERT has happened just before the currently processed + SELECT statement, the total size of the table is unknown. + + To determine if the table size is known, the current thread's snap shot of + the table size with the actual table size are compared. + + If the table size is unknown the SELECT statement can't be cached. + */ + + /* + POSIX visibility rules specify that "2. Whatever memory values a + thread can see when it unlocks a mutex <...> can also be seen by any + thread that later locks the same mutex". In this particular case, + concurrent insert thread had modified the data_file_length in + MYISAM_SHARE before it has unlocked (or even locked) + structure_guard_mutex. So, here we're guaranteed to see at least that + value after we've locked the same mutex. We can see a later value + (modified by some other thread) though, but it's ok, as we only want + to know if the variable was changed, the actual new value doesn't matter + */ + actual_data_file_length= file->s->state.state.data_file_length; + current_data_file_length= file->state->data_file_length; + + /* Return whether is ok to try to cache current statement. 
*/ + DBUG_RETURN(!(file->s->non_transactional_concurrent_insert && + current_data_file_length != actual_data_file_length)); +} +#endif + +static struct st_mysql_sys_var *system_variables[]= { + MYSQL_SYSVAR(block_size), + MYSQL_SYSVAR(checkpoint_interval), + MYSQL_SYSVAR(checkpoint_log_activity), + MYSQL_SYSVAR(force_start_after_recovery_failures), + MYSQL_SYSVAR(group_commit), + MYSQL_SYSVAR(group_commit_interval), + MYSQL_SYSVAR(log_dir_path), + MYSQL_SYSVAR(log_file_size), + MYSQL_SYSVAR(log_purge_type), + MYSQL_SYSVAR(max_sort_file_size), + MYSQL_SYSVAR(page_checksum), + MYSQL_SYSVAR(pagecache_age_threshold), + MYSQL_SYSVAR(pagecache_buffer_size), + MYSQL_SYSVAR(pagecache_division_limit), + MYSQL_SYSVAR(pagecache_file_hash_size), + MYSQL_SYSVAR(recover_options), + MYSQL_SYSVAR(repair_threads), + MYSQL_SYSVAR(sort_buffer_size), + MYSQL_SYSVAR(stats_method), + MYSQL_SYSVAR(sync_log_dir), + MYSQL_SYSVAR(used_for_temp_tables), + MYSQL_SYSVAR(encrypt_tables), + NULL +}; + + +/** + @brief Updates the checkpoint interval and restarts the background thread. 
+*/ + +static void update_checkpoint_interval(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + ma_checkpoint_end(); + ma_checkpoint_init(*(ulong *)var_ptr= (ulong)(*(long *)save)); +} + + +/** + @brief Updates group commit mode +*/ + +static void update_maria_group_commit(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + ulong value= (ulong)*((long *)var_ptr); + DBUG_ENTER("update_maria_group_commit"); + DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu", + value, (ulong)(*(long *)save), + maria_group_commit_interval)); + /* old value */ + switch (value) { + case TRANSLOG_GCOMMIT_NONE: + break; + case TRANSLOG_GCOMMIT_HARD: + translog_hard_group_commit(FALSE); + break; + case TRANSLOG_GCOMMIT_SOFT: + translog_soft_sync(FALSE); + if (maria_group_commit_interval) + translog_soft_sync_end(); + break; + default: + DBUG_ASSERT(0); /* impossible */ + } + value= *(ulong *)var_ptr= (ulong)(*(long *)save); + translog_sync(); + /* new value */ + switch (value) { + case TRANSLOG_GCOMMIT_NONE: + break; + case TRANSLOG_GCOMMIT_HARD: + translog_hard_group_commit(TRUE); + break; + case TRANSLOG_GCOMMIT_SOFT: + translog_soft_sync(TRUE); + /* variable change made under global lock so we can just read it */ + if (maria_group_commit_interval) + translog_soft_sync_start(); + break; + default: + DBUG_ASSERT(0); /* impossible */ + } + DBUG_VOID_RETURN; +} + +/** + @brief Updates group commit interval +*/ + +static void update_maria_group_commit_interval(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + ulong new_value= (ulong)*((long *)save); + ulong *value_ptr= (ulong*) var_ptr; + DBUG_ENTER("update_maria_group_commit_interval"); + DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu", + *value_ptr, new_value, maria_group_commit)); + + /* variable change made under global lock so we can just read it */ + switch (maria_group_commit) { + case 
TRANSLOG_GCOMMIT_NONE: + *value_ptr= new_value; + translog_set_group_commit_interval(new_value); + break; + case TRANSLOG_GCOMMIT_HARD: + *value_ptr= new_value; + translog_set_group_commit_interval(new_value); + break; + case TRANSLOG_GCOMMIT_SOFT: + if (*value_ptr) + translog_soft_sync_end(); + translog_set_group_commit_interval(new_value); + if ((*value_ptr= new_value)) + translog_soft_sync_start(); + break; + default: + DBUG_ASSERT(0); /* impossible */ + } + DBUG_VOID_RETURN; +} + +/** + @brief Updates the transaction log file limit. +*/ + +static void update_log_file_size(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + uint32 size= (uint32)((ulong)(*(long *)save)); + translog_set_file_size(size); + *(ulong *)var_ptr= size; +} + + +static SHOW_VAR status_variables[]= { + {"pagecache_blocks_not_flushed", (char*) &maria_pagecache_var.global_blocks_changed, SHOW_LONG}, + {"pagecache_blocks_unused", (char*) &maria_pagecache_var.blocks_unused, SHOW_LONG}, + {"pagecache_blocks_used", (char*) &maria_pagecache_var.blocks_used, SHOW_LONG}, + {"pagecache_read_requests", (char*) &maria_pagecache_var.global_cache_r_requests, SHOW_LONGLONG}, + {"pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG}, + {"pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG}, + {"pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG}, + {"transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG}, + {NullS, NullS, SHOW_LONG} +}; + +/**************************************************************************** + * Maria MRR implementation: use DS-MRR + ***************************************************************************/ + +int ha_maria::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, + uint n_ranges, uint mode, + HANDLER_BUFFER *buf) +{ + return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf); +} + +int 
ha_maria::multi_range_read_next(range_id_t *range_info) +{ + return ds_mrr.dsmrr_next(range_info); +} + +ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, + uint n_ranges, uint *bufsz, + uint *flags, Cost_estimate *cost) +{ + /* + This call is here because there is no location where this->table would + already be known. + TODO: consider moving it into some per-query initialization call. + */ + ds_mrr.init(this, table); + return ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges, bufsz, + flags, cost); +} + +ha_rows ha_maria::multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint key_parts, uint *bufsz, + uint *flags, Cost_estimate *cost) +{ + ds_mrr.init(this, table); + return ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, flags, cost); +} + +int ha_maria::multi_range_read_explain_info(uint mrr_mode, char *str, + size_t size) +{ + return ds_mrr.dsmrr_explain_info(mrr_mode, str, size); +} +/* MyISAM MRR implementation ends */ + + +/* Index condition pushdown implementation*/ + + +Item *ha_maria::idx_cond_push(uint keyno_arg, Item* idx_cond_arg) +{ + /* + Check if the key contains a blob field. If it does then MyISAM + should not accept the pushed index condition since MyISAM will not + read the blob field from the index entry during evaluation of the + pushed index condition and the BLOB field might be part of the + range evaluation done by the ICP code. 
+ */ + const KEY *key= &table_share->key_info[keyno_arg]; + + for (uint k= 0; k < key->user_defined_key_parts; ++k) + { + const KEY_PART_INFO *key_part= &key->key_part[k]; + if (key_part->key_part_flag & HA_BLOB_PART) + { + /* Let the server handle the index condition */ + return idx_cond_arg; + } + } + + pushed_idx_cond_keyno= keyno_arg; + pushed_idx_cond= idx_cond_arg; + in_range_check_pushed_down= TRUE; + if (active_index == pushed_idx_cond_keyno) + ma_set_index_cond_func(file, handler_index_cond_check, this); + return NULL; +} + +/** + Find record by unique constrain (used in temporary tables) + + @param record (IN|OUT) the record to find + @param constrain_no (IN) number of constrain (for this engine) + + @note It is like hp_search but uses function for raw where hp_search + uses functions for index. + + @retval 0 OK + @retval 1 Not found + @retval -1 Error +*/ + +int ha_maria::find_unique_row(uchar *record, uint constrain_no) +{ + int rc; + register_handler(file); + if (file->s->state.header.uniques) + { + DBUG_ASSERT(file->s->state.header.uniques > constrain_no); + MARIA_UNIQUEDEF *def= file->s->uniqueinfo + constrain_no; + ha_checksum unique_hash= _ma_unique_hash(def, record); + rc= _ma_check_unique(file, def, record, unique_hash, HA_OFFSET_ERROR); + if (rc) + { + file->cur_row.lastpos= file->dup_key_pos; + if ((*file->read_record)(file, record, file->cur_row.lastpos)) + return -1; + file->update|= HA_STATE_AKTIV; /* Record is read */ + } + // invert logic + rc= !MY_TEST(rc); + } + else + { + /* + It is case when just unique index used instead unicue constrain + (conversion from heap table). 
+ */ + DBUG_ASSERT(file->s->state.header.keys > constrain_no); + MARIA_KEY key; + file->once_flags|= USE_PACKED_KEYS; + (*file->s->keyinfo[constrain_no].make_key) + (file, &key, constrain_no, file->lastkey_buff2, record, 0, 0); + rc= maria_rkey(file, record, constrain_no, key.data, key.data_length, + HA_READ_KEY_EXACT); + rc= MY_TEST(rc); + } + return rc; +} + + +/** + Check if a table needs to be repaired +*/ + +int ha_maria::check_for_upgrade(HA_CHECK_OPT *check) +{ + if (table->s->mysql_version && table->s->mysql_version <= 100509 && + (file->s->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED)) + { + /* + Encrypted tables before 10.5.9 had a bug where LSN was not + stored on the pages. These must be repaired! + */ + return HA_ADMIN_NEEDS_ALTER; + } + return HA_ADMIN_OK; +} + + +struct st_mysql_storage_engine maria_storage_engine= +{ MYSQL_HANDLERTON_INTERFACE_VERSION }; + +maria_declare_plugin(aria) +{ + MYSQL_STORAGE_ENGINE_PLUGIN, + &maria_storage_engine, + "Aria", + "MariaDB Corporation Ab", + "Crash-safe tables with MyISAM heritage. Used for internal temporary tables and privilege tables", + PLUGIN_LICENSE_GPL, + ha_maria_init, /* Plugin Init */ + NULL, /* Plugin Deinit */ + 0x0105, /* 1.5 */ + status_variables, /* status variables */ + system_variables, /* system variables */ + "1.5", /* string version */ + MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ +} +maria_declare_plugin_end; diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h new file mode 100644 index 00000000..2b8b5dc9 --- /dev/null +++ b/storage/maria/ha_maria.h @@ -0,0 +1,200 @@ +#ifndef HA_MARIA_INCLUDED +#define HA_MARIA_INCLUDED +/* Copyright (C) 2006, 2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2009, 2020, MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifdef USE_PRAGMA_INTERFACE +#pragma interface /* gcc class implementation */ +#endif + +/* class for the maria handler */ + +#include "maria_def.h" +#include "handler.h" +#include "table.h" + +#define HA_RECOVER_NONE 0 /* No automatic recover */ +#define HA_RECOVER_DEFAULT 1 /* Automatic recover active */ +#define HA_RECOVER_BACKUP 2 /* Make a backupfile on recover */ +#define HA_RECOVER_FORCE 4 /* Recover even if we lose rows */ +#define HA_RECOVER_QUICK 8 /* Don't check rows in data file */ + +C_MODE_START +check_result_t index_cond_func_maria(void *arg); +C_MODE_END + +extern TYPELIB maria_recover_typelib; +extern ulonglong maria_recover_options; + +/* + In the ha_maria class there are a few virtual methods that are not marked as + 'final'. This is because they are re-defined by the ha_s3 engine. +*/ + +class __attribute__((visibility("default"))) ha_maria :public handler +{ +public: + MARIA_HA *file; +private: + ulonglong int_table_flags; + MARIA_RECORD_POS remember_pos; + char *data_file_name, *index_file_name; + enum data_file_type data_file_type; + bool can_enable_indexes; + /** + If a transactional table is doing bulk insert with a single + UNDO_BULK_INSERT with/without repair. 
+ */ + uint8 bulk_insert_single_undo; + int repair(THD * thd, HA_CHECK *param, bool optimize); + int zerofill(THD * thd, HA_CHECK_OPT *check_opt); + +public: + ha_maria(handlerton *hton, TABLE_SHARE * table_arg); + ~ha_maria() = default; + handler *clone(const char *name, MEM_ROOT *mem_root) override final; + const char *index_type(uint key_number) override final; + ulonglong table_flags() const override final + { return int_table_flags; } + ulong index_flags(uint inx, uint part, bool all_parts) const override final; + uint max_supported_keys() const override final + { return MARIA_MAX_KEY; } + uint max_supported_key_length() const override final; + uint max_supported_key_part_length() const override final + { return max_supported_key_length(); } + enum row_type get_row_type() const override final; + void change_table_ptr(TABLE *table_arg, TABLE_SHARE *share) override final; + virtual double scan_time() override final; + + int open(const char *name, int mode, uint test_if_locked) override; + int close(void) override final; + int write_row(const uchar * buf) override; + int update_row(const uchar * old_data, const uchar * new_data) override; + int delete_row(const uchar * buf) override; + int index_read_map(uchar * buf, const uchar * key, key_part_map keypart_map, + enum ha_rkey_function find_flag) override final; + int index_read_idx_map(uchar * buf, uint idx, const uchar * key, + key_part_map keypart_map, + enum ha_rkey_function find_flag) override final; + int index_read_last_map(uchar * buf, const uchar * key, + key_part_map keypart_map) override final; + int index_next(uchar * buf) override final; + int index_prev(uchar * buf) override final; + int index_first(uchar * buf) override final; + int index_last(uchar * buf) override final; + int index_next_same(uchar * buf, const uchar * key, uint keylen) override final; + int ft_init() override final + { + if (!ft_handler) + return 1; + ft_handler->please->reinit_search(ft_handler); + return 0; + } + FT_INFO 
*ft_init_ext(uint flags, uint inx, String * key) override final; + int ft_read(uchar * buf) override final; + int index_init(uint idx, bool sorted) override final; + int index_end() override final; + int rnd_init(bool scan) override final; + int rnd_end(void) override final; + int rnd_next(uchar * buf) override final; + int rnd_pos(uchar * buf, uchar * pos) override final; + int remember_rnd_pos() override final; + int restart_rnd_next(uchar * buf) override final; + void position(const uchar * record) override final; + int info(uint) override final; + int info(uint, my_bool); + int extra(enum ha_extra_function operation) override final; + int extra_opt(enum ha_extra_function operation, ulong cache_size) override final; + int reset(void) override final; + int external_lock(THD * thd, int lock_type) override; + int start_stmt(THD *thd, thr_lock_type lock_type) override final; + int delete_all_rows(void) override final; + int disable_indexes(uint mode) override final; + int enable_indexes(uint mode) override final; + int indexes_are_disabled(void) override final; + void start_bulk_insert(ha_rows rows, uint flags) override final; + int end_bulk_insert() override final; + ha_rows records_in_range(uint inx, const key_range *min_key, + const key_range *max_key, + page_range *pages) override final; + void update_create_info(HA_CREATE_INFO * create_info) override final; + int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info) override; + THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to, + enum thr_lock_type lock_type) override final; + virtual void get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values) override final; + int rename_table(const char *from, const char *to) override; + int delete_table(const char *name) override; + void drop_table(const char *name) override; + int check(THD * thd, HA_CHECK_OPT * check_opt) override; + int analyze(THD * thd, 
HA_CHECK_OPT * check_opt) override; + int repair(THD * thd, HA_CHECK_OPT * check_opt) override; + int check_for_upgrade(HA_CHECK_OPT *check_opt) override; + bool check_and_repair(THD * thd) override final; + bool is_crashed() const override final; + bool is_changed() const; + bool auto_repair(int error) const override final; + int optimize(THD * thd, HA_CHECK_OPT * check_opt) override final; + int assign_to_keycache(THD * thd, HA_CHECK_OPT * check_opt) override final; + int preload_keys(THD * thd, HA_CHECK_OPT * check_opt) override; + bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) override final; +#ifdef HAVE_QUERY_CACHE + my_bool register_query_cache_table(THD *thd, const char *table_key, + uint key_length, + qc_engine_callback + *engine_callback, + ulonglong *engine_data) override final; +#endif + MARIA_HA *file_ptr(void) + { + return file; + } + static bool has_active_transaction(THD *thd); + static int implicit_commit(THD *thd, bool new_trn); + /** + * Multi Range Read interface + */ + int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, + uint n_ranges, uint mode, HANDLER_BUFFER *buf) override final; + int multi_range_read_next(range_id_t *range_info) override final; + ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, + uint n_ranges, uint *bufsz, + uint *flags, Cost_estimate *cost) override final; + ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint key_parts, uint *bufsz, + uint *flags, Cost_estimate *cost) override final; + int multi_range_read_explain_info(uint mrr_mode, char *str, size_t size) override final; + + /* Index condition pushdown implementation */ + Item *idx_cond_push(uint keyno, Item* idx_cond) override final; + + int find_unique_row(uchar *record, uint unique_idx) override final; + + /* Following functions are needed by the S3 handler */ + virtual S3_INFO *s3_open_args() { return 0; } + virtual void register_handler(MARIA_HA *file) {} + 
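+private: + DsMrr_impl ds_mrr; + friend check_result_t index_cond_func_maria(void *arg); + friend void reset_thd_trn(THD *thd); + friend class ha_s3; +}; + +#endif /* HA_MARIA_INCLUDED */ diff --git a/storage/maria/ha_s3.cc b/storage/maria/ha_s3.cc new file mode 100644 index 00000000..8c105522 --- /dev/null +++ b/storage/maria/ha_s3.cc @@ -0,0 +1,1125 @@ +/* Copyright (C) 2019, 2021 MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the + Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA +*/ + +/* + Implementation of S3 storage engine. + + Storage format: + + The S3 engine is a read-only storage engine. The data is stored in + the same format as a non transactional Aria table in BLOCK_RECORD format. + This makes it easy to cache both index and rows in the page cache. + Data and index file are split into blocks of 's3_block_size', default + 4M. + + The table and its associated files are stored in S3 into the following + locations: + + frm file (for discovery): + aws_bucket/database/table/frm + + First index block (contains description of the Aria file): + aws_bucket/database/table/aria + + Rest of the index file: + aws_bucket/database/table/index/block_number + + Data file: + aws_bucket/database/table/data/block_number + + block_number is 6 digits decimal number, prefixed with 0 + (Can be larger than 6 digits, the prefix is just for nice output) + + frm and base blocks are small (just the needed data). + index and blocks are of size 's3_block_size' + + If compression is used, then original block size is s3_block_size + but the stored block will be the size of the compressed block. + + Implementation: + The s3 engine inherits from the ha_maria handler. + + It uses Aria code and relies on Aria being enabled. We don't have to check + that Aria is enabled though, because Aria is a mandatory plugin, and + the server will refuse to start if Aria failed to initialize. + + s3 will use its own page cache to not interfere with normal Aria + usage but also to ensure that the S3 page cache is large enough + (with a 4M s3_block_size the engine will need a large cache to work, + at least s3_block_size * 32). The default cache is 512M. +*/ + +#define MYSQL_SERVER 1 +#include <my_global.h> +#include <m_string.h> +#include "maria_def.h" +#include "sql_class.h" +#include <mysys_err.h> +#include <libmarias3/marias3.h> +#include <discover.h> +#include "ha_s3.h" +#include "s3_func.h" +#include "aria_backup.h" + +#define DEFAULT_AWS_HOST_NAME "s3.amazonaws.com" + +static PAGECACHE s3_pagecache; +static ulong s3_block_size, s3_protocol_version; +static ulong s3_pagecache_division_limit, s3_pagecache_age_threshold; +static ulong s3_pagecache_file_hash_size; +static ulonglong s3_pagecache_buffer_size; +static char *s3_bucket, *s3_access_key=0, *s3_secret_key=0, *s3_region; +static char *s3_host_name; +static int s3_port; +static my_bool s3_use_http; +static char *s3_tmp_access_key=0, *s3_tmp_secret_key=0; +static my_bool s3_debug= 0, s3_slave_ignore_updates= 0; +static my_bool s3_replicate_alter_as_create_select= 0; +handlerton *s3_hton= 0; + +/* Don't show access or secret keys to users if they exist */ + +static void update_access_key(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + my_free(s3_access_key); + s3_access_key= 0; + /* Don't show real key to user in SHOW VARIABLES */ + if (s3_tmp_access_key[0]) + { + s3_access_key= s3_tmp_access_key; + 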
+ index and blocks are of size 's3_block_size' + + If compression is used, then original block size is s3_block_size + but the stored block will be the size of the compressed block. + + Implementation: + The s3 engine inherits from the ha_maria handler. + + It uses Aria code and relies on Aria being enabled. We don't have to check + that Aria is enabled though, because Aria is a mandatory plugin, and + the server will refuse to start if Aria failed to initialize. + + s3 will use it's own page cache to not interfere with normal Aria + usage but also to ensure that the S3 page cache is large enough + (with a 4M s3_block_size the engine will need a large cache to work, + at least s3_block_size * 32. The default cache is 512M. +*/ + +#define MYSQL_SERVER 1 +#include <my_global.h> +#include <m_string.h> +#include "maria_def.h" +#include "sql_class.h" +#include <mysys_err.h> +#include <libmarias3/marias3.h> +#include <discover.h> +#include "ha_s3.h" +#include "s3_func.h" +#include "aria_backup.h" + +#define DEFAULT_AWS_HOST_NAME "s3.amazonaws.com" + +static PAGECACHE s3_pagecache; +static ulong s3_block_size, s3_protocol_version; +static ulong s3_pagecache_division_limit, s3_pagecache_age_threshold; +static ulong s3_pagecache_file_hash_size; +static ulonglong s3_pagecache_buffer_size; +static char *s3_bucket, *s3_access_key=0, *s3_secret_key=0, *s3_region; +static char *s3_host_name; +static int s3_port; +static my_bool s3_use_http; +static char *s3_tmp_access_key=0, *s3_tmp_secret_key=0; +static my_bool s3_debug= 0, s3_slave_ignore_updates= 0; +static my_bool s3_replicate_alter_as_create_select= 0; +handlerton *s3_hton= 0; + +/* Don't show access or secret keys to users if they exists */ + +static void update_access_key(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + my_free(s3_access_key); + s3_access_key= 0; + /* Don't show real key to user in SHOW VARIABLES */ + if (s3_tmp_access_key[0]) + { + s3_access_key= s3_tmp_access_key; + 
s3_tmp_access_key= my_strdup(PSI_NOT_INSTRUMENTED, "*****", MYF(MY_WME)); + } +} + +static void update_secret_key(MYSQL_THD thd, + struct st_mysql_sys_var *var, + void *var_ptr, const void *save) +{ + my_free(s3_secret_key); + s3_secret_key= 0; + /* Don't show real key to user in SHOW VARIABLES */ + if (s3_tmp_secret_key[0]) + { + s3_secret_key= s3_tmp_secret_key; + s3_tmp_secret_key= my_strdup(PSI_NOT_INSTRUMENTED, "*****", MYF(MY_WME)); + } +} + +/* Define system variables for S3 */ + +static MYSQL_SYSVAR_ULONG(block_size, s3_block_size, + PLUGIN_VAR_RQCMDARG, + "Block size for S3", 0, 0, + 4*1024*1024, 65536, 16*1024*1024, 8192); + +static MYSQL_SYSVAR_BOOL(debug, s3_debug, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Generates trace file from libmarias3 on stderr for debugging", + 0, 0, 0); + +static MYSQL_SYSVAR_BOOL(slave_ignore_updates, s3_slave_ignore_updates, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "If the slave has shares same S3 storage as the master", + 0, 0, 0); + +static MYSQL_SYSVAR_BOOL(replicate_alter_as_create_select, + s3_replicate_alter_as_create_select, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "When converting S3 table to local table, log all rows in binary log", + 0, 0, 1); + +static MYSQL_SYSVAR_ENUM(protocol_version, s3_protocol_version, + PLUGIN_VAR_RQCMDARG, + "Protocol used to communication with S3. One of " + "\"Auto\", \"Amazon\" or \"Original\".", + NULL, NULL, 0, &s3_protocol_typelib); + +static MYSQL_SYSVAR_ULONG(pagecache_age_threshold, + s3_pagecache_age_threshold, PLUGIN_VAR_RQCMDARG, + "This characterizes the number of hits a hot block has to be untouched " + "until it is considered aged enough to be downgraded to a warm block. 
" + "This specifies the percentage ratio of that number of hits to the " + "total number of blocks in the page cache.", 0, 0, + 300, 100, ~ (ulong) 0L, 100); + +static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, s3_pagecache_buffer_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The size of the buffer used for index blocks for S3 tables. " + "Increase this to get better index handling (for all reads and " + "multiple writes) to as much as you can afford.", 0, 0, + 128*1024*1024, 1024*1024*32, ~(ulonglong) 0, 8192); + +static MYSQL_SYSVAR_ULONG(pagecache_division_limit, + s3_pagecache_division_limit, + PLUGIN_VAR_RQCMDARG, + "The minimum percentage of warm blocks in key cache", 0, 0, + 100, 1, 100, 1); + +static MYSQL_SYSVAR_ULONG(pagecache_file_hash_size, + s3_pagecache_file_hash_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of hash buckets for open files. If you have a lot " + "of S3 files open you should increase this for faster flush of " + "changes. A good value is probably 1/10 of number of possible open " + "S3 files.", 0,0, 512, 32, 16384, 1); + +static MYSQL_SYSVAR_STR(bucket, s3_bucket, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "AWS bucket", + 0, 0, "MariaDB"); +static MYSQL_SYSVAR_STR(host_name, s3_host_name, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "AWS host name", + 0, 0, DEFAULT_AWS_HOST_NAME); +static MYSQL_SYSVAR_INT(port, s3_port, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Port number to connect to (0 means use default)", + NULL /*check*/, NULL /*update*/, 0 /*default*/, + 0 /*min*/, 65535 /*max*/, 1 /*blk*/); +static MYSQL_SYSVAR_BOOL(use_http, s3_use_http, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "If true, force use of HTTP protocol", + NULL /*check*/, NULL /*update*/, 0 /*default*/); +static MYSQL_SYSVAR_STR(access_key, s3_tmp_access_key, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_MEMALLOC, + "AWS access key", + 0, update_access_key, ""); +static MYSQL_SYSVAR_STR(secret_key, 
s3_tmp_secret_key, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_MEMALLOC, + "AWS secret key", + 0, update_secret_key, ""); +static MYSQL_SYSVAR_STR(region, s3_region, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "AWS region", + 0, 0, ""); + +ha_create_table_option s3_table_option_list[]= +{ + /* + one numeric option, with the default of UINT_MAX32, valid + range of values 0..UINT_MAX32, and a "block size" of 10 + (any value must be divisible by 10). + */ + HA_TOPTION_SYSVAR("s3_block_size", s3_block_size, block_size), + HA_TOPTION_ENUM("compression_algorithm", compression_algorithm, "none,zlib", + 0), + HA_TOPTION_END +}; + + +/***************************************************************************** + S3 handler code +******************************************************************************/ + +/** + Create S3 handler +*/ + + +ha_s3::ha_s3(handlerton *hton, TABLE_SHARE *table_arg) + :ha_maria(hton, table_arg), in_alter_table(S3_NO_ALTER), open_args(NULL) +{ + /* Remove things that S3 doesn't support */ + int_table_flags&= ~(HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE | + HA_CAN_EXPORT); + can_enable_indexes= 0; +} + + +/** + Remember the handler to use for s3_block_read() + + @note + In the future the ms3_st objects could be stored in + a list in share. In this case we would however need a mutex + to access the next free one. By using st_my_thread_var we + can avoid the mutex with the small cost of having to call + register handler in all handler functions that will access + the page cache +*/ + +void ha_s3::register_handler(MARIA_HA *file) +{ + struct st_my_thread_var *thread= my_thread_var; + thread->keycache_file= (void*) file; +} + + +/** + Write a row + + When generating the table as part of ALTER TABLE, writes are allowed. + When table is moved to S3, writes are not allowed. 
+*/ + +int ha_s3::write_row(const uchar *buf) +{ + DBUG_ENTER("ha_s3::write_row"); + if (in_alter_table) + DBUG_RETURN(ha_maria::write_row(buf)); + DBUG_RETURN(HA_ERR_TABLE_READONLY); +} + +/* Return true if S3 can be used */ + +static my_bool s3_usable() +{ + return (s3_access_key != 0 && s3_secret_key != 0 && s3_region != 0 && + s3_bucket != 0); +} + + +static my_bool s3_info_init(S3_INFO *info) +{ + if (!s3_usable()) + return 1; + info->protocol_version= (uint8_t) s3_protocol_version; + lex_string_set(&info->host_name, s3_host_name); + info->port= s3_port; + info->use_http= s3_use_http; + lex_string_set(&info->access_key, s3_access_key); + lex_string_set(&info->secret_key, s3_secret_key); + lex_string_set(&info->region, s3_region); + lex_string_set(&info->bucket, s3_bucket); + return 0; +} + + +/** + Fill information in S3_INFO including paths to table and database + + Notes: + Database and table name are set even if s3 variables are not + initialized. This is needed by s3::drop_table +*/ + +static my_bool s3_info_init(S3_INFO *s3_info, const char *path, + char *database_buff, size_t database_length) +{ + set_database_and_table_from_path(s3_info, path); + /* Fix database as it's not \0 terminated */ + strmake(database_buff, s3_info->database.str, + MY_MIN(database_length, s3_info->database.length)); + s3_info->database.str= database_buff; + s3_info->base_table= s3_info->table; + return s3_info_init(s3_info); +} + +/* + Check if table is a temporary table + + Returns 1 if table is a temporary table that should be stored in Aria + (to later be copied to S3 with a name change) +*/ + +static int is_mariadb_internal_tmp_table(const char *table_name) +{ + int length; + const int p_length= sizeof(tmp_file_prefix); // prefix + '-' + /* Temporary table from ALTER TABLE */ + if (!strncmp(table_name, tmp_file_prefix "-" , p_length)) + { + /* + Internal temporary tables used by ALTER TABLE and ALTER PARTITION + should be stored in S3 + */ + if (!strncmp(table_name+p_length, 
"backup-", sizeof("backup-")-1) || + !strncmp(table_name+p_length, "exchange-", sizeof("exchange-")-1) || + !strncmp(table_name+p_length, "temptable-", sizeof("temptable-")-1)) + return 0; + /* Other temporary tables should be stored in Aria on local disk */ + return 1; + } + length= strlen(table_name); + if (length > 5 && !strncmp(table_name + length - 5, "#TMP#", 5)) + return 1; + return 0; +} + + +/** + Drop S3 table +*/ + +int ha_s3::delete_table(const char *name) +{ + ms3_st *s3_client; + S3_INFO s3_info; + int error; + char database[NAME_LEN+1]; + DBUG_ENTER("ha_s3::delete_table"); + + error= s3_info_init(&s3_info, name, database, sizeof(database)-1); + + /* If internal on disk temporary table, let Aria take care of it */ + if (is_mariadb_internal_tmp_table(s3_info.table.str)) + DBUG_RETURN(ha_maria::delete_table(name)); + + if (error) + DBUG_RETURN(HA_ERR_UNSUPPORTED); + + if (!(s3_client= s3_open_connection(&s3_info))) + DBUG_RETURN(HA_ERR_NO_CONNECTION); + error= aria_delete_from_s3(s3_client, s3_info.bucket.str, + s3_info.database.str, + s3_info.table.str,0); + s3_deinit(s3_client); + DBUG_RETURN(error); +} + +/* + The table is a temporary table as part of ALTER TABLE. + + Copy the on disk 'temporary' Aria table to S3 and delete the Aria table +*/ + +static int move_table_to_s3(ms3_st *s3_client, + S3_INFO *to_s3_info, + const char *local_name, + bool is_partition) +{ + int error; + DBUG_ASSERT(!is_mariadb_internal_tmp_table(to_s3_info->table.str)); + + if (!(error= aria_copy_to_s3(s3_client, to_s3_info->bucket.str, local_name, + to_s3_info->database.str, + to_s3_info->table.str, + 0, 0, 1, 0, !is_partition))) + { + /* Table now in S3. Remove original files table files, keep .frm */ + error= maria_delete_table_files(local_name, 1, 0); + } + return error; +} + + +/** + Copy an Aria table to S3 or rename a table in S3 + + The copy happens as part of the rename in ALTER TABLE when all data + is in an Aria table and we now have to copy it to S3. 
+ + If the table is an old table already in S3, we should just rename it. +*/ + +int ha_s3::rename_table(const char *from, const char *to) +{ + S3_INFO to_s3_info; + char to_name[NAME_LEN+1], frm_name[FN_REFLEN]; + ms3_st *s3_client; + MY_STAT stat_info; + int error; + bool is_partition= (strstr(from, "#P#") != NULL) || + (strstr(to, "#P#") != NULL); + DBUG_ENTER("ha_s3::rename_table"); + + if (s3_info_init(&to_s3_info, to, to_name, sizeof(to_name)-1)) + DBUG_RETURN(HA_ERR_UNSUPPORTED); + if (!(s3_client= s3_open_connection(&to_s3_info))) + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + + /* + Check if this is a on disk table created by ALTER TABLE that should be + copied to S3. We know this is the case if the table is a temporary table + and the .MAI file for the table is on disk + */ + fn_format(frm_name, from, "", reg_ext, MYF(0)); + if (is_mariadb_internal_tmp_table(from + dirname_length(from)) && + (is_partition || my_stat(frm_name, &stat_info, MYF(0)))) + { + error= move_table_to_s3(s3_client, &to_s3_info, from, is_partition); + } + else + { + char from_name[NAME_LEN+1]; + S3_INFO from_s3_info; + /* The table is an internal S3 table. Do the renames */ + s3_info_init(&from_s3_info, from, from_name, sizeof(from_name)-1); + + if (is_mariadb_internal_tmp_table(to + dirname_length(to))) + { + /* + The table is renamed to a temporary table. This only happens + in the case of an ALTER PARTITION failure and there will be soon + a delete issued for the temporary table. The only thing we can do + is to remove the from table. We will get an extra errors for the + uppcoming but we will ignore this minor problem for now as this + is an unlikely event and the extra warnings are just annoying, + not critical. 
+ */ + error= aria_delete_from_s3(s3_client, from_s3_info.bucket.str, + from_s3_info.database.str, + from_s3_info.table.str,0); + } + else + error= aria_rename_s3(s3_client, to_s3_info.bucket.str, + from_s3_info.database.str, + from_s3_info.table.str, + to_s3_info.database.str, + to_s3_info.table.str, + !is_partition && + !current_thd->lex->alter_info.partition_flags); + } + s3_deinit(s3_client); + DBUG_RETURN(error); +} + + +/** + Create a s3 table. + + @notes + One can only create an s3 table as part of ALTER TABLE + The table is created as a non transactional Aria table with + BLOCK_RECORD format +*/ + +int ha_s3::create(const char *name, TABLE *table_arg, + HA_CREATE_INFO *ha_create_info) +{ + uchar *frm_ptr; + size_t frm_len; + int error; + TABLE_SHARE *share= table_arg->s; + DBUG_ENTER("ha_s3::create"); + + if (!(ha_create_info->options & HA_CREATE_TMP_ALTER) || + ha_create_info->tmp_table()) + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + + if (share->table_type == TABLE_TYPE_SEQUENCE) + DBUG_RETURN(HA_ERR_UNSUPPORTED); + + /* When using partitions, S3 only supports adding and remove partitions */ + if ((table_arg->in_use->lex->alter_info.partition_flags & + ~(ALTER_PARTITION_REMOVE | ALTER_PARTITION_ADD | ALTER_PARTITION_INFO))) + DBUG_RETURN(HA_ERR_UNSUPPORTED); + + if (!s3_usable()) + DBUG_RETURN(HA_ERR_UNSUPPORTED); + + /* Force the table to a format suitable for S3 */ + ha_create_info->row_type= ROW_TYPE_PAGE; + ha_create_info->transactional= HA_CHOICE_NO; + error= ha_maria::create(name, table_arg, ha_create_info); + if (error) + DBUG_RETURN(error); + +#ifdef MOVE_FILES_TO_S3_ON_CREATE + /* + If we are in ADD PARTITION and we created a new table (not + temporary table, which will be moved as part of the final rename), + we should move it S3 right away. The other option would to move + it as part of close(). We prefer to do this here as there is no error + checking with close() which would leave incomplete tables around in + case of failures. 
The downside is that we can't move rows around as + part of changing partitions, but that is not a big problem with S3 + as it's readonly anyway. + */ + if (!is_mariadb_internal_tmp_table(name + dirname_length(name)) && + strstr(name, "#P#")) + { + S3_INFO to_s3_info; + char database[NAME_LEN+1]; + ms3_st *s3_client; + + if (s3_info_init(&to_s3_info, name, database, sizeof(database)-1)) + DBUG_RETURN(HA_ERR_UNSUPPORTED); + if (!(s3_client= s3_open_connection(&to_s3_info))) + DBUG_RETURN(HA_ERR_NO_CONNECTION); + + /* Note that if error is set, then the empty temp table was not removed */ + error= move_table_to_s3(s3_client, &to_s3_info, name, 1); + s3_deinit(s3_client); + if (error) + maria_delete_table_files(name, 1, 0); + else +#endif /* MOVE_TABLE_TO_S3 */ + { + /* Create the .frm file. Needed for ha_s3::rename_table() later */ + if (!table_arg->s->read_frm_image((const uchar**) &frm_ptr, &frm_len)) + { + table_arg->s->write_frm_image(frm_ptr, frm_len); + table_arg->s->free_frm_image(frm_ptr); + } + } + DBUG_RETURN(error); +} + +/** + Open table + + @notes + Table is read only, except if opened by ALTER as in this case we + are creating the S3 table. +*/ + +int ha_s3::open(const char *name, int mode, uint open_flags) +{ + bool internal_tmp_table= 0; + int res; + S3_INFO s3_info; + DBUG_ENTER("ha_s3:open"); + + if (!s3_usable()) + DBUG_RETURN(HA_ERR_UNSUPPORTED); + + /* + On slaves with s3_slave_ignore_updates set we allow tables to be + opened in write mode to be able to ignore queries that modify + the table trough handler::check_if_updates_are_ignored(). + + This is needed for the slave to be able to handle + CREATE TABLE t1... + INSERT INTO TABLE t1 .... + ALTER TABLE t1 ENGINE=S3 + If this is not done, the insert will fail on the slave if the + master has already executed the ALTER TABLE. + + We also have to allow open for create, as part of + ALTER TABLE ... ENGINE=S3. 
+ + Otherwise we only allow the table to be open in read mode + */ + if (mode != O_RDONLY && !(open_flags & HA_OPEN_FOR_CREATE) && + !s3_slave_ignore_updates) + DBUG_RETURN(EACCES); + + open_args= 0; + internal_tmp_table= is_mariadb_internal_tmp_table(name + + dirname_length(name)); + + if (!(open_flags & HA_OPEN_FOR_CREATE) && !internal_tmp_table) + { + (void) s3_info_init(&s3_info); + s3_info.tabledef_version= table->s->tabledef_version; + s3_info.base_table= table->s->table_name; + + /* Pass the above arguments to maria_open() */ + open_args= &s3_info; + in_alter_table= S3_NO_ALTER; + } + else + { + /* + Table was created as an Aria table that will be moved to S3 either + by rename_table() or external_lock() + */ + bool is_partition= (strstr(name, "#P#") != NULL); + in_alter_table= (!is_partition ? S3_ALTER_TABLE : + internal_tmp_table ? S3_ADD_TMP_PARTITION : + S3_ADD_PARTITION); + } + DBUG_PRINT("info", ("in_alter_table: %d", in_alter_table)); + + if (!(res= ha_maria::open(name, mode, open_flags))) + { + if (open_args) + { + /* + Table is in S3. We have to modify the pagecache callbacks for the + data file, index file and for bitmap handling. + */ + file->s->pagecache= &s3_pagecache; + file->dfile.big_block_size= file->s->kfile.big_block_size= + file->s->bitmap.file.big_block_size= file->s->base.s3_block_size; + file->s->kfile.head_blocks= file->s->base.keystart / file->s->block_size; + file->s->no_status_updates= in_alter_table == S3_NO_ALTER; + } + } + open_args= 0; + DBUG_RETURN(res); +} + + +int ha_s3::external_lock(THD * thd, int lock_type) +{ + int error; + DBUG_ENTER("ha_s3::external_lock"); + + error= ha_maria::external_lock(thd, lock_type); + if (in_alter_table == S3_ADD_PARTITION && !error && lock_type == F_UNLCK) + { + /* + This was a new partition. 
All data is now copied to the table + so it's time to move it to S3) + */ + + MARIA_SHARE *share= file->s; + uint org_open_count; + + /* First, flush all data to the Aria table */ + if (flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_RELEASE)) + error= my_errno; + if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file, + FLUSH_RELEASE)) + error= my_errno; + org_open_count= share->state.open_count; + if (share->global_changed) + share->state.open_count--; + if (_ma_state_info_write(share, MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_LOCK)) + error= my_errno; + share->state.open_count= org_open_count; + + if (!error) + { + S3_INFO to_s3_info; + char database[NAME_LEN+1], *name= file->s->open_file_name.str; + ms3_st *s3_client; + + /* Copy data to S3 */ + if (s3_info_init(&to_s3_info, name, database, sizeof(database)-1)) + DBUG_RETURN(HA_ERR_UNSUPPORTED); + if (!(s3_client= s3_open_connection(&to_s3_info))) + DBUG_RETURN(HA_ERR_NO_CONNECTION); + + /* + Note that if error is set, then the empty temp table was not + removed + */ + error= move_table_to_s3(s3_client, &to_s3_info, name, 1); + s3_deinit(s3_client); + + maria_delete_table_files(name, 1, 0); + } + } + DBUG_RETURN(error); +} + + +/****************************************************************************** + Storage engine handler definitions +******************************************************************************/ + +/** + Free all resources for s3 +*/ + +static handler *s3_create_handler(handlerton *hton, + TABLE_SHARE * table, + MEM_ROOT *mem_root) +{ + return new (mem_root) ha_s3(hton, table); +} + + +static int s3_hton_panic(handlerton *hton, ha_panic_function flag) +{ + if (flag == HA_PANIC_CLOSE && s3_hton) + { + end_pagecache(&s3_pagecache, TRUE); + s3_deinit_library(); + my_free(s3_access_key); + my_free(s3_secret_key); + s3_access_key= s3_secret_key= 0; + s3_hton= 0; + } + return 0; +} + + +/** + Check if a table is in S3 as part of discovery. 
Returns TABLE_SHARE if found. + + @param hton S3 handlerton + @param thd MariaDB thd + @param [out] share If table exists, this is updated to contain the found + TABLE_SHARE (based on the .frm in S3) + + @return 0 Table exists + @return # Error number +*/ + +static int s3_discover_table(handlerton *hton, THD* thd, TABLE_SHARE *share) +{ + S3_INFO s3_info; + S3_BLOCK frm_block, par_block; + ms3_st *s3_client; + int error; + DBUG_ENTER("s3_discover_table"); + + if (s3_info_init(&s3_info)) + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + if (!(s3_client= s3_open_connection(&s3_info))) + DBUG_RETURN(HA_ERR_NO_CONNECTION); + + s3_info.database= share->db; + s3_info.table= share->table_name; + s3_info.base_table= share->table_name; + + if (s3_get_def(s3_client, &s3_info, &frm_block, "frm")) + { + s3_free(&frm_block); + s3_deinit(s3_client); + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + (void) s3_get_def(s3_client, &s3_info, &par_block, "par"); + + error= share->init_from_binary_frm_image(thd, 1, + frm_block.str, frm_block.length, + par_block.str, par_block.length); + s3_free(&frm_block); + s3_free(&par_block); + s3_deinit(s3_client); + DBUG_RETURN((my_errno= error)); +} + + +/** + Check if a table exists + + @return 0 frm doesn't exists + @return 1 frm exists +*/ + +static int s3_discover_table_existence(handlerton *hton, const char *db, + const char *table_name) +{ + S3_INFO s3_info; + ms3_st *s3_client; + int res; + DBUG_ENTER("s3_discover_table_existence"); + + /* Ignore names in "mysql" database to speed up boot */ + if (!strcmp(db, MYSQL_SCHEMA_NAME.str)) + DBUG_RETURN(0); + + if (s3_info_init(&s3_info)) + DBUG_RETURN(0); + if (!(s3_client= s3_open_connection(&s3_info))) + DBUG_RETURN(0); + + s3_info.database.str= db; + s3_info.database.length= strlen(db); + s3_info.table.str= table_name; + s3_info.table.length= strlen(table_name); + + res= s3_frm_exists(s3_client, &s3_info); + s3_deinit(s3_client); + DBUG_PRINT("exit", ("exists: %d", res == 0)); + DBUG_RETURN(res == 0); // 
Return 1 if exists +} + + +/** + Return a list of all S3 tables in a database + + Partitoned tables are not shown +*/ + +static int s3_discover_table_names(handlerton *hton __attribute__((unused)), + LEX_CSTRING *db, + MY_DIR *dir __attribute__((unused)), + handlerton::discovered_list *result) +{ + char aws_path[AWS_PATH_LENGTH]; + S3_INFO s3_info; + ms3_st *s3_client; + ms3_list_st *list, *org_list= 0; + int error; + DBUG_ENTER("s3_discover_table_names"); + + /* Ignore names in "mysql" database to speed up boot */ + if (!strcmp(db->str, MYSQL_SCHEMA_NAME.str)) + DBUG_RETURN(0); + + if (s3_info_init(&s3_info)) + DBUG_RETURN(0); + if (!(s3_client= s3_open_connection(&s3_info))) + DBUG_RETURN(0); + + strxnmov(aws_path, sizeof(aws_path)-1, db->str, "/", NullS); + + if ((error= ms3_list_dir(s3_client, s3_info.bucket.str, aws_path, &org_list))) + goto end; + + for (list= org_list ; list ; list= list->next) + { + const char *name= list->key + db->length + 1; // Skip database and '/' + if (!strstr(name, "#P#")) + { + size_t name_length= strlen(name)-1; // Remove end '/' + result->add_table(name, name_length); + } + } + if (org_list) + ms3_list_free(org_list); +end: + s3_deinit(s3_client); + DBUG_RETURN(0); +} + +/* + Check if definition of table in S3 is same as in MariaDB. + This also covers the case where the table is not in S3 anymore. + + Called when a copy of the S3 table is taken from the MariaDB table cache + + TODO: Could possible be optimized by checking if the file on S3 is + of same time, data and size since when table was originally opened. +*/ + +int ha_s3::discover_check_version() +{ + S3_INFO s3_info= *file->s->s3_path; + s3_info.tabledef_version= table->s->tabledef_version; + /* + We have to change the database and table as the table may part of a + partitoned table. In this case we want to check the frm file for the + partitioned table, not the part table. 
+ */ + s3_info.base_table= table->s->table_name; + return (s3_check_frm_version(file->s3, &s3_info) ? + HA_ERR_TABLE_DEF_CHANGED : 0); +} + + +/** + Update the .frm file in S3 +*/ + +static int s3_notify_tabledef_changed(handlerton *, + LEX_CSTRING *db, LEX_CSTRING *table, + LEX_CUSTRING *frm, + LEX_CUSTRING *org_tabledef_version, + handler *) +{ + char aws_path[AWS_PATH_LENGTH]; + S3_INFO s3_info; + ms3_st *s3_client; + int error= 0; + DBUG_ENTER("s3_notify_tabledef_changed"); + + if (strstr(table->str, "#P#")) + DBUG_RETURN(0); // Ignore partitions + + if (s3_info_init(&s3_info)) + DBUG_RETURN(0); + if (!(s3_client= s3_open_connection(&s3_info))) + DBUG_RETURN(0); + + s3_info.database= *db; + s3_info.base_table= *table; + s3_info.tabledef_version= *org_tabledef_version; + if (s3_check_frm_version(s3_client, &s3_info)) + { + error= 1; + goto err; + } + + strxnmov(aws_path, sizeof(aws_path)-1, db->str, "/", table->str, "/frm", + NullS); + + if (s3_put_object(s3_client, s3_info.bucket.str, aws_path, (uchar*) frm->str, + frm->length, 0)) + error= 2; + +err: + s3_deinit(s3_client); + DBUG_RETURN(error); +} + + +/** + Update the .frm and .par file of a partitioned table stored in s3 + + Logic is: + - Skip temporary tables used internally by ALTER TABLE and ALTER PARTITION + - In case of delete, delete the .frm and .par file from S3 + - In case of create, copy the .frm and .par files to S3 + - In case of rename: + - Delete from old_path if not internal temporary file and if exists + - Copy new .frm and .par file to S3 + + To ensure that this works with the reply logic from ALTER PARTITION + there should be no errors, only notes, for deletes. 
+*/ + +static int s3_create_partitioning_metadata(const char *path, + const char *old_path, + chf_create_flags action_flag) +{ + ms3_st *s3_client; + S3_INFO s3_info; + int error= 0; + char database[NAME_LEN+1]; + const char *tmp_path; + DBUG_ENTER("s3_create_partitioning_metadata"); + + /* Path is empty in case of delete */ + tmp_path= path ? path : old_path; + + if (s3_info_init(&s3_info, tmp_path, database, sizeof(database)-1)) + DBUG_RETURN(HA_ERR_UNSUPPORTED); + if (!(s3_client= s3_open_connection(&s3_info))) + DBUG_RETURN(HA_ERR_NO_CONNECTION); + + switch (action_flag) { + case CHF_DELETE_FLAG: + case CHF_RENAME_FLAG: + { + if (!is_mariadb_internal_tmp_table(old_path + dirname_length(old_path))) + { + S3_INFO s3_info2; + char database2[NAME_LEN+1]; + s3_info_init(&s3_info2, old_path, database2, sizeof(database2)-1); + + partition_delete_from_s3(s3_client, s3_info2.bucket.str, + s3_info2.database.str, s3_info2.table.str, + MYF(ME_NOTE)); + } + if (action_flag == CHF_DELETE_FLAG) + break; + } + /* Fall through */ + case CHF_CREATE_FLAG: + if (!is_mariadb_internal_tmp_table(path + dirname_length(path))) + error= partition_copy_to_s3(s3_client, s3_info.bucket.str, + path, old_path, + s3_info.database.str, s3_info.table.str); + break; + case CHF_INDEX_FLAG: + break; + } + s3_deinit(s3_client); + DBUG_RETURN(error); +} + + +/** + Initialize s3 plugin +*/ + +static int ha_s3_init(void *p) +{ + bool res; + static const char *no_exts[]= { 0 }; + + s3_hton= (handlerton *)p; + s3_hton->db_type= DB_TYPE_S3; + s3_hton->create= s3_create_handler; + s3_hton->panic= s3_hton_panic; + s3_hton->table_options= s3_table_option_list; + s3_hton->discover_table= s3_discover_table; + s3_hton->discover_table_names= s3_discover_table_names; + s3_hton->discover_table_existence= s3_discover_table_existence; + s3_hton->notify_tabledef_changed= s3_notify_tabledef_changed; + s3_hton->create_partitioning_metadata= s3_create_partitioning_metadata; + s3_hton->tablefile_extensions= no_exts; + 
s3_hton->commit= 0; + s3_hton->rollback= 0; + s3_hton->checkpoint_state= 0; + s3_hton->flush_logs= 0; + s3_hton->show_status= 0; + s3_hton->prepare_for_backup= 0; + s3_hton->end_backup= 0; + s3_hton->flags= ((s3_slave_ignore_updates ? HTON_IGNORE_UPDATES : 0) | + (s3_replicate_alter_as_create_select ? + HTON_TABLE_MAY_NOT_EXIST_ON_SLAVE : 0)); + /* Copy global arguments to s3_access_key and s3_secret_key */ + update_access_key(0,0,0,0); + update_secret_key(0,0,0,0); + + if ((res= !init_pagecache(&s3_pagecache, + (size_t) s3_pagecache_buffer_size, + s3_pagecache_division_limit, + s3_pagecache_age_threshold, maria_block_size, + s3_pagecache_file_hash_size, 0))) + s3_hton= 0; + s3_pagecache.big_block_read= s3_block_read; + s3_pagecache.big_block_free= s3_free; + s3_init_library(); + if (s3_debug) + ms3_debug(); + + struct s3_func s3f_real = + { + ms3_set_option, s3_free, ms3_deinit, s3_unique_file_number, + read_index_header, s3_check_frm_version, s3_info_copy, + set_database_and_table_from_path, s3_open_connection + }; + s3f= s3f_real; + + return res ? 
HA_ERR_INITIALIZATION : 0; +} + +static int ha_s3_deinit(void*) +{ + bzero(&s3f, sizeof(s3f)); + return 0; +} + +static SHOW_VAR status_variables[]= { + {"pagecache_blocks_not_flushed", + (char*) &s3_pagecache.global_blocks_changed, SHOW_LONG}, + {"pagecache_blocks_unused", + (char*) &s3_pagecache.blocks_unused, SHOW_LONG}, + {"pagecache_blocks_used", + (char*) &s3_pagecache.blocks_used, SHOW_LONG}, + {"pagecache_read_requests", + (char*) &s3_pagecache.global_cache_r_requests, SHOW_LONGLONG}, + {"pagecache_reads", + (char*) &s3_pagecache.global_cache_read, SHOW_LONGLONG}, + {NullS, NullS, SHOW_LONG} +}; + + +static struct st_mysql_sys_var* system_variables[]= { + MYSQL_SYSVAR(block_size), + MYSQL_SYSVAR(debug), + MYSQL_SYSVAR(protocol_version), + MYSQL_SYSVAR(pagecache_age_threshold), + MYSQL_SYSVAR(pagecache_buffer_size), + MYSQL_SYSVAR(pagecache_division_limit), + MYSQL_SYSVAR(pagecache_file_hash_size), + MYSQL_SYSVAR(host_name), + MYSQL_SYSVAR(port), + MYSQL_SYSVAR(use_http), + MYSQL_SYSVAR(bucket), + MYSQL_SYSVAR(access_key), + MYSQL_SYSVAR(secret_key), + MYSQL_SYSVAR(region), + MYSQL_SYSVAR(slave_ignore_updates), + MYSQL_SYSVAR(replicate_alter_as_create_select), + NULL +}; + +struct st_mysql_storage_engine s3_storage_engine= +{ MYSQL_HANDLERTON_INTERFACE_VERSION }; + +maria_declare_plugin(s3) +{ + MYSQL_STORAGE_ENGINE_PLUGIN, + &s3_storage_engine, + "S3", + "MariaDB Corporation Ab", + "Read only table stored in S3. 
Created by running " + "ALTER TABLE table_name ENGINE=s3", + PLUGIN_LICENSE_GPL, + ha_s3_init, /* Plugin Init */ + ha_s3_deinit, /* Plugin Deinit */ + 0x0100, /* 1.0 */ + status_variables, /* status variables */ + system_variables, /* system variables */ + "1.0", /* string version */ + MariaDB_PLUGIN_MATURITY_STABLE/* maturity */ +} +maria_declare_plugin_end; diff --git a/storage/maria/ha_s3.h b/storage/maria/ha_s3.h new file mode 100644 index 00000000..f7bffceb --- /dev/null +++ b/storage/maria/ha_s3.h @@ -0,0 +1,75 @@ +#ifndef HA_S3_INCLUDED +#define HA_S3_INCLUDED +/* Copyright (C) 2019, 2020, MariaDB Corporation AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the + Free Software Foundation, Inc. 
+ 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA +*/ + +#include "ha_maria.h" + +class ha_s3 final :public ha_maria +{ + enum alter_table_op + { S3_NO_ALTER, S3_ALTER_TABLE, S3_ADD_PARTITION, S3_ADD_TMP_PARTITION }; + alter_table_op in_alter_table; + S3_INFO *open_args; + +public: + ha_s3(handlerton *hton, TABLE_SHARE * table_arg); + ~ha_s3() {} + + int create(const char *name, TABLE *table_arg, + HA_CREATE_INFO *ha_create_info) override; + int open(const char *name, int mode, uint open_flags) override; + int write_row(const uchar *buf) override; + int update_row(const uchar *, const uchar *) override + { + DBUG_ENTER("update_row"); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + int delete_row(const uchar *) override + { + DBUG_ENTER("delete_row"); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + int analyze(THD *, HA_CHECK_OPT *) override + { + DBUG_ENTER("analyze"); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + int repair(THD * thd, HA_CHECK_OPT * check_opt) override + { + DBUG_ENTER("repair"); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + int preload_keys(THD * thd, HA_CHECK_OPT * check_opt) override + { + DBUG_ENTER("preload_keys"); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + int external_lock(THD * thd, int lock_type) override; + /* + drop_table() is only used for internal temporary tables, + not applicable for s3 + */ + void drop_table(const char *) override {} + int delete_table(const char *name) override; + int rename_table(const char *from, const char *to) override; + int discover_check_version() override; + int rebind(); + S3_INFO *s3_open_args() override { return open_args; } + void register_handler(MARIA_HA *file) override; +}; +#endif /* HA_S3_INCLUDED */ diff --git a/storage/maria/libmarias3/.gitignore b/storage/maria/libmarias3/.gitignore new file mode 100644 index 00000000..ca360629 --- /dev/null +++ b/storage/maria/libmarias3/.gitignore @@ -0,0 +1,40 @@ +aclocal.m4 +autom4te.cache/ +build-aux/ +config.in +configure +m4/libtool.m4 +m4/ltoptions.m4 
+m4/ltsugar.m4 +m4/ltversion.m4 +m4/lt~obsolete.m4 +Makefile +Makefile.in +aminclude.am +config.h +config.in~ +config.log +config.status +libmarias3-config +libmarias3.pc +libtool +src/.deps/ +src/.dirstamp +src/.libs/ +stamp-h1 +t/ +test-suite.log +tests/.deps/ +tests/.dirstamp +version.h +*.dgcov +*.o +*.lo +*.la +tags +docs/latex/ +html/ +*.tar.gz +*.rpm +rpm/libmarias3.spec +*.orig diff --git a/storage/maria/libmarias3/GNUmakefile b/storage/maria/libmarias3/GNUmakefile new file mode 100644 index 00000000..646edc8e --- /dev/null +++ b/storage/maria/libmarias3/GNUmakefile @@ -0,0 +1,36 @@ +# vim:ft=make +# +_bootstrap_Makefile := $(wildcard [M]akefile) +_bootstrap_config-status := $(wildcard config.status) + +ALL_RECURSIVE_TARGETS= + +ifneq ($(_bootstrap_Makefile),) + include Makefile +else + ifneq ($(_bt_config-status),) + $(srcdir)/config.status + $(MAKE) $(AM_MAKEFLAGS) configure + endif + +.DEFAULT_GOAL:= basic_build +srcdir= . + +configure: + @autoreconf -fi + +Makefile: configure + @$(srcdir)/configure + +.PHONY: basic_build +basic_build: Makefile + @$(MAKE) $(AM_MAKEFLAGS) +endif + +ALL_RECURSIVE_TARGETS+= $(AM_RECURSIVE_TARGETS) + +ifneq ($(word 2, $(MAKECMDGOALS)), ) +ifneq ($(filter $(ALL_RECURSIVE_TARGETS), $(MAKECMDGOALS)), ) +.NOTPARALLEL: +endif +endif diff --git a/storage/maria/libmarias3/LICENSE b/storage/maria/libmarias3/LICENSE new file mode 100644 index 00000000..4362b491 --- /dev/null +++ b/storage/maria/libmarias3/LICENSE @@ -0,0 +1,502 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] 
+ + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. 
+ + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. 
It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. 
This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. 
You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) 
+ +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
diff --git a/storage/maria/libmarias3/Makefile.am b/storage/maria/libmarias3/Makefile.am new file mode 100644 index 00000000..c1c5190a --- /dev/null +++ b/storage/maria/libmarias3/Makefile.am @@ -0,0 +1,77 @@ +# vim:ft=automake + +ACLOCAL_AMFLAGS = -I m4 + +# includes append to these: +SUFFIXES = +.PHONY = +TESTS = +CLEANFILES = +DISTCLEANFILES = +bin_PROGRAMS = +dist_bin_SCRIPTS = +noinst_HEADERS = +lib_LTLIBRARIES = +noinst_LTLIBRARIES = +noinst_PROGRAMS = +include_HEADERS = +nobase_include_HEADERS = +check_PROGRAMS = +EXTRA_HEADERS = +EXTRA_SCRIPTS = +BUILT_SOURCES= +EXTRA_DIST= +CONFIGURE_DEPENDENCIES= + +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = libmarias3.pc + +#includes additional rules from aminclude.am +@INC_AMINCLUDE@ +DISTCLEANFILES+= aminclude.am + +EXTRA_DIST+= README.rst +EXTRA_DIST+= LICENSE +EXTRA_DIST+= VERSION.txt + +aclocaldir= $(datadir)/aclocal +dist_aclocal_DATA= + +TESTS+= ${check_PROGRAMS} + +include m4/include.am +include src/include.am +include libmarias3/include.am +include tests/include.am +#include examples/include.am +include rpm/include.mk +include docs/include.am +include yatl/include.am + +dist_bin_SCRIPTS+= @GENERIC_CONFIG@ +DISTCLEANFILES+= @GENERIC_CONFIG@ + +# Cleanup individual files in order to preserve uninstall/etc order +maintainer-clean-local: + -rm Makefile.in + -rm aclocal.m4 + -rm build-aux/config.guess + -rm build-aux/config.sub + -rm build-aux/depcomp + -rm build-aux/install-sh + -rm build-aux/ltmain.sh + -rm build-aux/missing + -rm build-aux/test-driver + -rmdir build-aux + -rm configure + -rm config.log + -rm config.status + -rm config.in + -rm m4/libtool.m4 + -rm m4/ltoptions.m4 + -rm m4/ltsugar.m4 + -rm m4/ltversion.m4 + -rm m4/lt~obsolete.m4 + find . 
-type f -name '*~' -exec rm -f '{}' \; + -rm -f @PACKAGE@-*.tar.gz + -rm -f @PACKAGE@-*.rpm diff --git a/storage/maria/libmarias3/README.rst b/storage/maria/libmarias3/README.rst new file mode 100644 index 00000000..2d9a7f49 --- /dev/null +++ b/storage/maria/libmarias3/README.rst @@ -0,0 +1,68 @@ +libMariaS3 +========== + +.. image:: https://readthedocs.org/projects/libmarias3/badge/?version=latest + :target: https://libmarias3.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + +This is a lightweight C library to read/write to AWS S3 buckets using objects in memory. + +You will need an access key which for AWS can be created at `the AWS security credentials page <https://console.aws.amazon.com/iam/home?#/security_credentials>`_. + +Compiling +--------- + +.. code-block:: bash + + autoreconf -fi + ./configure + make + +Testing +------- + +libMariaS3 comes with a basic test suite which we recommend executing, especially if you are building for a new platform. + +You will need the following OS environment variables set to run the tests: + ++------------+----------------------------------------------------------+ +| Variable | Description | ++============+==========================================================+ +| S3KEY | Your AWS access key | ++------------+----------------------------------------------------------+ +| S3SECRET | Your AWS secret key | ++------------+----------------------------------------------------------+ +| S3REGION | The AWS region (for example us-east-1) | ++------------+----------------------------------------------------------+ +| S3BUCKET | The S3 bucket name | ++------------+----------------------------------------------------------+ +| S3HOST | OPTIONAL hostname for non-AWS S3 service | ++------------+----------------------------------------------------------+ +| S3PORT | OPTIONAL port for non-AWS S3 service | ++------------+----------------------------------------------------------+ +| S3USEHTTP | Set to ``1`` if the 
host uses http instead of https | ++------------+----------------------------------------------------------+ +| S3NOVERIFY | Set to ``1`` if the host should not use SSL verification | ++------------+----------------------------------------------------------+ + +The test suite is automatically built along with the library and can be executed with ``make check`` or ``make distcheck``. + +Before pushing, please ALWAYS ensure that ``make check`` and ``make distcheck`` works! + + +Credits +------- + +The libMariaS3 authors are: + +* `Andrew (LinuxJedi) Hutchings <mailto:andrew@linuxjedi.co.uk>`_ + - Starting with this commit, all my contributions are under the 3-clause BSD license. +* `Sergei Golubchik <mailto:sergei@mariadb.com>`_ +* `Markus Mäkelä <markus.makela@mariadb.com>`_ + +libMariaS3 uses the following Open Source projects: + +* `libcurl <https://curl.haxx.se/>`_ +* `xml.c <https://github.com/ooxi/xml.c/>`_ +* `DDM4 <https://github.com/TangentOrg/ddm4>`_ +* `Jouni Malinen's SHA256 hash code <j@w1.fi>`_ diff --git a/storage/maria/libmarias3/VERSION.txt b/storage/maria/libmarias3/VERSION.txt new file mode 100644 index 00000000..ef538c28 --- /dev/null +++ b/storage/maria/libmarias3/VERSION.txt @@ -0,0 +1 @@ +3.1.2 diff --git a/storage/maria/libmarias3/ci-scripts/asan.sh b/storage/maria/libmarias3/ci-scripts/asan.sh new file mode 100755 index 00000000..ad3ff4b2 --- /dev/null +++ b/storage/maria/libmarias3/ci-scripts/asan.sh @@ -0,0 +1,7 @@ +#!/bin/sh +export CC=clang +export CFLAGS="-fsanitize=address" +autoreconf -fi +./configure --enable-debug=yes +make +make check 2>/dev/null diff --git a/storage/maria/libmarias3/ci-scripts/cppcheck.sh b/storage/maria/libmarias3/ci-scripts/cppcheck.sh new file mode 100755 index 00000000..a717b406 --- /dev/null +++ b/storage/maria/libmarias3/ci-scripts/cppcheck.sh @@ -0,0 +1,2 @@ +#!/bin/sh +cppcheck --quiet --enable=all --error-exitcode=1 . 
src tests libmarias3 diff --git a/storage/maria/libmarias3/ci-scripts/distcheck.sh b/storage/maria/libmarias3/ci-scripts/distcheck.sh new file mode 100755 index 00000000..e6c469b0 --- /dev/null +++ b/storage/maria/libmarias3/ci-scripts/distcheck.sh @@ -0,0 +1,4 @@ +#!/bin/sh +autoreconf -fi +./configure --enable-debug=yes +make distcheck 2>/dev/null diff --git a/storage/maria/libmarias3/ci-scripts/docs.sh b/storage/maria/libmarias3/ci-scripts/docs.sh new file mode 100755 index 00000000..d5c8dcb6 --- /dev/null +++ b/storage/maria/libmarias3/ci-scripts/docs.sh @@ -0,0 +1,4 @@ +#!/bin/sh +autoreconf -fi +./configure +make html diff --git a/storage/maria/libmarias3/ci-scripts/scanbuild.sh b/storage/maria/libmarias3/ci-scripts/scanbuild.sh new file mode 100755 index 00000000..541e7f6a --- /dev/null +++ b/storage/maria/libmarias3/ci-scripts/scanbuild.sh @@ -0,0 +1,5 @@ +#!/bin/sh +export CC="clang" +autoreconf -fi +./configure --enable-debug=yes +scan-build --use-cc=clang --use-c++=clang++ --status-bugs make diff --git a/storage/maria/libmarias3/ci-scripts/tsan.sh b/storage/maria/libmarias3/ci-scripts/tsan.sh new file mode 100755 index 00000000..9fa2b132 --- /dev/null +++ b/storage/maria/libmarias3/ci-scripts/tsan.sh @@ -0,0 +1,7 @@ +#!/bin/sh +export CC=clang +export CFLAGS="-fsanitize=thread" +autoreconf -fi +./configure --enable-debug=yes +make +make check 2>/dev/null diff --git a/storage/maria/libmarias3/ci-scripts/usan.sh b/storage/maria/libmarias3/ci-scripts/usan.sh new file mode 100755 index 00000000..9090c37d --- /dev/null +++ b/storage/maria/libmarias3/ci-scripts/usan.sh @@ -0,0 +1,7 @@ +#!/bin/sh +export CC=clang +export CFLAGS="-fsanitize=undefined -fsanitize=nullability" +autoreconf -fi +./configure --enable-debug=yes +make +make check 2>/dev/null diff --git a/storage/maria/libmarias3/ci-scripts/valgrind.sh b/storage/maria/libmarias3/ci-scripts/valgrind.sh new file mode 100755 index 00000000..0d6ac291 --- /dev/null +++ 
b/storage/maria/libmarias3/ci-scripts/valgrind.sh @@ -0,0 +1,4 @@ +#!/bin/sh +autoreconf -fi +./configure --enable-debug=yes +TESTS_ENVIRONMENT="./libtool --mode=execute valgrind --error-exitcode=1 --leak-check=full --track-fds=no --malloc-fill=A5 --free-fill=DE --suppressions=ci-scripts/valgrind.supp" make check diff --git a/storage/maria/libmarias3/ci-scripts/valgrind.supp b/storage/maria/libmarias3/ci-scripts/valgrind.supp new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/storage/maria/libmarias3/ci-scripts/valgrind.supp diff --git a/storage/maria/libmarias3/configure.ac b/storage/maria/libmarias3/configure.ac new file mode 100644 index 00000000..40a9e48c --- /dev/null +++ b/storage/maria/libmarias3/configure.ac @@ -0,0 +1,122 @@ +# configure.ac -*- autoconf -*- vim: filetype=config +# +# DDM4 +# Copyright (C) 2012 Data Differential, http://datadifferential.com/ +# All rights reserved. +# +# Use and distribution licensed under the BSD license. See +# the COPYING file in this directory for full text. + +AC_COPYRIGHT([2019 MariaDB Corporation Ab]) +#AC_REVISION([m4_esyscmd_s([git describe --always])]) +AC_PREREQ([2.63]) + +# These two needed for RHEL/CentOS 6 +m4_ifndef([m4_esyscmd_s], [m4_define([m4_chomp_all],[m4_format([[%.*s]], m4_bregexp(m4_translit([[$1]],[/],[/]),[/*$]), [$1])])] [m4_define([m4_esyscmd_s], [m4_chomp_all(m4_esyscmd([$1]))])]) + +m4_ifndef([AS_VAR_COPY], [m4_define([AS_VAR_COPY], [AS_LITERAL_IF([$1[]$2], [$1=$$2],[eval $1=\$$2])])]) + +# NOTE: Major version should be < 100 and minor/patch < 256 or bad things will +# happen in src/asql/utility.cc +AC_INIT([libmarias3],m4_esyscmd_s([tr -d '\n' < VERSION.txt]),[linuxjedi@mariadb.com],[libmarias3],[https://mariadb.com]) +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) + +# Must come before AM_INIT_AUTOMAKE. 
+AC_CONFIG_AUX_DIR([build-aux]) +AC_CANONICAL_HOST +AC_CONFIG_MACRO_DIR([m4]) + +# Setup the compilers early on +AC_PROG_CC([cc gcc clang]) + +# Automake version before 1.13 (when the serial-tests option was dnl still the default) still defined the badly obsolete macro.  'AC_PROG_INSTALL'. +AM_INIT_AUTOMAKE(1.11 no-define color-tests -Wno-portability subdir-objects foreign tar-ustar m4_ifndef([AC_PROG_INSTALL], [serial-tests])) + +AC_ARG_PROGRAM +AC_USE_SYSTEM_EXTENSIONS + +AC_CONFIG_HEADERS([config.h:config.in])dnl Keep filename to 8.3 for MS-DOS. + +# shared library versioning +LIBMARIAS3_LIBRARY_VERSION=4:2:1 +# | | | +# +------+ | +---+ +# | | | +# current:revision:age +# | | | +# | | +- increment if interfaces have been added +# | | set to zero if interfaces have been removed or changed +# | +- increment if source code has changed +# | set to zero if current is incremented +# +- increment if interfaces have been added, removed or changed +AC_SUBST([LIBMARIAS3_LIBRARY_VERSION]) + +LT_PREREQ([2.2]) +LT_INIT +LT_LANG([C]) + +AX_PLATFORM + +CONFIG_EXTRA + +# Checks for programs. +AC_CHECK_PROGS([LSB_RELEASE],[lsb_release]) +AC_CHECK_PROGS([DPKG_GENSYMBOLS],[dpkg-gensymbols]) +AX_PROG_SPHINX_BUILD(,[AC_MSG_WARN([sphinx-build version 1.0 or greater is required to build man pages])]) +#AX_PROG_VALGRIND([memcheck],[--error-exitcode=1 --leak-check=yes --track-fds=yes --malloc-fill=A5 --free-fill=DE --suppressions=valgrind.supp]) +AC_CHECK_PROGS([RPMBUILD],[rpmbuild --nocheck]) +AC_CHECK_PROGS([RPMDEV_SETUPTREE],[rpmdev-setuptree]) +AC_CHECK_PROGS([RPM],[rpm]) + +# Checks for typedefs, structures, and compiler characteristics. 
+ +PKG_CHECK_MODULES([LIBCURL], [libcurl >= 7.0], [ax_cv_libcurl=yes], [AC_MSG_ERROR(could not find a suitable version of libcurl)]) +LT_LIB_M + +AX_ENDIAN +AX_HEX_VERSION([LIBMARIAS3],[$VERSION]) +AC_SUBST([RPM_RELEASE],[1]) + +AX_HARDEN_COMPILER_FLAGS + +AC_DEFINE_UNQUOTED([C_COMPILER_VENDOR],["$ax_cv_c_compiler_vendor"],[Compiler vendor]) +AC_DEFINE_UNQUOTED([CC],["$CC"],[Compiler information for CC compiler]) +AC_DEFINE_UNQUOTED([CC_VERSION],["$CC_VERSION"],[Version information for CC compiler]) +AC_DEFINE_UNQUOTED([CFLAGS],["$CFLAGS"],[CFLAGS used for compiling binary]) + +AS_IF([test -n "${LSB_RELEASE}"],[ + LSB_DESCRIPTION="`lsb_release -d -s`" + ],[LSB_DESCRIPTION="unknown"]) + +AX_AM_JOBSERVER([yes]) + +AC_CONFIG_FILES([Makefile + rpm/libmarias3.spec + version.h + libmarias3.pc + ]) + +AX_CREATE_GENERIC_CONFIG + +AC_OUTPUT + +echo "---" +echo "Configuration summary for $PACKAGE_NAME version $VERSION" +echo "" +echo " * Installation prefix: $prefix" +echo " * LSB Release: $LSB_RELEASE" +echo " * System type: $host_vendor-$host_os" +echo " * Host CPU: $host_cpu" +echo " * Compiler Vendor: $ax_cv_c_compiler_vendor" +echo " * C Compiler: $CC" +echo " * C Compiler Version: $ax_c_compiler_version" +echo " * C Flags: $CFLAGS" +echo " * LIBS Flags: $LIBS" +echo " * LDFLAGS Flags: $LDFLAGS" +echo " * Assertions enabled: $ax_enable_assert" +echo " * Debug enabled: $ax_enable_debug" +echo " * Warnings as failure: $ac_cv_warnings_as_errors" +echo " * make -j: $enable_jobserver" +echo " * VCS checkout: $ac_cv_vcs_system" +echo "" +echo "---" diff --git a/storage/maria/libmarias3/debian/changelog b/storage/maria/libmarias3/debian/changelog new file mode 100644 index 00000000..601d5726 --- /dev/null +++ b/storage/maria/libmarias3/debian/changelog @@ -0,0 +1,6 @@ +marias3 (1:1.0.1) unstable; urgency=low + + * Add packaging support + * Fix memory leaks + + -- Andrew Hutchings <linuxjedi@mariadb.com> Mon, 25 Mar 2019 12:24:20 +0000 diff --git 
a/storage/maria/libmarias3/debian/compat b/storage/maria/libmarias3/debian/compat new file mode 100644 index 00000000..ec635144 --- /dev/null +++ b/storage/maria/libmarias3/debian/compat @@ -0,0 +1 @@ +9 diff --git a/storage/maria/libmarias3/debian/control b/storage/maria/libmarias3/debian/control new file mode 100644 index 00000000..80f7eb15 --- /dev/null +++ b/storage/maria/libmarias3/debian/control @@ -0,0 +1,14 @@ +Source: marias3 +Maintainer: Andrew Hutchings <linuxjedi@mariadb.com> + +Package: libmarias3 +Architecture: any +Section: libs +Depends: ${misc:Depends}, ${shlibs:Depends} +Description: a lightweight C library to read/write to AWS S3 buckets using objects in memory. + +Package: libmarias3-dev +Architecture: any +Section: libdevel +Depends: libmarias3 (= ${binary:Version}), ${misc:Depends} +Description: a lightweight C library to read/write to AWS S3 buckets using objects in memory. diff --git a/storage/maria/libmarias3/debian/copyright b/storage/maria/libmarias3/debian/copyright new file mode 100644 index 00000000..baef8418 --- /dev/null +++ b/storage/maria/libmarias3/debian/copyright @@ -0,0 +1,30 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: libmarias3 +Upstream-Contact: Andrew Hutchings <linuxjedi@mariadb.com> +Maintainer: Andrew Hutchings <linuxjedi@mariadb.com> +Source: https://github.com/mariadb-corporation/marias3 + +Files: * +Copyright: 2019 MariaDB Corporation Ab +License: LGPL-2.1 + +License: LGPL-2.1 + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + . + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + . + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + MA 02110-1301 USA + . + On Debian systems, the full text of the GNU Lesser General Public + License version 2.1 can be found in the file + '/usr/share/common-licenses/LGPL-2.1'. + diff --git a/storage/maria/libmarias3/debian/libmarias3-dev.install b/storage/maria/libmarias3/debian/libmarias3-dev.install new file mode 100644 index 00000000..b9985d9b --- /dev/null +++ b/storage/maria/libmarias3/debian/libmarias3-dev.install @@ -0,0 +1,2 @@ +usr/include/libmarias3 +usr/lib/x86_64-linux-gnu/pkgconfig/libmarias3.pc diff --git a/storage/maria/libmarias3/debian/libmarias3.install b/storage/maria/libmarias3/debian/libmarias3.install new file mode 100644 index 00000000..c25eddb4 --- /dev/null +++ b/storage/maria/libmarias3/debian/libmarias3.install @@ -0,0 +1,2 @@ +usr/lib/x86_64-linux-gnu/libmarias3.* +usr/bin/libmarias3-config diff --git a/storage/maria/libmarias3/debian/rules b/storage/maria/libmarias3/debian/rules new file mode 100755 index 00000000..2d33f6ac --- /dev/null +++ b/storage/maria/libmarias3/debian/rules @@ -0,0 +1,4 @@ +#!/usr/bin/make -f + +%: + dh $@ diff --git a/storage/maria/libmarias3/docs/_static/cc-symbol.png b/storage/maria/libmarias3/docs/_static/cc-symbol.png Binary files differnew file mode 100644 index 00000000..f0a944e0 --- /dev/null +++ b/storage/maria/libmarias3/docs/_static/cc-symbol.png diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/__init__.py b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/__init__.py new file mode 100644 index 00000000..1440863d --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/__init__.py @@ -0,0 +1,17 @@ +"""Sphinx ReadTheDocs theme. + +From https://github.com/ryan-roemer/sphinx-bootstrap-theme. 
+ +""" +import os + +VERSION = (0, 1, 5) + +__version__ = ".".join(str(v) for v in VERSION) +__version_full__ = __version__ + + +def get_html_theme_path(): + """Return list of HTML theme paths.""" + cur_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + return cur_dir diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/breadcrumbs.html b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/breadcrumbs.html new file mode 100644 index 00000000..ff0938e5 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/breadcrumbs.html @@ -0,0 +1,19 @@ +<div role="navigation" aria-label="breadcrumbs navigation"> + <ul class="wy-breadcrumbs"> + <li><a href="{{ pathto(master_doc) }}">Docs</a> »</li> + {% for doc in parents %} + <li><a href="{{ doc.link|e }}">{{ doc.title }}</a> »</li> + {% endfor %} + <li>{{ title }}</li> + <li class="wy-breadcrumbs-aside"> + {% if display_github %} + <a href="https://github.com/{{ github_user }}/{{ github_repo }}/blob/{{ github_version }}{{ conf_py_path }}{{ pagename }}{{ source_suffix }}" class="fa fa-github"> Edit on GitHub</a> + {% elif display_bitbucket %} + <a href="https://bitbucket.org/{{ bitbucket_user }}/{{ bitbucket_repo }}/src/{{ bitbucket_version}}{{ conf_py_path }}{{ pagename }}{{ source_suffix }}" class="fa fa-bitbucket"> Edit on Bitbucket</a> + {% elif show_source and has_source and sourcename %} + <a href="{{ pathto('_sources/' + sourcename, true)|e }}" rel="nofollow"> View page source</a> + {% endif %} + </li> + </ul> + <hr/> +</div> diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/footer.html b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/footer.html new file mode 100644 index 00000000..3c7afed3 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/footer.html @@ -0,0 +1,32 @@ +<footer> + {% if next or prev %} + <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation"> + {% if next %} + <a href="{{ 
next.link|e }}" class="btn btn-neutral float-right" title="{{ next.title|striptags|e }}">Next <span class="fa fa-arrow-circle-right"></span></a> + {% endif %} + {% if prev %} + <a href="{{ prev.link|e }}" class="btn btn-neutral" title="{{ prev.title|striptags|e }}"><span class="fa fa-arrow-circle-left"></span> Previous</a> + {% endif %} + </div> + {% endif %} + + <hr/> + + <div role="contentinfo"> + <p> + {%- if show_copyright %} + {%- if hasdoc('copyright') %} + {% trans path=pathto('copyright'), copyright=copyright|e %}© <a href="{{ path }}">Copyright</a> {{ copyright }}.{% endtrans %} + {%- else %} + {% trans copyright=copyright|e %}© Copyright {{ copyright }}.{% endtrans %} + {%- endif %} + {%- endif %} + + {%- if last_updated %} + {% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %} + {%- endif %} + </p> + </div> + + {% trans %}<a href="https://github.com/snide/sphinx_rtd_theme">Sphinx theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>{% endtrans %} +</footer> diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/layout.html b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/layout.html new file mode 100644 index 00000000..2e352605 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/layout.html @@ -0,0 +1,162 @@ +{# TEMPLATE VAR SETTINGS #} +{%- set url_root = pathto('', 1) %} +{%- if url_root == '#' %}{% set url_root = '' %}{% endif %} +{%- if not embedded and docstitle %} + {%- set titlesuffix = " — "|safe + docstitle|e %} +{%- else %} + {%- set titlesuffix = "" %} +{%- endif %} + +<!DOCTYPE html> +<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]--> +<head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + {% block htmltitle %} + <title>{{ title|striptags|e }}{{ titlesuffix }}</title> + {% endblock %} + + {# FAVICON #} + {% 
if favicon %} + <link rel="shortcut icon" href="{{ pathto('_static/' + favicon, 1) }}"/> + {% endif %} + + {# CSS #} + <link href='https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'> + + {# OPENSEARCH #} + {% if not embedded %} + {% if use_opensearch %} + <link rel="search" type="application/opensearchdescription+xml" title="{% trans docstitle=docstitle|e %}Search within {{ docstitle }}{% endtrans %}" href="{{ pathto('_static/opensearch.xml', 1) }}"/> + {% endif %} + + {% endif %} + + {# RTD hosts this file, so just load on non RTD builds #} + {% if not READTHEDOCS %} + <link rel="stylesheet" href="{{ pathto('_static/' + style, 1) }}" type="text/css" /> + {% endif %} + + {% for cssfile in css_files %} + <link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" /> + {% endfor %} + + {%- block linktags %} + {%- if hasdoc('about') %} + <link rel="author" title="{{ _('About these documents') }}" + href="{{ pathto('about') }}"/> + {%- endif %} + {%- if hasdoc('genindex') %} + <link rel="index" title="{{ _('Index') }}" + href="{{ pathto('genindex') }}"/> + {%- endif %} + {%- if hasdoc('search') %} + <link rel="search" title="{{ _('Search') }}" href="{{ pathto('search') }}"/> + {%- endif %} + {%- if hasdoc('copyright') %} + <link rel="copyright" title="{{ _('Copyright') }}" href="{{ pathto('copyright') }}"/> + {%- endif %} + <link rel="top" title="{{ docstitle|e }}" href="{{ pathto('index') }}"/> + {%- if parents %} + <link rel="up" title="{{ parents[-1].title|striptags|e }}" href="{{ parents[-1].link|e }}"/> + {%- endif %} + {%- if next %} + <link rel="next" title="{{ next.title|striptags|e }}" href="{{ next.link|e }}"/> + {%- endif %} + {%- if prev %} + <link rel="prev" title="{{ prev.title|striptags|e }}" href="{{ prev.link|e }}"/> + {%- endif %} + {%- endblock %} + {%- block extrahead %} {% endblock %} + + {# Keep modernizr in head - 
http://modernizr.com/docs/#installing #} + <script src="https://cdnjs.cloudflare.com/ajax/libs/modernizr/2.6.2/modernizr.min.js"></script> + +</head> + +<body class="wy-body-for-nav" role="document"> + + <div class="wy-grid-for-nav"> + + {# SIDE NAV, TOGGLES ON MOBILE #} + <nav data-toggle="wy-nav-shift" class="wy-nav-side"> + <div class="wy-side-nav-search"> + {% block sidebartitle %} + <a href="{{ pathto(master_doc) }}" class="fa fa-home"> {{ project }}</a> + {% endblock %} + {% include "searchbox.html" %} + </div> + + <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation"> + {% set toctree = toctree(maxdepth=2, collapse=False, includehidden=True) %} + {% if toctree %} + {{ toctree }} + {% else %} + <!-- Local TOC --> + <div class="local-toc">{{ toc }}</div> + {% endif %} + </div> + + </nav> + + <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"> + + {# MOBILE NAV, TRIGGLES SIDE NAV ON TOGGLE #} + <nav class="wy-nav-top" role="navigation" aria-label="top navigation"> + <i data-toggle="wy-nav-top" class="fa fa-bars"></i> + <a href="{{ pathto(master_doc) }}">{{ project }}</a> + </nav> + + + {# PAGE CONTENT #} + <div class="wy-nav-content"> + <div class="rst-content"> + {% include "breadcrumbs.html" %} + <div role="main"> + {% block body %}{% endblock %} + </div> + {% include "footer.html" %} + </div> + </div> + + </section> + + </div> + {% include "versions.html" %} + + {% if not embedded %} + + <script type="text/javascript"> + var DOCUMENTATION_OPTIONS = { + URL_ROOT:'{{ url_root }}', + VERSION:'{{ release|e }}', + COLLAPSE_INDEX:false, + FILE_SUFFIX:'{{ '' if no_search_suffix else file_suffix }}', + HAS_SOURCE: {{ has_source|lower }} + }; + </script> + {%- for scriptfile in script_files %} + <script type="text/javascript" src="{{ pathto(scriptfile, 1) }}"></script> + {%- endfor %} + + {% endif %} + + {# RTD hosts this file, so just load on non RTD builds #} + {% if not READTHEDOCS %} + <script 
type="text/javascript" src="{{ pathto('_static/js/theme.js', 1) }}"></script> + {% endif %} + + {# STICKY NAVIGATION #} + {% if theme_sticky_navigation %} + <script type="text/javascript"> + jQuery(function () { + SphinxRtdTheme.StickyNav.enable(); + }); + </script> + {% endif %} + + {%- block footer %} {% endblock %} + +</body> +</html> diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/layout_old.html b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/layout_old.html new file mode 100644 index 00000000..deb8df2a --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/layout_old.html @@ -0,0 +1,205 @@ +{# + basic/layout.html + ~~~~~~~~~~~~~~~~~ + + Master layout template for Sphinx themes. + + :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS. + :license: BSD, see LICENSE for details. +#} +{%- block doctype -%} +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +{%- endblock %} +{%- set reldelim1 = reldelim1 is not defined and ' »' or reldelim1 %} +{%- set reldelim2 = reldelim2 is not defined and ' |' or reldelim2 %} +{%- set render_sidebar = (not embedded) and (not theme_nosidebar|tobool) and + (sidebars != []) %} +{%- set url_root = pathto('', 1) %} +{# XXX necessary? 
#} +{%- if url_root == '#' %}{% set url_root = '' %}{% endif %} +{%- if not embedded and docstitle %} + {%- set titlesuffix = " — "|safe + docstitle|e %} +{%- else %} + {%- set titlesuffix = "" %} +{%- endif %} + +{%- macro relbar() %} + <div class="related"> + <h3>{{ _('Navigation') }}</h3> + <ul> + {%- for rellink in rellinks %} + <li class="right" {% if loop.first %}style="margin-right: 10px"{% endif %}> + <a href="{{ pathto(rellink[0]) }}" title="{{ rellink[1]|striptags|e }}" + {{ accesskey(rellink[2]) }}>{{ rellink[3] }}</a> + {%- if not loop.first %}{{ reldelim2 }}{% endif %}</li> + {%- endfor %} + {%- block rootrellink %} + <li><a href="{{ pathto(master_doc) }}">{{ shorttitle|e }}</a>{{ reldelim1 }}</li> + {%- endblock %} + {%- for parent in parents %} + <li><a href="{{ parent.link|e }}" {% if loop.last %}{{ accesskey("U") }}{% endif %}>{{ parent.title }}</a>{{ reldelim1 }}</li> + {%- endfor %} + {%- block relbaritems %} {% endblock %} + </ul> + </div> +{%- endmacro %} + +{%- macro sidebar() %} + {%- if render_sidebar %} + <div class="sphinxsidebar"> + <div class="sphinxsidebarwrapper"> + {%- block sidebarlogo %} + {%- if logo %} + <p class="logo"><a href="{{ pathto(master_doc) }}"> + <img class="logo" src="{{ pathto('_static/' + logo, 1) }}" alt="Logo"/> + </a></p> + {%- endif %} + {%- endblock %} + {%- if sidebars != None %} + {#- new style sidebar: explicitly include/exclude templates #} + {%- for sidebartemplate in sidebars %} + {%- include sidebartemplate %} + {%- endfor %} + {%- else %} + {#- old style sidebars: using blocks -- should be deprecated #} + {%- block sidebartoc %} + {%- include "localtoc.html" %} + {%- endblock %} + {%- block sidebarrel %} + {%- include "relations.html" %} + {%- endblock %} + {%- block sidebarsourcelink %} + {%- include "sourcelink.html" %} + {%- endblock %} + {%- if customsidebar %} + {%- include customsidebar %} + {%- endif %} + {%- block sidebarsearch %} + {%- include "searchbox.html" %} + {%- endblock %} + {%- endif %} 
+ </div> + </div> + {%- endif %} +{%- endmacro %} + +{%- macro script() %} + <script type="text/javascript"> + var DOCUMENTATION_OPTIONS = { + URL_ROOT: '{{ url_root }}', + VERSION: '{{ release|e }}', + COLLAPSE_INDEX: false, + FILE_SUFFIX: '{{ '' if no_search_suffix else file_suffix }}', + HAS_SOURCE: {{ has_source|lower }} + }; + </script> + {%- for scriptfile in script_files %} + <script type="text/javascript" src="{{ pathto(scriptfile, 1) }}"></script> + {%- endfor %} +{%- endmacro %} + +{%- macro css() %} + <link rel="stylesheet" href="{{ pathto('_static/' + style, 1) }}" type="text/css" /> + <link rel="stylesheet" href="{{ pathto('_static/pygments.css', 1) }}" type="text/css" /> + {%- for cssfile in css_files %} + <link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" /> + {%- endfor %} +{%- endmacro %} + +<html xmlns="http://www.w3.org/1999/xhtml"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset={{ encoding }}" /> + {{ metatags }} + {%- block htmltitle %} + <title>{{ title|striptags|e }}{{ titlesuffix }}</title> + {%- endblock %} + {{ css() }} + {%- if not embedded %} + {{ script() }} + {%- if use_opensearch %} + <link rel="search" type="application/opensearchdescription+xml" + title="{% trans docstitle=docstitle|e %}Search within {{ docstitle }}{% endtrans %}" + href="{{ pathto('_static/opensearch.xml', 1) }}"/> + {%- endif %} + {%- if favicon %} + <link rel="shortcut icon" href="{{ pathto('_static/' + favicon, 1) }}"/> + {%- endif %} + {%- endif %} +{%- block linktags %} + {%- if hasdoc('about') %} + <link rel="author" title="{{ _('About these documents') }}" href="{{ pathto('about') }}" /> + {%- endif %} + {%- if hasdoc('genindex') %} + <link rel="index" title="{{ _('Index') }}" href="{{ pathto('genindex') }}" /> + {%- endif %} + {%- if hasdoc('search') %} + <link rel="search" title="{{ _('Search') }}" href="{{ pathto('search') }}" /> + {%- endif %} + {%- if hasdoc('copyright') %} + <link rel="copyright" title="{{ 
_('Copyright') }}" href="{{ pathto('copyright') }}" /> + {%- endif %} + <link rel="top" title="{{ docstitle|e }}" href="{{ pathto('index') }}" /> + {%- if parents %} + <link rel="up" title="{{ parents[-1].title|striptags|e }}" href="{{ parents[-1].link|e }}" /> + {%- endif %} + {%- if next %} + <link rel="next" title="{{ next.title|striptags|e }}" href="{{ next.link|e }}" /> + {%- endif %} + {%- if prev %} + <link rel="prev" title="{{ prev.title|striptags|e }}" href="{{ prev.link|e }}" /> + {%- endif %} +{%- endblock %} +{%- block extrahead %} {% endblock %} + </head> + <body> +{%- block header %}{% endblock %} + +{%- block relbar1 %}{{ relbar() }}{% endblock %} + +{%- block content %} + {%- block sidebar1 %} {# possible location for sidebar #} {% endblock %} + + <div class="document"> + {%- block document %} + <div class="documentwrapper"> + {%- if render_sidebar %} + <div class="bodywrapper"> + {%- endif %} + <div class="body"> + {% block body %} {% endblock %} + </div> + {%- if render_sidebar %} + </div> + {%- endif %} + </div> + {%- endblock %} + + {%- block sidebar2 %}{{ sidebar() }}{% endblock %} + <div class="clearer"></div> + </div> +{%- endblock %} + +{%- block relbar2 %}{{ relbar() }}{% endblock %} + +{%- block footer %} + <div class="footer"> + {%- if show_copyright %} + {%- if hasdoc('copyright') %} + {% trans path=pathto('copyright'), copyright=copyright|e %}© <a href="{{ path }}">Copyright</a> {{ copyright }}.{% endtrans %} + {%- else %} + {% trans copyright=copyright|e %}© Copyright {{ copyright }}.{% endtrans %} + {%- endif %} + {%- endif %} + {%- if last_updated %} + {% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %} + {%- endif %} + {%- if show_sphinx %} + {% trans sphinx_version=sphinx_version|e %}Created using <a href="http://sphinx-doc.org/">Sphinx</a> {{ sphinx_version }}.{% endtrans %} + {%- endif %} + </div> + <p>asdf asdf asdf asdf 22</p> +{%- endblock %} + </body> +</html> + diff --git 
a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/search.html b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/search.html new file mode 100644 index 00000000..e3aa9b5c --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/search.html @@ -0,0 +1,50 @@ +{# + basic/search.html + ~~~~~~~~~~~~~~~~~ + + Template for the search page. + + :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS. + :license: BSD, see LICENSE for details. +#} +{%- extends "layout.html" %} +{% set title = _('Search') %} +{% set script_files = script_files + ['_static/searchtools.js'] %} +{% block footer %} + <script type="text/javascript"> + jQuery(function() { Search.loadIndex("{{ pathto('searchindex.js', 1) }}"); }); + </script> + {# this is used when loading the search index using $.ajax fails, + such as on Chrome for documents on localhost #} + <script type="text/javascript" id="searchindexloader"></script> + {{ super() }} +{% endblock %} +{% block body %} + <noscript> + <div id="fallback" class="admonition warning"> + <p class="last"> + {% trans %}Please activate JavaScript to enable the search + functionality.{% endtrans %} + </p> + </div> + </noscript> + + {% if search_performed %} + <h2>{{ _('Search Results') }}</h2> + {% if not search_results %} + <p>{{ _('Your search did not match any documents. 
Please make sure that all words are spelled correctly and that you\'ve selected enough categories.') }}</p> + {% endif %} + {% endif %} + <div id="search-results"> + {% if search_results %} + <ul> + {% for href, caption, context in search_results %} + <li> + <a href="{{ pathto(item.href) }}">{{ caption }}</a> + <p class="context">{{ context|e }}</p> + </li> + {% endfor %} + </ul> + {% endif %} + </div> +{% endblock %} diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/searchbox.html b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/searchbox.html new file mode 100644 index 00000000..24418d32 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/searchbox.html @@ -0,0 +1,7 @@ +<div role="search"> + <form id ="rtd-search-form" class="wy-form" action="{{ pathto('search') }}" method="get"> + <input type="text" name="q" placeholder="Search docs" /> + <input type="hidden" name="check_keywords" value="yes" /> + <input type="hidden" name="area" value="default" /> + </form> +</div> diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/css/badge_only.css b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/css/badge_only.css new file mode 100644 index 00000000..4868a002 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/css/badge_only.css @@ -0,0 +1 @@ +.fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-weight:normal;font-style:normal;src:url("../font/fontawesome_webfont.eot");src:url("../font/fontawesome_webfont.eot?#iefix") format("embedded-opentype"),url("../font/fontawesome_webfont.woff") format("woff"),url("../font/fontawesome_webfont.ttf") format("truetype"),url("../font/fontawesome_webfont.svg#FontAwesome") 
format("svg")}.fa:before{display:inline-block;font-family:FontAwesome;font-style:normal;font-weight:normal;line-height:1;text-decoration:inherit}a .fa{display:inline-block;text-decoration:inherit}li .fa{display:inline-block}li .fa-large:before,li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-0.8em}ul.fas li .fa{width:0.8em}ul.fas li .fa-large:before,ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before{content:"\f02d"}.icon-book:before{content:"\f02d"}.fa-caret-down:before{content:"\f0d7"}.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before{content:"\f0d8"}.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before{content:"\f0d9"}.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before{content:"\f0da"}.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;border-top:solid 10px #343131;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:before,.rst-versions .rst-current-version:after{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book{float:left}.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:gray;display:none}.rst-versions .rst-other-versions 
hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:solid 1px #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px}.rst-versions.rst-badge .icon-book{float:none}.rst-versions.rst-badge .fa-book{float:none}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book{float:left}.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge .rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width: 768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}img{width:100%;height:auto}} diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/css/theme.css b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/css/theme.css new file mode 100644 index 00000000..eb3f865f --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/css/theme.css @@ -0,0 +1,4 @@ +*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}audio:not([controls]){display:none}[hidden]{display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:hover,a:active{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:bold}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;color:#000;text-decoration:none}mark{background:#ff0;color:#000;font-style:italic;font-weight:bold}pre,code,.rst-content 
tt,kbd,samp{font-family:monospace,serif;_font-family:"courier new",monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:before,q:after{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-0.5em}sub{bottom:-0.25em}ul,ol,dl{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure{margin:0}form{margin:0}fieldset{border:0;margin:0;padding:0}label{cursor:pointer}legend{border:0;*margin-left:-7px;padding:0;white-space:normal}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type="button"],input[type="reset"],input[type="submit"]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type="checkbox"],input[type="radio"]{box-sizing:border-box;padding:0;*width:13px;*height:13px}input[type="search"]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}input[type="search"]::-webkit-search-decoration,input[type="search"]::-webkit-search-cancel-button{-webkit-appearance:none}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}textarea{overflow:auto;vertical-align:top;resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:0.2em 0;background:#ccc;color:#000;padding:0.2em 0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none !important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 
0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media print{html,body,section{background:none !important}*{box-shadow:none !important;text-shadow:none !important;filter:none !important;-ms-filter:none !important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h2,h3{page-break-after:avoid}}.fa:before,.rst-content .admonition-title:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content dl dt .headerlink:before,.icon:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-alert,.rst-content .note,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .warning,.rst-content .seealso,.rst-content .admonition-todo,.btn,input[type="text"],input[type="password"],input[type="email"],input[type="url"],input[type="date"],input[type="month"],input[type="time"],input[type="datetime"],input[type="datetime-local"],input[type="week"],input[type="number"],input[type="search"],input[type="tel"],input[type="color"],select,textarea,.wy-menu-vertical li.on 
a,.wy-menu-vertical li.current>a,.wy-side-nav-search>a,.wy-side-nav-search .wy-dropdown>a,.wy-nav-top a{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;content:""}.clearfix:after{clear:both}/*! + * Font Awesome 4.1.0 by @davegandy - http://fontawesome.io - @fontawesome + * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) + */@font-face{font-family:'FontAwesome';src:url("../fonts/fontawesome-webfont.eot?v=4.1.0");src:url("../fonts/fontawesome-webfont.eot?#iefix&v=4.1.0") format("embedded-opentype"),url("../fonts/fontawesome-webfont.woff?v=4.1.0") format("woff"),url("../fonts/fontawesome-webfont.ttf?v=4.1.0") format("truetype"),url("../fonts/fontawesome-webfont.svg?v=4.1.0#fontawesomeregular") format("svg");font-weight:normal;font-style:normal}.fa,.rst-content .admonition-title,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content dl dt .headerlink,.icon{display:inline-block;font-family:FontAwesome;font-style:normal;font-weight:normal;line-height:1;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:0.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:0.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:solid 0.08em #eee;border-radius:.1em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.rst-content .pull-left.admonition-title,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 
.pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content dl dt .pull-left.headerlink,.pull-left.icon{margin-right:.3em}.fa.pull-right,.rst-content .pull-right.admonition-title,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content dl dt .pull-right.headerlink,.pull-right.icon{margin-left:.3em}.fa-spin{-webkit-animation:spin 2s infinite linear;-moz-animation:spin 2s infinite linear;-o-animation:spin 2s infinite linear;animation:spin 2s infinite linear}@-moz-keyframes spin{0%{-moz-transform:rotate(0deg)}100%{-moz-transform:rotate(359deg)}}@-webkit-keyframes spin{0%{-webkit-transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg)}}@-o-keyframes spin{0%{-o-transform:rotate(0deg)}100%{-o-transform:rotate(359deg)}}@keyframes spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}100%{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{filter:progid:DXImageTransform.Microsoft.BasicImage(rotation=1);-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{filter:progid:DXImageTransform.Microsoft.BasicImage(rotation=2);-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{filter:progid:DXImageTransform.Microsoft.BasicImage(rotation=3);-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{filter:progid:DXImageTransform.Microsoft.BasicImage(rotation=0);-webkit-transform:scale(-1, 1);-moz-transform:scale(-1, 1);-ms-transform:scale(-1, 1);-o-transform:scale(-1, 1);transform:scale(-1, 
1)}.fa-flip-vertical{filter:progid:DXImageTransform.Microsoft.BasicImage(rotation=2);-webkit-transform:scale(1, -1);-moz-transform:scale(1, -1);-ms-transform:scale(1, -1);-o-transform:scale(1, -1);transform:scale(1, -1)}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:"\f000"}.fa-music:before{content:"\f001"}.fa-search:before,.icon-search:before{content:"\f002"}.fa-envelope-o:before{content:"\f003"}.fa-heart:before{content:"\f004"}.fa-star:before{content:"\f005"}.fa-star-o:before{content:"\f006"}.fa-user:before{content:"\f007"}.fa-film:before{content:"\f008"}.fa-th-large:before{content:"\f009"}.fa-th:before{content:"\f00a"}.fa-th-list:before{content:"\f00b"}.fa-check:before{content:"\f00c"}.fa-times:before{content:"\f00d"}.fa-search-plus:before{content:"\f00e"}.fa-search-minus:before{content:"\f010"}.fa-power-off:before{content:"\f011"}.fa-signal:before{content:"\f012"}.fa-gear:before,.fa-cog:before{content:"\f013"}.fa-trash-o:before{content:"\f014"}.fa-home:before,.icon-home:before{content:"\f015"}.fa-file-o:before{content:"\f016"}.fa-clock-o:before{content:"\f017"}.fa-road:before{content:"\f018"}.fa-download:before{content:"\f019"}.fa-arrow-circle-o-down:before{content:"\f01a"}.fa-arrow-circle-o-up:before{content:"\f01b"}.fa-inbox:before{content:"\f01c"}.fa-play-circle-o:before{content:"\f01d"}.fa-rotate-right:before,.fa-repeat:before{content:"\f01e"}.fa-refresh:before{content:"\f021"}.fa-list-alt:before{content:"\f022"}.fa-lock:before{content:"\f023"}.fa-flag:before{content:"\f024"}.fa-headphones:before{content:"\f025"}.fa-volume-off:before{content:"\f026"}.fa-volume-down:before{content:"\f027"}.fa-volume-up:before{content:"\f028"}.fa-qrcode:before{content:"\f029"}.fa-barcode:before{content:"\f02a"}.fa-tag:before{content
:"\f02b"}.fa-tags:before{content:"\f02c"}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-bookmark:before{content:"\f02e"}.fa-print:before{content:"\f02f"}.fa-camera:before{content:"\f030"}.fa-font:before{content:"\f031"}.fa-bold:before{content:"\f032"}.fa-italic:before{content:"\f033"}.fa-text-height:before{content:"\f034"}.fa-text-width:before{content:"\f035"}.fa-align-left:before{content:"\f036"}.fa-align-center:before{content:"\f037"}.fa-align-right:before{content:"\f038"}.fa-align-justify:before{content:"\f039"}.fa-list:before{content:"\f03a"}.fa-dedent:before,.fa-outdent:before{content:"\f03b"}.fa-indent:before{content:"\f03c"}.fa-video-camera:before{content:"\f03d"}.fa-photo:before,.fa-image:before,.fa-picture-o:before{content:"\f03e"}.fa-pencil:before{content:"\f040"}.fa-map-marker:before{content:"\f041"}.fa-adjust:before{content:"\f042"}.fa-tint:before{content:"\f043"}.fa-edit:before,.fa-pencil-square-o:before{content:"\f044"}.fa-share-square-o:before{content:"\f045"}.fa-check-square-o:before{content:"\f046"}.fa-arrows:before{content:"\f047"}.fa-step-backward:before{content:"\f048"}.fa-fast-backward:before{content:"\f049"}.fa-backward:before{content:"\f04a"}.fa-play:before{content:"\f04b"}.fa-pause:before{content:"\f04c"}.fa-stop:before{content:"\f04d"}.fa-forward:before{content:"\f04e"}.fa-fast-forward:before{content:"\f050"}.fa-step-forward:before{content:"\f051"}.fa-eject:before{content:"\f052"}.fa-chevron-left:before{content:"\f053"}.fa-chevron-right:before{content:"\f054"}.fa-plus-circle:before{content:"\f055"}.fa-minus-circle:before{content:"\f056"}.fa-times-circle:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before{content:"\f057"}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success 
.wy-input-context:before{content:"\f058"}.fa-question-circle:before{content:"\f059"}.fa-info-circle:before{content:"\f05a"}.fa-crosshairs:before{content:"\f05b"}.fa-times-circle-o:before{content:"\f05c"}.fa-check-circle-o:before{content:"\f05d"}.fa-ban:before{content:"\f05e"}.fa-arrow-left:before{content:"\f060"}.fa-arrow-right:before{content:"\f061"}.fa-arrow-up:before{content:"\f062"}.fa-arrow-down:before{content:"\f063"}.fa-mail-forward:before,.fa-share:before{content:"\f064"}.fa-expand:before{content:"\f065"}.fa-compress:before{content:"\f066"}.fa-plus:before{content:"\f067"}.fa-minus:before{content:"\f068"}.fa-asterisk:before{content:"\f069"}.fa-exclamation-circle:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.rst-content .admonition-title:before{content:"\f06a"}.fa-gift:before{content:"\f06b"}.fa-leaf:before{content:"\f06c"}.fa-fire:before,.icon-fire:before{content:"\f06d"}.fa-eye:before{content:"\f06e"}.fa-eye-slash:before{content:"\f070"}.fa-warning:before,.fa-exclamation-triangle:before{content:"\f071"}.fa-plane:before{content:"\f072"}.fa-calendar:before{content:"\f073"}.fa-random:before{content:"\f074"}.fa-comment:before{content:"\f075"}.fa-magnet:before{content:"\f076"}.fa-chevron-up:before{content:"\f077"}.fa-chevron-down:before{content:"\f078"}.fa-retweet:before{content:"\f079"}.fa-shopping-cart:before{content:"\f07a"}.fa-folder:before{content:"\f07b"}.fa-folder-open:before{content:"\f07c"}.fa-arrows-v:before{content:"\f07d"}.fa-arrows-h:before{content:"\f07e"}.fa-bar-chart-o:before{content:"\f080"}.fa-twitter-square:before{content:"\f081"}.fa-facebook-square:before{content:"\f082"}.fa-camera-retro:before{content:"\f083"}.fa-key:before{content:"\f084"}.fa-gears:before,.fa-cogs:before{content:"\f085"}.fa-comments:before{content:"\f086"}.fa-thumbs-o-up:before{content:"\f087"}.fa-thumbs-o-down:before{content:"\f088"}.fa-star-half:before{content:"\f089"}.fa
-heart-o:before{content:"\f08a"}.fa-sign-out:before{content:"\f08b"}.fa-linkedin-square:before{content:"\f08c"}.fa-thumb-tack:before{content:"\f08d"}.fa-external-link:before{content:"\f08e"}.fa-sign-in:before{content:"\f090"}.fa-trophy:before{content:"\f091"}.fa-github-square:before{content:"\f092"}.fa-upload:before{content:"\f093"}.fa-lemon-o:before{content:"\f094"}.fa-phone:before{content:"\f095"}.fa-square-o:before{content:"\f096"}.fa-bookmark-o:before{content:"\f097"}.fa-phone-square:before{content:"\f098"}.fa-twitter:before{content:"\f099"}.fa-facebook:before{content:"\f09a"}.fa-github:before,.icon-github:before{content:"\f09b"}.fa-unlock:before{content:"\f09c"}.fa-credit-card:before{content:"\f09d"}.fa-rss:before{content:"\f09e"}.fa-hdd-o:before{content:"\f0a0"}.fa-bullhorn:before{content:"\f0a1"}.fa-bell:before{content:"\f0f3"}.fa-certificate:before{content:"\f0a3"}.fa-hand-o-right:before{content:"\f0a4"}.fa-hand-o-left:before{content:"\f0a5"}.fa-hand-o-up:before{content:"\f0a6"}.fa-hand-o-down:before{content:"\f0a7"}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:"\f0a8"}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:"\f0a9"}.fa-arrow-circle-up:before{content:"\f0aa"}.fa-arrow-circle-down:before{content:"\f0ab"}.fa-globe:before{content:"\f0ac"}.fa-wrench:before{content:"\f0ad"}.fa-tasks:before{content:"\f0ae"}.fa-filter:before{content:"\f0b0"}.fa-briefcase:before{content:"\f0b1"}.fa-arrows-alt:before{content:"\f0b2"}.fa-group:before,.fa-users:before{content:"\f0c0"}.fa-chain:before,.fa-link:before,.icon-link:before{content:"\f0c1"}.fa-cloud:before{content:"\f0c2"}.fa-flask:before{content:"\f0c3"}.fa-cut:before,.fa-scissors:before{content:"\f0c4"}.fa-copy:before,.fa-files-o:before{content:"\f0c5"}.fa-paperclip:before{content:"\f0c6"}.fa-save:before,.fa-floppy-o:before{content:"\f0c7"}.fa-square:before{content:"\f0c8"}.fa-navicon:before,.fa-reorder:before,.fa-bars:before{content:"\f0c9"}.fa-list-ul:before{content:"\f0
ca"}.fa-list-ol:before{content:"\f0cb"}.fa-strikethrough:before{content:"\f0cc"}.fa-underline:before{content:"\f0cd"}.fa-table:before{content:"\f0ce"}.fa-magic:before{content:"\f0d0"}.fa-truck:before{content:"\f0d1"}.fa-pinterest:before{content:"\f0d2"}.fa-pinterest-square:before{content:"\f0d3"}.fa-google-plus-square:before{content:"\f0d4"}.fa-google-plus:before{content:"\f0d5"}.fa-money:before{content:"\f0d6"}.fa-caret-down:before,.wy-dropdown .caret:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before{content:"\f0d8"}.fa-caret-left:before{content:"\f0d9"}.fa-caret-right:before{content:"\f0da"}.fa-columns:before{content:"\f0db"}.fa-unsorted:before,.fa-sort:before{content:"\f0dc"}.fa-sort-down:before,.fa-sort-desc:before{content:"\f0dd"}.fa-sort-up:before,.fa-sort-asc:before{content:"\f0de"}.fa-envelope:before{content:"\f0e0"}.fa-linkedin:before{content:"\f0e1"}.fa-rotate-left:before,.fa-undo:before{content:"\f0e2"}.fa-legal:before,.fa-gavel:before{content:"\f0e3"}.fa-dashboard:before,.fa-tachometer:before{content:"\f0e4"}.fa-comment-o:before{content:"\f0e5"}.fa-comments-o:before{content:"\f0e6"}.fa-flash:before,.fa-bolt:before{content:"\f0e7"}.fa-sitemap:before{content:"\f0e8"}.fa-umbrella:before{content:"\f0e9"}.fa-paste:before,.fa-clipboard:before{content:"\f0ea"}.fa-lightbulb-o:before{content:"\f0eb"}.fa-exchange:before{content:"\f0ec"}.fa-cloud-download:before{content:"\f0ed"}.fa-cloud-upload:before{content:"\f0ee"}.fa-user-md:before{content:"\f0f0"}.fa-stethoscope:before{content:"\f0f1"}.fa-suitcase:before{content:"\f0f2"}.fa-bell-o:before{content:"\f0a2"}.fa-coffee:before{content:"\f0f4"}.fa-cutlery:before{content:"\f0f5"}.fa-file-text-o:before{content:"\f0f6"}.fa-building-o:before{content:"\f0f7"}.fa-hospital-o:before{content:"\f0f8"}.fa-ambulance:before{content:"\f0f9"}.fa-medkit:before{content:"\f0fa"}.fa-fighter-jet:before{content:"\f0fb"}.fa-beer:before{content:"\f0fc"}.fa-h-square:before{content:"\f0fd"}.fa-plus-square:before{content:"\f0
fe"}.fa-angle-double-left:before{content:"\f100"}.fa-angle-double-right:before{content:"\f101"}.fa-angle-double-up:before{content:"\f102"}.fa-angle-double-down:before{content:"\f103"}.fa-angle-left:before{content:"\f104"}.fa-angle-right:before{content:"\f105"}.fa-angle-up:before{content:"\f106"}.fa-angle-down:before{content:"\f107"}.fa-desktop:before{content:"\f108"}.fa-laptop:before{content:"\f109"}.fa-tablet:before{content:"\f10a"}.fa-mobile-phone:before,.fa-mobile:before{content:"\f10b"}.fa-circle-o:before{content:"\f10c"}.fa-quote-left:before{content:"\f10d"}.fa-quote-right:before{content:"\f10e"}.fa-spinner:before{content:"\f110"}.fa-circle:before{content:"\f111"}.fa-mail-reply:before,.fa-reply:before{content:"\f112"}.fa-github-alt:before{content:"\f113"}.fa-folder-o:before{content:"\f114"}.fa-folder-open-o:before{content:"\f115"}.fa-smile-o:before{content:"\f118"}.fa-frown-o:before{content:"\f119"}.fa-meh-o:before{content:"\f11a"}.fa-gamepad:before{content:"\f11b"}.fa-keyboard-o:before{content:"\f11c"}.fa-flag-o:before{content:"\f11d"}.fa-flag-checkered:before{content:"\f11e"}.fa-terminal:before{content:"\f120"}.fa-code:before{content:"\f121"}.fa-mail-reply-all:before,.fa-reply-all:before{content:"\f122"}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:"\f123"}.fa-location-arrow:before{content:"\f124"}.fa-crop:before{content:"\f125"}.fa-code-fork:before{content:"\f126"}.fa-unlink:before,.fa-chain-broken:before{content:"\f127"}.fa-question:before{content:"\f128"}.fa-info:before{content:"\f129"}.fa-exclamation:before{content:"\f12a"}.fa-superscript:before{content:"\f12b"}.fa-subscript:before{content:"\f12c"}.fa-eraser:before{content:"\f12d"}.fa-puzzle-piece:before{content:"\f12e"}.fa-microphone:before{content:"\f130"}.fa-microphone-slash:before{content:"\f131"}.fa-shield:before{content:"\f132"}.fa-calendar-o:before{content:"\f133"}.fa-fire-extinguisher:before{content:"\f134"}.fa-rocket:before{content:"\f135"}.fa-maxcdn:before{c
ontent:"\f136"}.fa-chevron-circle-left:before{content:"\f137"}.fa-chevron-circle-right:before{content:"\f138"}.fa-chevron-circle-up:before{content:"\f139"}.fa-chevron-circle-down:before{content:"\f13a"}.fa-html5:before{content:"\f13b"}.fa-css3:before{content:"\f13c"}.fa-anchor:before{content:"\f13d"}.fa-unlock-alt:before{content:"\f13e"}.fa-bullseye:before{content:"\f140"}.fa-ellipsis-h:before{content:"\f141"}.fa-ellipsis-v:before{content:"\f142"}.fa-rss-square:before{content:"\f143"}.fa-play-circle:before{content:"\f144"}.fa-ticket:before{content:"\f145"}.fa-minus-square:before{content:"\f146"}.fa-minus-square-o:before{content:"\f147"}.fa-level-up:before{content:"\f148"}.fa-level-down:before{content:"\f149"}.fa-check-square:before{content:"\f14a"}.fa-pencil-square:before{content:"\f14b"}.fa-external-link-square:before{content:"\f14c"}.fa-share-square:before{content:"\f14d"}.fa-compass:before{content:"\f14e"}.fa-toggle-down:before,.fa-caret-square-o-down:before{content:"\f150"}.fa-toggle-up:before,.fa-caret-square-o-up:before{content:"\f151"}.fa-toggle-right:before,.fa-caret-square-o-right:before{content:"\f152"}.fa-euro:before,.fa-eur:before{content:"\f153"}.fa-gbp:before{content:"\f154"}.fa-dollar:before,.fa-usd:before{content:"\f155"}.fa-rupee:before,.fa-inr:before{content:"\f156"}.fa-cny:before,.fa-rmb:before,.fa-yen:before,.fa-jpy:before{content:"\f157"}.fa-ruble:before,.fa-rouble:before,.fa-rub:before{content:"\f158"}.fa-won:before,.fa-krw:before{content:"\f159"}.fa-bitcoin:before,.fa-btc:before{content:"\f15a"}.fa-file:before{content:"\f15b"}.fa-file-text:before{content:"\f15c"}.fa-sort-alpha-asc:before{content:"\f15d"}.fa-sort-alpha-desc:before{content:"\f15e"}.fa-sort-amount-asc:before{content:"\f160"}.fa-sort-amount-desc:before{content:"\f161"}.fa-sort-numeric-asc:before{content:"\f162"}.fa-sort-numeric-desc:before{content:"\f163"}.fa-thumbs-up:before{content:"\f164"}.fa-thumbs-down:before{content:"\f165"}.fa-youtube-square:before{content:"\f166"}.fa-youtu
be:before{content:"\f167"}.fa-xing:before{content:"\f168"}.fa-xing-square:before{content:"\f169"}.fa-youtube-play:before{content:"\f16a"}.fa-dropbox:before{content:"\f16b"}.fa-stack-overflow:before{content:"\f16c"}.fa-instagram:before{content:"\f16d"}.fa-flickr:before{content:"\f16e"}.fa-adn:before{content:"\f170"}.fa-bitbucket:before,.icon-bitbucket:before{content:"\f171"}.fa-bitbucket-square:before{content:"\f172"}.fa-tumblr:before{content:"\f173"}.fa-tumblr-square:before{content:"\f174"}.fa-long-arrow-down:before{content:"\f175"}.fa-long-arrow-up:before{content:"\f176"}.fa-long-arrow-left:before{content:"\f177"}.fa-long-arrow-right:before{content:"\f178"}.fa-apple:before{content:"\f179"}.fa-windows:before{content:"\f17a"}.fa-android:before{content:"\f17b"}.fa-linux:before{content:"\f17c"}.fa-dribbble:before{content:"\f17d"}.fa-skype:before{content:"\f17e"}.fa-foursquare:before{content:"\f180"}.fa-trello:before{content:"\f181"}.fa-female:before{content:"\f182"}.fa-male:before{content:"\f183"}.fa-gittip:before{content:"\f184"}.fa-sun-o:before{content:"\f185"}.fa-moon-o:before{content:"\f186"}.fa-archive:before{content:"\f187"}.fa-bug:before{content:"\f188"}.fa-vk:before{content:"\f189"}.fa-weibo:before{content:"\f18a"}.fa-renren:before{content:"\f18b"}.fa-pagelines:before{content:"\f18c"}.fa-stack-exchange:before{content:"\f18d"}.fa-arrow-circle-o-right:before{content:"\f18e"}.fa-arrow-circle-o-left:before{content:"\f190"}.fa-toggle-left:before,.fa-caret-square-o-left:before{content:"\f191"}.fa-dot-circle-o:before{content:"\f192"}.fa-wheelchair:before{content:"\f193"}.fa-vimeo-square:before{content:"\f194"}.fa-turkish-lira:before,.fa-try:before{content:"\f195"}.fa-plus-square-o:before{content:"\f196"}.fa-space-shuttle:before{content:"\f197"}.fa-slack:before{content:"\f198"}.fa-envelope-square:before{content:"\f199"}.fa-wordpress:before{content:"\f19a"}.fa-openid:before{content:"\f19b"}.fa-institution:before,.fa-bank:before,.fa-university:before{content:"\f19c"}.fa-
mortar-board:before,.fa-graduation-cap:before{content:"\f19d"}.fa-yahoo:before{content:"\f19e"}.fa-google:before{content:"\f1a0"}.fa-reddit:before{content:"\f1a1"}.fa-reddit-square:before{content:"\f1a2"}.fa-stumbleupon-circle:before{content:"\f1a3"}.fa-stumbleupon:before{content:"\f1a4"}.fa-delicious:before{content:"\f1a5"}.fa-digg:before{content:"\f1a6"}.fa-pied-piper-square:before,.fa-pied-piper:before{content:"\f1a7"}.fa-pied-piper-alt:before{content:"\f1a8"}.fa-drupal:before{content:"\f1a9"}.fa-joomla:before{content:"\f1aa"}.fa-language:before{content:"\f1ab"}.fa-fax:before{content:"\f1ac"}.fa-building:before{content:"\f1ad"}.fa-child:before{content:"\f1ae"}.fa-paw:before{content:"\f1b0"}.fa-spoon:before{content:"\f1b1"}.fa-cube:before{content:"\f1b2"}.fa-cubes:before{content:"\f1b3"}.fa-behance:before{content:"\f1b4"}.fa-behance-square:before{content:"\f1b5"}.fa-steam:before{content:"\f1b6"}.fa-steam-square:before{content:"\f1b7"}.fa-recycle:before{content:"\f1b8"}.fa-automobile:before,.fa-car:before{content:"\f1b9"}.fa-cab:before,.fa-taxi:before{content:"\f1ba"}.fa-tree:before{content:"\f1bb"}.fa-spotify:before{content:"\f1bc"}.fa-deviantart:before{content:"\f1bd"}.fa-soundcloud:before{content:"\f1be"}.fa-database:before{content:"\f1c0"}.fa-file-pdf-o:before{content:"\f1c1"}.fa-file-word-o:before{content:"\f1c2"}.fa-file-excel-o:before{content:"\f1c3"}.fa-file-powerpoint-o:before{content:"\f1c4"}.fa-file-photo-o:before,.fa-file-picture-o:before,.fa-file-image-o:before{content:"\f1c5"}.fa-file-zip-o:before,.fa-file-archive-o:before{content:"\f1c6"}.fa-file-sound-o:before,.fa-file-audio-o:before{content:"\f1c7"}.fa-file-movie-o:before,.fa-file-video-o:before{content:"\f1c8"}.fa-file-code-o:before{content:"\f1c9"}.fa-vine:before{content:"\f1ca"}.fa-codepen:before{content:"\f1cb"}.fa-jsfiddle:before{content:"\f1cc"}.fa-life-bouy:before,.fa-life-saver:before,.fa-support:before,.fa-life-ring:before{content:"\f1cd"}.fa-circle-o-notch:before{content:"\f1ce"}.fa-ra:be
fore,.fa-rebel:before{content:"\f1d0"}.fa-ge:before,.fa-empire:before{content:"\f1d1"}.fa-git-square:before{content:"\f1d2"}.fa-git:before{content:"\f1d3"}.fa-hacker-news:before{content:"\f1d4"}.fa-tencent-weibo:before{content:"\f1d5"}.fa-qq:before{content:"\f1d6"}.fa-wechat:before,.fa-weixin:before{content:"\f1d7"}.fa-send:before,.fa-paper-plane:before{content:"\f1d8"}.fa-send-o:before,.fa-paper-plane-o:before{content:"\f1d9"}.fa-history:before{content:"\f1da"}.fa-circle-thin:before{content:"\f1db"}.fa-header:before{content:"\f1dc"}.fa-paragraph:before{content:"\f1dd"}.fa-sliders:before{content:"\f1de"}.fa-share-alt:before{content:"\f1e0"}.fa-share-alt-square:before{content:"\f1e1"}.fa-bomb:before{content:"\f1e2"}.fa,.rst-content .admonition-title,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content dl dt .headerlink,.icon,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context{font-family:inherit}.fa:before,.rst-content .admonition-title:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content dl dt .headerlink:before,.icon:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info 
.wy-input-context:before{font-family:"FontAwesome";display:inline-block;font-style:normal;font-weight:normal;line-height:1;text-decoration:inherit}a .fa,a .rst-content .admonition-title,.rst-content a .admonition-title,a .rst-content h1 .headerlink,.rst-content h1 a .headerlink,a .rst-content h2 .headerlink,.rst-content h2 a .headerlink,a .rst-content h3 .headerlink,.rst-content h3 a .headerlink,a .rst-content h4 .headerlink,.rst-content h4 a .headerlink,a .rst-content h5 .headerlink,.rst-content h5 a .headerlink,a .rst-content h6 .headerlink,.rst-content h6 a .headerlink,a .rst-content dl dt .headerlink,.rst-content dl dt a .headerlink,a .icon{display:inline-block;text-decoration:inherit}.btn .fa,.btn .rst-content .admonition-title,.rst-content .btn .admonition-title,.btn .rst-content h1 .headerlink,.rst-content h1 .btn .headerlink,.btn .rst-content h2 .headerlink,.rst-content h2 .btn .headerlink,.btn .rst-content h3 .headerlink,.rst-content h3 .btn .headerlink,.btn .rst-content h4 .headerlink,.rst-content h4 .btn .headerlink,.btn .rst-content h5 .headerlink,.rst-content h5 .btn .headerlink,.btn .rst-content h6 .headerlink,.rst-content h6 .btn .headerlink,.btn .rst-content dl dt .headerlink,.rst-content dl dt .btn .headerlink,.btn .icon,.nav .fa,.nav .rst-content .admonition-title,.rst-content .nav .admonition-title,.nav .rst-content h1 .headerlink,.rst-content h1 .nav .headerlink,.nav .rst-content h2 .headerlink,.rst-content h2 .nav .headerlink,.nav .rst-content h3 .headerlink,.rst-content h3 .nav .headerlink,.nav .rst-content h4 .headerlink,.rst-content h4 .nav .headerlink,.nav .rst-content h5 .headerlink,.rst-content h5 .nav .headerlink,.nav .rst-content h6 .headerlink,.rst-content h6 .nav .headerlink,.nav .rst-content dl dt .headerlink,.rst-content dl dt .nav .headerlink,.nav .icon{display:inline}.btn .fa.fa-large,.btn .rst-content .fa-large.admonition-title,.rst-content .btn .fa-large.admonition-title,.btn .rst-content h1 .fa-large.headerlink,.rst-content h1 
.btn .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.rst-content h4 .btn .fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.btn .rst-content dl dt .fa-large.headerlink,.rst-content dl dt .btn .fa-large.headerlink,.btn .fa-large.icon,.nav .fa.fa-large,.nav .rst-content .fa-large.admonition-title,.rst-content .nav .fa-large.admonition-title,.nav .rst-content h1 .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.nav .rst-content dl dt .fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.nav .fa-large.icon{line-height:0.9em}.btn .fa.fa-spin,.btn .rst-content .fa-spin.admonition-title,.rst-content .btn .fa-spin.admonition-title,.btn .rst-content h1 .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.btn .rst-content dl dt .fa-spin.headerlink,.rst-content dl dt .btn .fa-spin.headerlink,.btn 
.fa-spin.icon,.nav .fa.fa-spin,.nav .rst-content .fa-spin.admonition-title,.rst-content .nav .fa-spin.admonition-title,.nav .rst-content h1 .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.rst-content h5 .nav .fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.nav .rst-content dl dt .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.nav .fa-spin.icon{display:inline-block}.btn.fa:before,.rst-content .btn.admonition-title:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content dl dt .btn.headerlink:before,.btn.icon:before{opacity:0.5;-webkit-transition:opacity 0.05s ease-in;-moz-transition:opacity 0.05s ease-in;transition:opacity 0.05s ease-in}.btn.fa:hover:before,.rst-content .btn.admonition-title:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 .btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.btn.icon:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .rst-content .admonition-title:before,.rst-content .btn-mini .admonition-title:before,.btn-mini .rst-content h1 .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.rst-content h3 
.btn-mini .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.rst-content h4 .btn-mini .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.btn-mini .rst-content dl dt .headerlink:before,.rst-content dl dt .btn-mini .headerlink:before,.btn-mini .icon:before{font-size:14px;vertical-align:-15%}.wy-alert,.rst-content .note,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .warning,.rst-content .seealso,.rst-content .admonition-todo{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.wy-alert-title,.rst-content .admonition-title{color:#fff;font-weight:bold;display:block;color:#fff;background:#6ab0de;margin:-12px;padding:6px 12px;margin-bottom:12px}.wy-alert.wy-alert-danger,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.tip,.rst-content .wy-alert-danger.warning,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.admonition-todo{background:#fdf3f2}.wy-alert.wy-alert-danger .wy-alert-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .danger .wy-alert-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.wy-alert.wy-alert-danger 
.rst-content .admonition-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .danger .admonition-title,.rst-content .error .admonition-title,.rst-content .wy-alert-danger.hint .admonition-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content .wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title{background:#f29f97}.wy-alert.wy-alert-warning,.rst-content .wy-alert-warning.note,.rst-content .attention,.rst-content .caution,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.tip,.rst-content .warning,.rst-content .wy-alert-warning.seealso,.rst-content .admonition-todo{background:#ffedcc}.wy-alert.wy-alert-warning .wy-alert-title,.rst-content .wy-alert-warning.note .wy-alert-title,.rst-content .attention .wy-alert-title,.rst-content .caution .wy-alert-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .wy-alert-title,.rst-content .wy-alert-warning.important .wy-alert-title,.rst-content .wy-alert-warning.tip .wy-alert-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.seealso .wy-alert-title,.rst-content .admonition-todo .wy-alert-title,.wy-alert.wy-alert-warning .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-warning .admonition-title,.rst-content .wy-alert-warning.note .admonition-title,.rst-content .attention .admonition-title,.rst-content .caution .admonition-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content 
.wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.important .admonition-title,.rst-content .wy-alert-warning.tip .admonition-title,.rst-content .warning .admonition-title,.rst-content .wy-alert-warning.seealso .admonition-title,.rst-content .admonition-todo .admonition-title{background:#f0b37e}.wy-alert.wy-alert-info,.rst-content .note,.rst-content .wy-alert-info.attention,.rst-content .wy-alert-info.caution,.rst-content .wy-alert-info.danger,.rst-content .wy-alert-info.error,.rst-content .wy-alert-info.hint,.rst-content .wy-alert-info.important,.rst-content .wy-alert-info.tip,.rst-content .wy-alert-info.warning,.rst-content .seealso,.rst-content .wy-alert-info.admonition-todo{background:#e7f2fa}.wy-alert.wy-alert-info .wy-alert-title,.rst-content .note .wy-alert-title,.rst-content .wy-alert-info.attention .wy-alert-title,.rst-content .wy-alert-info.caution .wy-alert-title,.rst-content .wy-alert-info.danger .wy-alert-title,.rst-content .wy-alert-info.error .wy-alert-title,.rst-content .wy-alert-info.hint .wy-alert-title,.rst-content .wy-alert-info.important .wy-alert-title,.rst-content .wy-alert-info.tip .wy-alert-title,.rst-content .wy-alert-info.warning .wy-alert-title,.rst-content .seealso .wy-alert-title,.rst-content .wy-alert-info.admonition-todo .wy-alert-title,.wy-alert.wy-alert-info .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-info .admonition-title,.rst-content .note .admonition-title,.rst-content .wy-alert-info.attention .admonition-title,.rst-content .wy-alert-info.caution .admonition-title,.rst-content .wy-alert-info.danger .admonition-title,.rst-content .wy-alert-info.error .admonition-title,.rst-content .wy-alert-info.hint .admonition-title,.rst-content .wy-alert-info.important .admonition-title,.rst-content .wy-alert-info.tip .admonition-title,.rst-content .wy-alert-info.warning .admonition-title,.rst-content .seealso .admonition-title,.rst-content 
.wy-alert-info.admonition-todo .admonition-title{background:#6ab0de}.wy-alert.wy-alert-success,.rst-content .wy-alert-success.note,.rst-content .wy-alert-success.attention,.rst-content .wy-alert-success.caution,.rst-content .wy-alert-success.danger,.rst-content .wy-alert-success.error,.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .wy-alert-success.warning,.rst-content .wy-alert-success.seealso,.rst-content .wy-alert-success.admonition-todo{background:#dbfaf4}.wy-alert.wy-alert-success .wy-alert-title,.rst-content .wy-alert-success.note .wy-alert-title,.rst-content .wy-alert-success.attention .wy-alert-title,.rst-content .wy-alert-success.caution .wy-alert-title,.rst-content .wy-alert-success.danger .wy-alert-title,.rst-content .wy-alert-success.error .wy-alert-title,.rst-content .hint .wy-alert-title,.rst-content .important .wy-alert-title,.rst-content .tip .wy-alert-title,.rst-content .wy-alert-success.warning .wy-alert-title,.rst-content .wy-alert-success.seealso .wy-alert-title,.rst-content .wy-alert-success.admonition-todo .wy-alert-title,.wy-alert.wy-alert-success .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-success .admonition-title,.rst-content .wy-alert-success.note .admonition-title,.rst-content .wy-alert-success.attention .admonition-title,.rst-content .wy-alert-success.caution .admonition-title,.rst-content .wy-alert-success.danger .admonition-title,.rst-content .wy-alert-success.error .admonition-title,.rst-content .hint .admonition-title,.rst-content .important .admonition-title,.rst-content .tip .admonition-title,.rst-content .wy-alert-success.warning .admonition-title,.rst-content .wy-alert-success.seealso .admonition-title,.rst-content .wy-alert-success.admonition-todo .admonition-title{background:#1abc9c}.wy-alert.wy-alert-neutral,.rst-content .wy-alert-neutral.note,.rst-content .wy-alert-neutral.attention,.rst-content .wy-alert-neutral.caution,.rst-content .wy-alert-neutral.danger,.rst-content 
.wy-alert-neutral.error,.rst-content .wy-alert-neutral.hint,.rst-content .wy-alert-neutral.important,.rst-content .wy-alert-neutral.tip,.rst-content .wy-alert-neutral.warning,.rst-content .wy-alert-neutral.seealso,.rst-content .wy-alert-neutral.admonition-todo{background:#f3f6f6}.wy-alert.wy-alert-neutral .wy-alert-title,.rst-content .wy-alert-neutral.note .wy-alert-title,.rst-content .wy-alert-neutral.attention .wy-alert-title,.rst-content .wy-alert-neutral.caution .wy-alert-title,.rst-content .wy-alert-neutral.danger .wy-alert-title,.rst-content .wy-alert-neutral.error .wy-alert-title,.rst-content .wy-alert-neutral.hint .wy-alert-title,.rst-content .wy-alert-neutral.important .wy-alert-title,.rst-content .wy-alert-neutral.tip .wy-alert-title,.rst-content .wy-alert-neutral.warning .wy-alert-title,.rst-content .wy-alert-neutral.seealso .wy-alert-title,.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,.wy-alert.wy-alert-neutral .rst-content .admonition-title,.rst-content .wy-alert.wy-alert-neutral .admonition-title,.rst-content .wy-alert-neutral.note .admonition-title,.rst-content .wy-alert-neutral.attention .admonition-title,.rst-content .wy-alert-neutral.caution .admonition-title,.rst-content .wy-alert-neutral.danger .admonition-title,.rst-content .wy-alert-neutral.error .admonition-title,.rst-content .wy-alert-neutral.hint .admonition-title,.rst-content .wy-alert-neutral.important .admonition-title,.rst-content .wy-alert-neutral.tip .admonition-title,.rst-content .wy-alert-neutral.warning .admonition-title,.rst-content .wy-alert-neutral.seealso .admonition-title,.rst-content .wy-alert-neutral.admonition-todo .admonition-title{color:#404040;background:#e1e4e5}.wy-alert.wy-alert-neutral a,.rst-content .wy-alert-neutral.note a,.rst-content .wy-alert-neutral.attention a,.rst-content .wy-alert-neutral.caution a,.rst-content .wy-alert-neutral.danger a,.rst-content .wy-alert-neutral.error a,.rst-content .wy-alert-neutral.hint a,.rst-content 
.wy-alert-neutral.important a,.rst-content .wy-alert-neutral.tip a,.rst-content .wy-alert-neutral.warning a,.rst-content .wy-alert-neutral.seealso a,.rst-content .wy-alert-neutral.admonition-todo a{color:#2980b9}.wy-alert p:last-child,.rst-content .note p:last-child,.rst-content .attention p:last-child,.rst-content .caution p:last-child,.rst-content .danger p:last-child,.rst-content .error p:last-child,.rst-content .hint p:last-child,.rst-content .important p:last-child,.rst-content .tip p:last-child,.rst-content .warning p:last-child,.rst-content .seealso p:last-child,.rst-content .admonition-todo p:last-child{margin-bottom:0}.wy-tray-container{position:fixed;bottom:0px;left:0;z-index:600}.wy-tray-container li{display:block;width:300px;background:transparent;color:#fff;text-align:center;box-shadow:0 5px 5px 0 rgba(0,0,0,0.1);padding:0 24px;min-width:20%;opacity:0;height:0;line-height:56px;overflow:hidden;-webkit-transition:all 0.3s ease-in;-moz-transition:all 0.3s ease-in;transition:all 0.3s ease-in}.wy-tray-container li.wy-tray-item-success{background:#27ae60}.wy-tray-container li.wy-tray-item-info{background:#2980b9}.wy-tray-container li.wy-tray-item-warning{background:#e67e22}.wy-tray-container li.wy-tray-item-danger{background:#e74c3c}.wy-tray-container li.on{opacity:1;height:56px}@media screen and (max-width: 768px){.wy-tray-container{bottom:auto;top:0;width:100%}.wy-tray-container li{width:100%}}button{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;cursor:pointer;line-height:normal;-webkit-appearance:button;*overflow:visible}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}button[disabled]{cursor:default}.btn{display:inline-block;border-radius:2px;line-height:normal;white-space:nowrap;text-align:center;cursor:pointer;font-size:100%;padding:6px 12px 8px 12px;color:#fff;border:1px solid rgba(0,0,0,0.1);background-color:#27ae60;text-decoration:none;font-weight:normal;font-family:"Lato","proxima-nova","Helvetica 
Neue",Arial,sans-serif;box-shadow:0px 1px 2px -1px rgba(255,255,255,0.5) inset,0px -2px 0px 0px rgba(0,0,0,0.1) inset;outline-none:false;vertical-align:middle;*display:inline;zoom:1;-webkit-user-drag:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-transition:all 0.1s linear;-moz-transition:all 0.1s linear;transition:all 0.1s linear}.btn-hover{background:#2e8ece;color:#fff}.btn:hover{background:#2cc36b;color:#fff}.btn:focus{background:#2cc36b;outline:0}.btn:active{box-shadow:0px -1px 0px 0px rgba(0,0,0,0.05) inset,0px 2px 0px 0px rgba(0,0,0,0.1) inset;padding:8px 12px 6px 12px}.btn:visited{color:#fff}.btn:disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:0.4;cursor:not-allowed;box-shadow:none}.btn-disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:0.4;cursor:not-allowed;box-shadow:none}.btn-disabled:hover,.btn-disabled:focus,.btn-disabled:active{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:0.4;cursor:not-allowed;box-shadow:none}.btn::-moz-focus-inner{padding:0;border:0}.btn-small{font-size:80%}.btn-info{background-color:#2980b9 !important}.btn-info:hover{background-color:#2e8ece !important}.btn-neutral{background-color:#f3f6f6 !important;color:#404040 !important}.btn-neutral:hover{background-color:#e5ebeb !important;color:#404040}.btn-neutral:visited{color:#404040 !important}.btn-success{background-color:#27ae60 !important}.btn-success:hover{background-color:#295 !important}.btn-danger{background-color:#e74c3c !important}.btn-danger:hover{background-color:#ea6153 !important}.btn-warning{background-color:#e67e22 !important}.btn-warning:hover{background-color:#e98b39 !important}.btn-invert{background-color:#222}.btn-invert:hover{background-color:#2f2f2f 
!important}.btn-link{background-color:transparent !important;color:#2980b9;box-shadow:none;border-color:transparent !important}.btn-link:hover{background-color:transparent !important;color:#409ad5 !important;box-shadow:none}.btn-link:active{background-color:transparent !important;color:#409ad5 !important;box-shadow:none}.btn-link:visited{color:#9b59b6}.wy-btn-group .btn,.wy-control .btn{vertical-align:middle}.wy-btn-group{margin-bottom:24px;*zoom:1}.wy-btn-group:before,.wy-btn-group:after{display:table;content:""}.wy-btn-group:after{clear:both}.wy-dropdown{position:relative;display:inline-block}.wy-dropdown-menu{position:absolute;left:0;display:none;float:left;top:100%;min-width:100%;background:#fcfcfc;z-index:100;border:solid 1px #cfd7dd;box-shadow:0 2px 2px 0 rgba(0,0,0,0.1);padding:12px}.wy-dropdown-menu>dd>a{display:block;clear:both;color:#404040;white-space:nowrap;font-size:90%;padding:0 12px;cursor:pointer}.wy-dropdown-menu>dd>a:hover{background:#2980b9;color:#fff}.wy-dropdown-menu>dd.divider{border-top:solid 1px #cfd7dd;margin:6px 0}.wy-dropdown-menu>dd.search{padding-bottom:12px}.wy-dropdown-menu>dd.search input[type="search"]{width:100%}.wy-dropdown-menu>dd.call-to-action{background:#e3e3e3;text-transform:uppercase;font-weight:500;font-size:80%}.wy-dropdown-menu>dd.call-to-action:hover{background:#e3e3e3}.wy-dropdown-menu>dd.call-to-action .btn{color:#fff}.wy-dropdown.wy-dropdown-up .wy-dropdown-menu{bottom:100%;top:auto;left:auto;right:0}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu{background:#fcfcfc;margin-top:2px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a{padding:6px 12px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover{background:#2980b9;color:#fff}.wy-dropdown.wy-dropdown-left .wy-dropdown-menu{right:0;text-align:right}.wy-dropdown-arrow:before{content:" ";border-bottom:5px solid #f5f5f5;border-left:5px solid transparent;border-right:5px solid 
transparent;position:absolute;display:block;top:-4px;left:50%;margin-left:-3px}.wy-dropdown-arrow.wy-dropdown-arrow-left:before{left:11px}.wy-form-stacked select{display:block}.wy-form-aligned input,.wy-form-aligned textarea,.wy-form-aligned select,.wy-form-aligned .wy-help-inline,.wy-form-aligned label{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-form-aligned .wy-control-group>label{display:inline-block;vertical-align:middle;width:10em;margin:6px 12px 0 0;float:left}.wy-form-aligned .wy-control{float:left}.wy-form-aligned .wy-control label{display:block}.wy-form-aligned .wy-control select{margin-top:6px}fieldset{border:0;margin:0;padding:0}legend{display:block;width:100%;border:0;padding:0;white-space:normal;margin-bottom:24px;font-size:150%;*margin-left:-7px}label{display:block;margin:0 0 0.3125em 0;color:#999;font-size:90%}input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}.wy-control-group{margin-bottom:24px;*zoom:1;max-width:68em;margin-left:auto;margin-right:auto;*zoom:1}.wy-control-group:before,.wy-control-group:after{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group:before,.wy-control-group:after{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group.wy-control-group-required>label:after{content:" *";color:#e74c3c}.wy-control-group .wy-form-full,.wy-control-group .wy-form-halves,.wy-control-group .wy-form-thirds{padding-bottom:12px}.wy-control-group .wy-form-full select,.wy-control-group .wy-form-halves select,.wy-control-group .wy-form-thirds select{width:100%}.wy-control-group .wy-form-full input[type="text"],.wy-control-group .wy-form-full input[type="password"],.wy-control-group .wy-form-full input[type="email"],.wy-control-group .wy-form-full input[type="url"],.wy-control-group .wy-form-full input[type="date"],.wy-control-group .wy-form-full input[type="month"],.wy-control-group .wy-form-full input[type="time"],.wy-control-group 
.wy-form-full input[type="datetime"],.wy-control-group .wy-form-full input[type="datetime-local"],.wy-control-group .wy-form-full input[type="week"],.wy-control-group .wy-form-full input[type="number"],.wy-control-group .wy-form-full input[type="search"],.wy-control-group .wy-form-full input[type="tel"],.wy-control-group .wy-form-full input[type="color"],.wy-control-group .wy-form-halves input[type="text"],.wy-control-group .wy-form-halves input[type="password"],.wy-control-group .wy-form-halves input[type="email"],.wy-control-group .wy-form-halves input[type="url"],.wy-control-group .wy-form-halves input[type="date"],.wy-control-group .wy-form-halves input[type="month"],.wy-control-group .wy-form-halves input[type="time"],.wy-control-group .wy-form-halves input[type="datetime"],.wy-control-group .wy-form-halves input[type="datetime-local"],.wy-control-group .wy-form-halves input[type="week"],.wy-control-group .wy-form-halves input[type="number"],.wy-control-group .wy-form-halves input[type="search"],.wy-control-group .wy-form-halves input[type="tel"],.wy-control-group .wy-form-halves input[type="color"],.wy-control-group .wy-form-thirds input[type="text"],.wy-control-group .wy-form-thirds input[type="password"],.wy-control-group .wy-form-thirds input[type="email"],.wy-control-group .wy-form-thirds input[type="url"],.wy-control-group .wy-form-thirds input[type="date"],.wy-control-group .wy-form-thirds input[type="month"],.wy-control-group .wy-form-thirds input[type="time"],.wy-control-group .wy-form-thirds input[type="datetime"],.wy-control-group .wy-form-thirds input[type="datetime-local"],.wy-control-group .wy-form-thirds input[type="week"],.wy-control-group .wy-form-thirds input[type="number"],.wy-control-group .wy-form-thirds input[type="search"],.wy-control-group .wy-form-thirds input[type="tel"],.wy-control-group .wy-form-thirds input[type="color"]{width:100%}.wy-control-group 
.wy-form-full{display:block;float:left;margin-right:2.35765%;width:100%;margin-right:0}.wy-control-group .wy-form-full:last-child{margin-right:0}.wy-control-group .wy-form-halves{display:block;float:left;margin-right:2.35765%;width:48.82117%}.wy-control-group .wy-form-halves:last-child{margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(2n){margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(2n+1){clear:left}.wy-control-group .wy-form-thirds{display:block;float:left;margin-right:2.35765%;width:31.76157%}.wy-control-group .wy-form-thirds:last-child{margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n){margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n+1){clear:left}.wy-control-group.wy-control-group-no-input .wy-control{margin:6px 0 0 0;font-size:90%}.wy-control-no-input{display:inline-block;margin:6px 0 0 0;font-size:90%}.wy-control-group.fluid-input input[type="text"],.wy-control-group.fluid-input input[type="password"],.wy-control-group.fluid-input input[type="email"],.wy-control-group.fluid-input input[type="url"],.wy-control-group.fluid-input input[type="date"],.wy-control-group.fluid-input input[type="month"],.wy-control-group.fluid-input input[type="time"],.wy-control-group.fluid-input input[type="datetime"],.wy-control-group.fluid-input input[type="datetime-local"],.wy-control-group.fluid-input input[type="week"],.wy-control-group.fluid-input input[type="number"],.wy-control-group.fluid-input input[type="search"],.wy-control-group.fluid-input input[type="tel"],.wy-control-group.fluid-input input[type="color"]{width:100%}.wy-form-message-inline{display:inline-block;padding-left:0.3em;color:#666;vertical-align:middle;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:0.3125em;font-style:italic}input{line-height:normal}input[type="button"],input[type="reset"],input[type="submit"]{-webkit-appearance:button;cursor:pointer;font-family:"Lato","proxima-nova","Helvetica 
Neue",Arial,sans-serif;*overflow:visible}input[type="text"],input[type="password"],input[type="email"],input[type="url"],input[type="date"],input[type="month"],input[type="time"],input[type="datetime"],input[type="datetime-local"],input[type="week"],input[type="number"],input[type="search"],input[type="tel"],input[type="color"]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border 0.3s linear;-moz-transition:border 0.3s linear;transition:border 0.3s linear}input[type="datetime-local"]{padding:0.34375em 0.625em}input[disabled]{cursor:default}input[type="checkbox"],input[type="radio"]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;padding:0;margin-right:0.3125em;*height:13px;*width:13px}input[type="search"]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type="search"]::-webkit-search-cancel-button,input[type="search"]::-webkit-search-decoration{-webkit-appearance:none}input[type="text"]:focus,input[type="password"]:focus,input[type="email"]:focus,input[type="url"]:focus,input[type="date"]:focus,input[type="month"]:focus,input[type="time"]:focus,input[type="datetime"]:focus,input[type="datetime-local"]:focus,input[type="week"]:focus,input[type="number"]:focus,input[type="search"]:focus,input[type="tel"]:focus,input[type="color"]:focus{outline:0;outline:thin dotted \9;border-color:#333}input.no-focus:focus{border-color:#ccc !important}input[type="file"]:focus,input[type="radio"]:focus,input[type="checkbox"]:focus{outline:thin dotted #333;outline:1px auto 
#129fea}input[type="text"][disabled],input[type="password"][disabled],input[type="email"][disabled],input[type="url"][disabled],input[type="date"][disabled],input[type="month"][disabled],input[type="time"][disabled],input[type="datetime"][disabled],input[type="datetime-local"][disabled],input[type="week"][disabled],input[type="number"][disabled],input[type="search"][disabled],input[type="tel"][disabled],input[type="color"][disabled]{cursor:not-allowed;background-color:#f3f6f6;color:#cad2d3}input:focus:invalid,textarea:focus:invalid,select:focus:invalid{color:#e74c3c;border:1px solid #e74c3c}input:focus:invalid:focus,textarea:focus:invalid:focus,select:focus:invalid:focus{border-color:#e74c3c}input[type="file"]:focus:invalid:focus,input[type="radio"]:focus:invalid:focus,input[type="checkbox"]:focus:invalid:focus{outline-color:#e74c3c}input.wy-input-large{padding:12px;font-size:100%}textarea{overflow:auto;vertical-align:top;width:100%;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif}select,textarea{padding:0.5em 0.625em;display:inline-block;border:1px solid #ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border 0.3s linear;-moz-transition:border 0.3s linear;transition:border 0.3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}select:focus,textarea:focus{outline:0}select[disabled],textarea[disabled],input[readonly],select[readonly],textarea[readonly]{cursor:not-allowed;background-color:#fff;color:#cad2d3;border-color:transparent}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{padding:6px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:solid 1px #ccc;color:#999}.wy-input-suffix 
.wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#e74c3c}.wy-control-group.wy-control-group-error input[type="text"],.wy-control-group.wy-control-group-error input[type="password"],.wy-control-group.wy-control-group-error input[type="email"],.wy-control-group.wy-control-group-error input[type="url"],.wy-control-group.wy-control-group-error input[type="date"],.wy-control-group.wy-control-group-error input[type="month"],.wy-control-group.wy-control-group-error input[type="time"],.wy-control-group.wy-control-group-error input[type="datetime"],.wy-control-group.wy-control-group-error input[type="datetime-local"],.wy-control-group.wy-control-group-error input[type="week"],.wy-control-group.wy-control-group-error input[type="number"],.wy-control-group.wy-control-group-error input[type="search"],.wy-control-group.wy-control-group-error input[type="tel"],.wy-control-group.wy-control-group-error input[type="color"]{border:solid 1px #e74c3c}.wy-control-group.wy-control-group-error textarea{border:solid 1px #e74c3c}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:0.5em 0.625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27ae60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#e74c3c}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#e67e22}.wy-inline-validate.wy-inline-validate-info 
.wy-input-context{color:#2980b9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width: 480px){.wy-form button[type="submit"]{margin:0.7em 0 0}.wy-form input[type="text"],.wy-form input[type="password"],.wy-form input[type="email"],.wy-form input[type="url"],.wy-form input[type="date"],.wy-form input[type="month"],.wy-form input[type="time"],.wy-form input[type="datetime"],.wy-form input[type="datetime-local"],.wy-form input[type="week"],.wy-form input[type="number"],.wy-form input[type="search"],.wy-form input[type="tel"],.wy-form input[type="color"]{margin-bottom:0.3em;display:block}.wy-form label{margin-bottom:0.3em;display:block}.wy-form input[type="password"],.wy-form input[type="email"],.wy-form input[type="url"],.wy-form input[type="date"],.wy-form 
input[type="month"],.wy-form input[type="time"],.wy-form input[type="datetime"],.wy-form input[type="datetime-local"],.wy-form input[type="week"],.wy-form input[type="number"],.wy-form input[type="search"],.wy-form input[type="tel"],.wy-form input[type="color"]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:0.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0 0}.wy-form .wy-help-inline,.wy-form-message-inline,.wy-form-message{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width: 768px){.tablet-hide{display:none}}@media screen and (max-width: 480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.wy-table,.rst-content table.docutils,.rst-content table.field-list{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.wy-table caption,.rst-content table.docutils caption,.rst-content table.field-list caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.wy-table td,.rst-content table.docutils td,.rst-content table.field-list td,.wy-table th,.rst-content table.docutils th,.rst-content table.field-list th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.wy-table td:first-child,.rst-content table.docutils td:first-child,.rst-content table.field-list td:first-child,.wy-table th:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list th:first-child{border-left-width:0}.wy-table thead,.rst-content table.docutils thead,.rst-content table.field-list thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.wy-table thead th,.rst-content table.docutils thead th,.rst-content table.field-list thead th{font-weight:bold;border-bottom:solid 2px #e1e4e5}.wy-table td,.rst-content table.docutils td,.rst-content table.field-list td{background-color:transparent;vertical-align:middle}.wy-table td p,.rst-content table.docutils td p,.rst-content 
table.field-list td p{line-height:18px}.wy-table td p:last-child,.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child{margin-bottom:0}.wy-table .wy-table-cell-min,.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list .wy-table-cell-min{width:1%;padding-right:0}.wy-table .wy-table-cell-min input[type=checkbox],.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox],.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:gray;font-size:90%}.wy-table-tertiary{color:gray;font-size:80%}.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) td,.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td{background-color:#f3f6f6}.wy-table-backed{background-color:#f3f6f6}.wy-table-bordered-all,.rst-content table.docutils{border:1px solid #e1e4e5}.wy-table-bordered-all td,.rst-content table.docutils td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.wy-table-bordered-all tbody>tr:last-child td,.rst-content table.docutils tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px 0;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive table{margin-bottom:0 !important}.wy-table-responsive table td,.wy-table-responsive table 
th{white-space:nowrap}a{color:#2980b9;text-decoration:none}a:hover{color:#3091d1}a:visited{color:#9b59b6}html{height:100%;overflow-x:hidden}body{font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;font-weight:normal;color:#404040;min-height:100%;overflow-x:hidden;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#e67e22 !important}a.wy-text-warning:hover{color:#eb9950 !important}.wy-text-info{color:#2980b9 !important}a.wy-text-info:hover{color:#409ad5 !important}.wy-text-success{color:#27ae60 !important}a.wy-text-success:hover{color:#36d278 !important}.wy-text-danger{color:#e74c3c !important}a.wy-text-danger:hover{color:#ed7669 !important}.wy-text-neutral{color:#404040 !important}a.wy-text-neutral:hover{color:#595959 !important}h1,h2,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:"Roboto Slab","ff-tisa-web-pro","Georgia",Arial,sans-serif}p{line-height:24px;margin:0;font-size:16px;margin-bottom:24px}h1{font-size:175%}h2{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}code,.rst-content tt{white-space:nowrap;max-width:100%;background:#fff;border:solid 1px #e1e4e5;font-size:75%;padding:0 5px;font-family:Consolas,"Andale Mono WT","Andale Mono","Lucida Console","Lucida Sans Typewriter","DejaVu Sans Mono","Bitstream Vera Sans Mono","Liberation Mono","Nimbus Mono L",Monaco,"Courier New",Courier,monospace;color:#e74c3c;overflow-x:auto}code.code-large,.rst-content tt.code-large{font-size:90%}.wy-plain-list-disc,.rst-content .section ul,.rst-content .toctree-wrapper ul,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.wy-plain-list-disc li,.rst-content .section ul li,.rst-content 
.toctree-wrapper ul li,article ul li{list-style:disc;margin-left:24px}.wy-plain-list-disc li p:last-child,.rst-content .section ul li p:last-child,.rst-content .toctree-wrapper ul li p:last-child,article ul li p:last-child{margin-bottom:0}.wy-plain-list-disc li ul,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li ul,article ul li ul{margin-bottom:0}.wy-plain-list-disc li li,.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,article ul li li{list-style:circle}.wy-plain-list-disc li li li,.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,article ul li li li{list-style:square}.wy-plain-list-disc li ol li,.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,article ul li ol li{list-style:decimal}.wy-plain-list-decimal,.rst-content .section ol,.rst-content ol.arabic,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.wy-plain-list-decimal li,.rst-content .section ol li,.rst-content ol.arabic li,article ol li{list-style:decimal;margin-left:24px}.wy-plain-list-decimal li p:last-child,.rst-content .section ol li p:last-child,.rst-content ol.arabic li p:last-child,article ol li p:last-child{margin-bottom:0}.wy-plain-list-decimal li ul,.rst-content .section ol li ul,.rst-content ol.arabic li ul,article ol li ul{margin-bottom:0}.wy-plain-list-decimal li ul li,.rst-content .section ol li ul li,.rst-content ol.arabic li ul li,article ol li ul li{list-style:disc}.codeblock-example{border:1px solid #e1e4e5;border-bottom:none;padding:24px;padding-top:48px;font-weight:500;background:#fff;position:relative}.codeblock-example:after{content:"Example";position:absolute;top:0px;left:0px;background:#9b59b6;color:#fff;padding:6px 12px}.codeblock-example.prettyprint-example-only{border:1px solid #e1e4e5;margin-bottom:24px}.codeblock,pre.literal-block,.rst-content .literal-block,.rst-content pre.literal-block,div[class^='highlight']{border:1px solid 
#e1e4e5;padding:0px;overflow-x:auto;background:#fff;margin:1px 0 24px 0}.codeblock div[class^='highlight'],pre.literal-block div[class^='highlight'],.rst-content .literal-block div[class^='highlight'],div[class^='highlight'] div[class^='highlight']{border:none;background:none;margin:0}div[class^='highlight'] td.code{width:100%}.linenodiv pre{border-right:solid 1px #e6e9ea;margin:0;padding:12px 12px;font-family:Consolas,"Andale Mono WT","Andale Mono","Lucida Console","Lucida Sans Typewriter","DejaVu Sans Mono","Bitstream Vera Sans Mono","Liberation Mono","Nimbus Mono L",Monaco,"Courier New",Courier,monospace;font-size:12px;line-height:1.5;color:#d9d9d9}div[class^='highlight'] pre{white-space:pre;margin:0;padding:12px 12px;font-family:Consolas,"Andale Mono WT","Andale Mono","Lucida Console","Lucida Sans Typewriter","DejaVu Sans Mono","Bitstream Vera Sans Mono","Liberation Mono","Nimbus Mono L",Monaco,"Courier New",Courier,monospace;font-size:12px;line-height:1.5;display:block;overflow:auto;color:#404040}@media print{.codeblock,pre.literal-block,.rst-content .literal-block,.rst-content pre.literal-block,div[class^='highlight'],div[class^='highlight'] pre{white-space:pre-wrap}}.hll{background-color:#ffc;margin:0 -12px;padding:0 12px;display:block}.c{color:#998;font-style:italic}.err{color:#a61717;background-color:#e3d2d2}.k{font-weight:bold}.o{font-weight:bold}.cm{color:#998;font-style:italic}.cp{color:#999;font-weight:bold}.c1{color:#998;font-style:italic}.cs{color:#999;font-weight:bold;font-style:italic}.gd{color:#000;background-color:#fdd}.gd .x{color:#000;background-color:#faa}.ge{font-style:italic}.gr{color:#a00}.gh{color:#999}.gi{color:#000;background-color:#dfd}.gi 
.x{color:#000;background-color:#afa}.go{color:#888}.gp{color:#555}.gs{font-weight:bold}.gu{color:purple;font-weight:bold}.gt{color:#a00}.kc{font-weight:bold}.kd{font-weight:bold}.kn{font-weight:bold}.kp{font-weight:bold}.kr{font-weight:bold}.kt{color:#458;font-weight:bold}.m{color:#099}.s{color:#d14}.n{color:#333}.na{color:teal}.nb{color:#0086b3}.nc{color:#458;font-weight:bold}.no{color:teal}.ni{color:purple}.ne{color:#900;font-weight:bold}.nf{color:#900;font-weight:bold}.nn{color:#555}.nt{color:navy}.nv{color:teal}.ow{font-weight:bold}.w{color:#bbb}.mf{color:#099}.mh{color:#099}.mi{color:#099}.mo{color:#099}.sb{color:#d14}.sc{color:#d14}.sd{color:#d14}.s2{color:#d14}.se{color:#d14}.sh{color:#d14}.si{color:#d14}.sx{color:#d14}.sr{color:#009926}.s1{color:#d14}.ss{color:#990073}.bp{color:#999}.vc{color:teal}.vg{color:teal}.vi{color:teal}.il{color:#099}.gc{color:#999;background-color:#eaf2f5}.wy-breadcrumbs li{display:inline-block}.wy-breadcrumbs li.wy-breadcrumbs-aside{float:right}.wy-breadcrumbs li a{display:inline-block;padding:5px}.wy-breadcrumbs li a:first-child{padding-left:0}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width: 480px){.wy-breadcrumbs-extra{display:none}.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media print{.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:before,.wy-menu-horiz:after{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz ul,.wy-menu-horiz li{display:inline-block}.wy-menu-horiz li:hover{background:rgba(255,255,255,0.1)}.wy-menu-horiz li.divide-left{border-left:solid 1px #404040}.wy-menu-horiz li.divide-right{border-right:solid 1px #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical header{height:32px;display:inline-block;line-height:32px;padding:0 
1.618em;display:block;font-weight:bold;text-transform:uppercase;font-size:80%;color:#2980b9;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:solid 1px #404040}.wy-menu-vertical li.divide-bottom{border-bottom:solid 1px #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:gray;border-right:solid 1px #c9c9c9;padding:0.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.wy-menu-vertical li.on a,.wy-menu-vertical li.current>a{color:#404040;padding:0.4045em 1.618em;font-weight:bold;position:relative;background:#fcfcfc;border:none;border-bottom:solid 1px #c9c9c9;border-top:solid 1px #c9c9c9;padding-left:1.618em -4px}.wy-menu-vertical li.on a:hover,.wy-menu-vertical li.current>a:hover{background:#fcfcfc}.wy-menu-vertical li.toctree-l2.current>a{background:#c9c9c9;padding:0.4045em 2.427em}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical .local-toc li ul{display:block}.wy-menu-vertical li ul li a{margin-bottom:0;color:#b3b3b3;font-weight:normal}.wy-menu-vertical a{display:inline-block;line-height:18px;padding:0.4045em 1.618em;display:block;position:relative;font-size:90%;color:#b3b3b3}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:active{background-color:#2980b9;cursor:pointer;color:#fff}.wy-side-nav-search{z-index:200;background-color:#2980b9;text-align:center;padding:0.809em;display:block;color:#fcfcfc;margin-bottom:0.809em}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto 0.809em auto;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-side-nav-search>a,.wy-side-nav-search .wy-dropdown>a{color:#fcfcfc;font-size:100%;font-weight:bold;display:inline-block;padding:4px 
6px;margin-bottom:0.809em}.wy-side-nav-search>a:hover,.wy-side-nav-search .wy-dropdown>a:hover{background:rgba(255,255,255,0.1)}.wy-nav .wy-menu-vertical header{color:#2980b9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980b9;color:#fff}[data-menu-wrap]{-webkit-transition:all 0.2s ease-in;-moz-transition:all 0.2s ease-in;transition:all 0.2s ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:left repeat-y #fcfcfc;background-image:url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAAyRpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuMy1jMDExIDY2LjE0NTY2MSwgMjAxMi8wMi8wNi0xNDo1NjoyNyAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIiB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIgeG1sbnM6c3RSZWY9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZVJlZiMiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENTNiAoTWFjaW50b3NoKSIgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDoxOERBMTRGRDBFMUUxMUUzODUwMkJCOThDMEVFNURFMCIgeG1wTU06RG9jdW1lbnRJRD0ieG1wLmRpZDoxOERBMTRGRTBFMUUxMUUzODUwMkJCOThDMEVFNURFMCI+IDx4bXBNTTpEZXJpdmVkRnJvbSBzdFJlZjppbnN0YW5jZUlEPSJ4bXAuaWlkOjE4REExNEZCMEUxRTExRTM4NTAyQkI5OEMwRUU1REUwIiBzdFJlZjpkb2N1bWVudElEPSJ4bXAuZGlkOjE4REExNEZDMEUxRTExRTM4NTAyQkI5OEMwRUU1REUwIi8+IDwvcmRmOkRlc2NyaXB0aW9uPiA8L3JkZjpSREY+IDwveDp4bXBtZXRhPiA8P3hwYWNrZXQgZW5kPSJyIj8+EwrlwAAAAA5JREFUeNpiMDU0BAgwAAE2AJgB9BnaAAAAAElFTkSuQmCC);background-size:300px 
1px}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:absolute;top:0;left:0;width:300px;overflow:hidden;min-height:100%;background:#343131;z-index:200}.wy-nav-top{display:none;background:#2980b9;color:#fff;padding:0.4045em 0.809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:before,.wy-nav-top:after{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:bold}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,0.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:#999}footer p{margin-bottom:12px}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:before,.rst-footer-buttons:after{display:table;content:""}.rst-footer-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:solid 1px #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:solid 1px #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:gray;font-size:90%}@media screen and (max-width: 768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width: 1400px){.wy-nav-content-wrap{background:rgba(0,0,0,0.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media 
print{.rst-versions,footer,.wy-nav-side{display:none}.wy-nav-content-wrap{margin-left:0}}nav.stickynav{position:fixed;top:0}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;border-top:solid 10px #343131;font-family:"Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:before,.rst-versions .rst-current-version:after{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h4 .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-versions .rst-current-version .icon{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version 
.icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:gray;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:solid 1px #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px}.rst-versions.rst-badge .icon-book{float:none}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge .rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width: 768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}img{width:100%;height:auto}}.rst-content img{max-width:100%;height:auto !important}.rst-content div.figure{margin-bottom:24px}.rst-content div.figure.align-center{text-align:center}.rst-content .section>img{margin-bottom:24px}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content .note .last,.rst-content .attention .last,.rst-content .caution .last,.rst-content .danger .last,.rst-content .error .last,.rst-content .hint .last,.rst-content .important .last,.rst-content .tip .last,.rst-content .warning .last,.rst-content .seealso .last,.rst-content .admonition-todo 
.last{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,0.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent !important;border-color:rgba(0,0,0,0.1) !important}.rst-content .section ol.loweralpha,.rst-content .section ol.loweralpha li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha li{list-style:upper-alpha}.rst-content .section ol p,.rst-content .section ul p{margin-bottom:12px}.rst-content .line-block{margin-left:24px}.rst-content .topic-title{font-weight:bold;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0px 0px 24px 24px}.rst-content .align-left{float:left;margin:0px 24px 24px 0px}.rst-content .align-center{margin:auto;display:block}.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content dl dt .headerlink{display:none;visibility:hidden;font-size:14px}.rst-content h1 .headerlink:after,.rst-content h2 .headerlink:after,.rst-content h3 .headerlink:after,.rst-content h4 .headerlink:after,.rst-content h5 .headerlink:after,.rst-content h6 .headerlink:after,.rst-content dl dt .headerlink:after{visibility:visible;content:"\f0c1";font-family:FontAwesome;display:inline-block}.rst-content h1:hover .headerlink,.rst-content h2:hover .headerlink,.rst-content h3:hover .headerlink,.rst-content h4:hover .headerlink,.rst-content h5:hover .headerlink,.rst-content h6:hover .headerlink,.rst-content dl dt:hover .headerlink{display:inline-block}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:solid 1px #e1e4e5}.rst-content .sidebar p,.rst-content .sidebar ul,.rst-content .sidebar dl{font-size:90%}.rst-content .sidebar .last{margin-bottom:0}.rst-content .sidebar 
.sidebar-title{display:block;font-family:"Roboto Slab","ff-tisa-web-pro","Georgia",Arial,sans-serif;font-weight:bold;background:#e1e4e5;padding:6px 12px;margin:-24px;margin-bottom:24px;font-size:100%}.rst-content .highlighted{background:#f1c40f;display:inline-block;font-weight:bold;padding:0 6px}.rst-content .footnote-reference,.rst-content .citation-reference{vertical-align:super;font-size:90%}.rst-content table.docutils.citation,.rst-content table.docutils.footnote{background:none;border:none;color:#999}.rst-content table.docutils.citation td,.rst-content table.docutils.citation tr,.rst-content table.docutils.footnote td,.rst-content table.docutils.footnote tr{border:none;background-color:transparent !important;white-space:normal}.rst-content table.docutils.citation td.label,.rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}.rst-content table.field-list{border:none}.rst-content table.field-list td{border:none;padding-top:5px}.rst-content table.field-list td>strong{display:inline-block;margin-top:3px}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left;padding-left:0}.rst-content tt{color:#000}.rst-content tt big,.rst-content tt em{font-size:100% !important;line-height:normal}.rst-content tt .xref,a .rst-content tt{font-weight:bold}.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:bold}.rst-content dl p,.rst-content dl table,.rst-content dl ul,.rst-content dl ol{margin-bottom:12px !important}.rst-content dl dd{margin:0 0 12px 24px}.rst-content dl:not(.docutils){margin-bottom:24px}.rst-content dl:not(.docutils) dt{display:inline-block;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:solid 3px #6ab0de;padding:6px;position:relative}.rst-content dl:not(.docutils) dt:before{color:#6ab0de}.rst-content dl:not(.docutils) dt 
.headerlink{color:#404040;font-size:100% !important}.rst-content dl:not(.docutils) dl dt{margin-bottom:6px;border:none;border-left:solid 3px #ccc;background:#f0f0f0;color:gray}.rst-content dl:not(.docutils) dl dt .headerlink{color:#404040;font-size:100% !important}.rst-content dl:not(.docutils) dt:first-child{margin-top:0}.rst-content dl:not(.docutils) tt{font-weight:bold}.rst-content dl:not(.docutils) tt.descname,.rst-content dl:not(.docutils) tt.descclassname{background-color:transparent;border:none;padding:0;font-size:100% !important}.rst-content dl:not(.docutils) tt.descname{font-weight:bold}.rst-content dl:not(.docutils) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:bold}.rst-content dl:not(.docutils) .property{display:inline-block;padding-right:8px}.rst-content .viewcode-link,.rst-content .viewcode-back{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:bold}@media screen and (max-width: 480px){.rst-content .sidebar{width:100%}}span[id*='MathJax-Span']{color:#404040}.math{text-align:center} diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/FontAwesome.otf b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/FontAwesome.otf Binary files differnew file mode 100644 index 00000000..8b0f54e4 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/FontAwesome.otf diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.eot b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.eot Binary files differnew file mode 100644 index 00000000..7c79c6a6 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.eot diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.svg 
b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.svg new file mode 100644 index 00000000..45fdf338 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.svg @@ -0,0 +1,414 @@ +<?xml version="1.0" standalone="no"?> +<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" > +<svg xmlns="http://www.w3.org/2000/svg"> +<metadata></metadata> +<defs> +<font id="fontawesomeregular" horiz-adv-x="1536" > +<font-face units-per-em="1792" ascent="1536" descent="-256" /> +<missing-glyph horiz-adv-x="448" /> +<glyph unicode=" " horiz-adv-x="448" /> +<glyph unicode="	" horiz-adv-x="448" /> +<glyph unicode=" " horiz-adv-x="448" /> +<glyph unicode="¨" horiz-adv-x="1792" /> +<glyph unicode="©" horiz-adv-x="1792" /> +<glyph unicode="®" horiz-adv-x="1792" /> +<glyph unicode="´" horiz-adv-x="1792" /> +<glyph unicode="Æ" horiz-adv-x="1792" /> +<glyph unicode=" " horiz-adv-x="768" /> +<glyph unicode=" " /> +<glyph unicode=" " horiz-adv-x="768" /> +<glyph unicode=" " /> +<glyph unicode=" " horiz-adv-x="512" /> +<glyph unicode=" " horiz-adv-x="384" /> +<glyph unicode=" " horiz-adv-x="256" /> +<glyph unicode=" " horiz-adv-x="256" /> +<glyph unicode=" " horiz-adv-x="192" /> +<glyph unicode=" " horiz-adv-x="307" /> +<glyph unicode=" " horiz-adv-x="85" /> +<glyph unicode=" " horiz-adv-x="307" /> +<glyph unicode=" " horiz-adv-x="384" /> +<glyph unicode="™" horiz-adv-x="1792" /> +<glyph unicode="∞" horiz-adv-x="1792" /> +<glyph unicode="≠" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="500" d="M0 0z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1699 1350q0 -35 -43 -78l-632 -632v-768h320q26 0 45 -19t19 -45t-19 -45t-45 -19h-896q-26 0 -45 19t-19 45t19 45t45 19h320v768l-632 632q-43 43 -43 78q0 23 18 36.5t38 17.5t43 4h1408q23 0 43 -4t38 -17.5t18 -36.5z" /> +<glyph unicode="" d="M1536 1312v-1120q0 -50 -34 -89t-86 -60.5t-103.5 -32t-96.5 -10.5t-96.5 
10.5t-103.5 32t-86 60.5t-34 89t34 89t86 60.5t103.5 32t96.5 10.5q105 0 192 -39v537l-768 -237v-709q0 -50 -34 -89t-86 -60.5t-103.5 -32t-96.5 -10.5t-96.5 10.5t-103.5 32t-86 60.5t-34 89 t34 89t86 60.5t103.5 32t96.5 10.5q105 0 192 -39v967q0 31 19 56.5t49 35.5l832 256q12 4 28 4q40 0 68 -28t28 -68z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1152 704q0 185 -131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5t316.5 131.5t131.5 316.5zM1664 -128q0 -52 -38 -90t-90 -38q-54 0 -90 38l-343 342q-179 -124 -399 -124q-143 0 -273.5 55.5t-225 150t-150 225t-55.5 273.5 t55.5 273.5t150 225t225 150t273.5 55.5t273.5 -55.5t225 -150t150 -225t55.5 -273.5q0 -220 -124 -399l343 -343q37 -37 37 -90z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1664 32v768q-32 -36 -69 -66q-268 -206 -426 -338q-51 -43 -83 -67t-86.5 -48.5t-102.5 -24.5h-1h-1q-48 0 -102.5 24.5t-86.5 48.5t-83 67q-158 132 -426 338q-37 30 -69 66v-768q0 -13 9.5 -22.5t22.5 -9.5h1472q13 0 22.5 9.5t9.5 22.5zM1664 1083v11v13.5t-0.5 13 t-3 12.5t-5.5 9t-9 7.5t-14 2.5h-1472q-13 0 -22.5 -9.5t-9.5 -22.5q0 -168 147 -284q193 -152 401 -317q6 -5 35 -29.5t46 -37.5t44.5 -31.5t50.5 -27.5t43 -9h1h1q20 0 43 9t50.5 27.5t44.5 31.5t46 37.5t35 29.5q208 165 401 317q54 43 100.5 115.5t46.5 131.5z M1792 1120v-1088q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h1472q66 0 113 -47t47 -113z" /> +<glyph unicode="" horiz-adv-x="1792" d="M896 -128q-26 0 -44 18l-624 602q-10 8 -27.5 26t-55.5 65.5t-68 97.5t-53.5 121t-23.5 138q0 220 127 344t351 124q62 0 126.5 -21.5t120 -58t95.5 -68.5t76 -68q36 36 76 68t95.5 68.5t120 58t126.5 21.5q224 0 351 -124t127 -344q0 -221 -229 -450l-623 -600 q-18 -18 -44 -18z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1664 889q0 -22 -26 -48l-363 -354l86 -500q1 -7 1 -20q0 -21 -10.5 -35.5t-30.5 -14.5q-19 0 -40 12l-449 236l-449 -236q-22 -12 -40 -12q-21 0 -31.5 14.5t-10.5 35.5q0 6 2 20l86 500l-364 354q-25 27 -25 48q0 37 56 46l502 73l225 455q19 41 49 41t49 -41l225 -455 l502 -73q56 -9 56 -46z" /> 
+<glyph unicode="" horiz-adv-x="1664" d="M1137 532l306 297l-422 62l-189 382l-189 -382l-422 -62l306 -297l-73 -421l378 199l377 -199zM1664 889q0 -22 -26 -48l-363 -354l86 -500q1 -7 1 -20q0 -50 -41 -50q-19 0 -40 12l-449 236l-449 -236q-22 -12 -40 -12q-21 0 -31.5 14.5t-10.5 35.5q0 6 2 20l86 500 l-364 354q-25 27 -25 48q0 37 56 46l502 73l225 455q19 41 49 41t49 -41l225 -455l502 -73q56 -9 56 -46z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1408 131q0 -120 -73 -189.5t-194 -69.5h-874q-121 0 -194 69.5t-73 189.5q0 53 3.5 103.5t14 109t26.5 108.5t43 97.5t62 81t85.5 53.5t111.5 20q9 0 42 -21.5t74.5 -48t108 -48t133.5 -21.5t133.5 21.5t108 48t74.5 48t42 21.5q61 0 111.5 -20t85.5 -53.5t62 -81 t43 -97.5t26.5 -108.5t14 -109t3.5 -103.5zM1088 1024q0 -159 -112.5 -271.5t-271.5 -112.5t-271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5t271.5 -112.5t112.5 -271.5z" /> +<glyph unicode="" horiz-adv-x="1920" d="M384 -64v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM384 320v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM384 704v128q0 26 -19 45t-45 19h-128 q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1408 -64v512q0 26 -19 45t-45 19h-768q-26 0 -45 -19t-19 -45v-512q0 -26 19 -45t45 -19h768q26 0 45 19t19 45zM384 1088v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45 t45 -19h128q26 0 45 19t19 45zM1792 -64v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1408 704v512q0 26 -19 45t-45 19h-768q-26 0 -45 -19t-19 -45v-512q0 -26 19 -45t45 -19h768q26 0 45 19t19 45zM1792 320v128 q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1792 704v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1792 1088v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19 t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1920 1248v-1344q0 -66 -47 -113t-113 
-47h-1600q-66 0 -113 47t-47 113v1344q0 66 47 113t113 47h1600q66 0 113 -47t47 -113z" /> +<glyph unicode="" horiz-adv-x="1664" d="M768 512v-384q0 -52 -38 -90t-90 -38h-512q-52 0 -90 38t-38 90v384q0 52 38 90t90 38h512q52 0 90 -38t38 -90zM768 1280v-384q0 -52 -38 -90t-90 -38h-512q-52 0 -90 38t-38 90v384q0 52 38 90t90 38h512q52 0 90 -38t38 -90zM1664 512v-384q0 -52 -38 -90t-90 -38 h-512q-52 0 -90 38t-38 90v384q0 52 38 90t90 38h512q52 0 90 -38t38 -90zM1664 1280v-384q0 -52 -38 -90t-90 -38h-512q-52 0 -90 38t-38 90v384q0 52 38 90t90 38h512q52 0 90 -38t38 -90z" /> +<glyph unicode="" horiz-adv-x="1792" d="M512 288v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM512 800v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1152 288v-192q0 -40 -28 -68t-68 -28h-320 q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM512 1312v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1152 800v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28 h320q40 0 68 -28t28 -68zM1792 288v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1152 1312v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1792 800v-192 q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1792 1312v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68z" /> +<glyph unicode="" horiz-adv-x="1792" d="M512 288v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM512 800v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1792 288v-192q0 -40 -28 -68t-68 -28h-960 q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h960q40 0 68 -28t28 -68zM512 1312v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 
68t68 28h320q40 0 68 -28t28 -68zM1792 800v-192q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v192q0 40 28 68t68 28 h960q40 0 68 -28t28 -68zM1792 1312v-192q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h960q40 0 68 -28t28 -68z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1671 970q0 -40 -28 -68l-724 -724l-136 -136q-28 -28 -68 -28t-68 28l-136 136l-362 362q-28 28 -28 68t28 68l136 136q28 28 68 28t68 -28l294 -295l656 657q28 28 68 28t68 -28l136 -136q28 -28 28 -68z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1298 214q0 -40 -28 -68l-136 -136q-28 -28 -68 -28t-68 28l-294 294l-294 -294q-28 -28 -68 -28t-68 28l-136 136q-28 28 -28 68t28 68l294 294l-294 294q-28 28 -28 68t28 68l136 136q28 28 68 28t68 -28l294 -294l294 294q28 28 68 28t68 -28l136 -136q28 -28 28 -68 t-28 -68l-294 -294l294 -294q28 -28 28 -68z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1024 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-224v-224q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v224h-224q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h224v224q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5v-224h224 q13 0 22.5 -9.5t9.5 -22.5zM1152 704q0 185 -131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5t316.5 131.5t131.5 316.5zM1664 -128q0 -53 -37.5 -90.5t-90.5 -37.5q-54 0 -90 38l-343 342q-179 -124 -399 -124q-143 0 -273.5 55.5 t-225 150t-150 225t-55.5 273.5t55.5 273.5t150 225t225 150t273.5 55.5t273.5 -55.5t225 -150t150 -225t55.5 -273.5q0 -220 -124 -399l343 -343q37 -37 37 -90z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1024 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-576q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h576q13 0 22.5 -9.5t9.5 -22.5zM1152 704q0 185 -131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5t316.5 131.5t131.5 316.5z M1664 -128q0 -53 -37.5 -90.5t-90.5 -37.5q-54 0 -90 38l-343 342q-179 -124 -399 -124q-143 0 -273.5 55.5t-225 150t-150 225t-55.5 273.5t55.5 273.5t150 225t225 150t273.5 55.5t273.5 -55.5t225 -150t150 
-225t55.5 -273.5q0 -220 -124 -399l343 -343q37 -37 37 -90z " /> +<glyph unicode="" d="M1536 640q0 -156 -61 -298t-164 -245t-245 -164t-298 -61t-298 61t-245 164t-164 245t-61 298q0 182 80.5 343t226.5 270q43 32 95.5 25t83.5 -50q32 -42 24.5 -94.5t-49.5 -84.5q-98 -74 -151.5 -181t-53.5 -228q0 -104 40.5 -198.5t109.5 -163.5t163.5 -109.5 t198.5 -40.5t198.5 40.5t163.5 109.5t109.5 163.5t40.5 198.5q0 121 -53.5 228t-151.5 181q-42 32 -49.5 84.5t24.5 94.5q31 43 84 50t95 -25q146 -109 226.5 -270t80.5 -343zM896 1408v-640q0 -52 -38 -90t-90 -38t-90 38t-38 90v640q0 52 38 90t90 38t90 -38t38 -90z" /> +<glyph unicode="" horiz-adv-x="1792" d="M256 96v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM640 224v-320q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v320q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1024 480v-576q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23 v576q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1408 864v-960q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v960q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1792 1376v-1472q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v1472q0 14 9 23t23 9h192q14 0 23 -9t9 -23z" /> +<glyph unicode="" d="M1024 640q0 106 -75 181t-181 75t-181 -75t-75 -181t75 -181t181 -75t181 75t75 181zM1536 749v-222q0 -12 -8 -23t-20 -13l-185 -28q-19 -54 -39 -91q35 -50 107 -138q10 -12 10 -25t-9 -23q-27 -37 -99 -108t-94 -71q-12 0 -26 9l-138 108q-44 -23 -91 -38 q-16 -136 -29 -186q-7 -28 -36 -28h-222q-14 0 -24.5 8.5t-11.5 21.5l-28 184q-49 16 -90 37l-141 -107q-10 -9 -25 -9q-14 0 -25 11q-126 114 -165 168q-7 10 -7 23q0 12 8 23q15 21 51 66.5t54 70.5q-27 50 -41 99l-183 27q-13 2 -21 12.5t-8 23.5v222q0 12 8 23t19 13 l186 28q14 46 39 92q-40 57 -107 138q-10 12 -10 24q0 10 9 23q26 36 98.5 107.5t94.5 71.5q13 0 26 -10l138 -107q44 23 91 38q16 136 29 186q7 28 36 28h222q14 0 24.5 -8.5t11.5 -21.5l28 -184q49 -16 90 -37l142 107q9 9 24 9q13 0 25 -10q129 -119 165 -170q7 -8 7 -22 q0 -12 -8 -23q-15 -21 -51 -66.5t-54 -70.5q26 -50 41 -98l183 -28q13 -2 21 -12.5t8 -23.5z" /> +<glyph 
unicode="" horiz-adv-x="1408" d="M512 800v-576q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v576q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM768 800v-576q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v576q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM1024 800v-576q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v576 q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM1152 76v948h-896v-948q0 -22 7 -40.5t14.5 -27t10.5 -8.5h832q3 0 10.5 8.5t14.5 27t7 40.5zM480 1152h448l-48 117q-7 9 -17 11h-317q-10 -2 -17 -11zM1408 1120v-64q0 -14 -9 -23t-23 -9h-96v-948q0 -83 -47 -143.5t-113 -60.5h-832 q-66 0 -113 58.5t-47 141.5v952h-96q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h309l70 167q15 37 54 63t79 26h320q40 0 79 -26t54 -63l70 -167h309q14 0 23 -9t9 -23z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1408 544v-480q0 -26 -19 -45t-45 -19h-384v384h-256v-384h-384q-26 0 -45 19t-19 45v480q0 1 0.5 3t0.5 3l575 474l575 -474q1 -2 1 -6zM1631 613l-62 -74q-8 -9 -21 -11h-3q-13 0 -21 7l-692 577l-692 -577q-12 -8 -24 -7q-13 2 -21 11l-62 74q-8 10 -7 23.5t11 21.5 l719 599q32 26 76 26t76 -26l244 -204v195q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-408l219 -182q10 -8 11 -21.5t-7 -23.5z" /> +<glyph unicode="" horiz-adv-x="1280" d="M128 0h1024v768h-416q-40 0 -68 28t-28 68v416h-512v-1280zM768 896h376q-10 29 -22 41l-313 313q-12 12 -41 22v-376zM1280 864v-896q0 -40 -28 -68t-68 -28h-1088q-40 0 -68 28t-28 68v1344q0 40 28 68t68 28h640q40 0 88 -20t76 -48l312 -312q28 -28 48 -76t20 -88z " /> +<glyph unicode="" d="M896 992v-448q0 -14 -9 -23t-23 -9h-320q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h224v352q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640 q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1111 540v4l-24 320q-1 13 -11 22.5t-23 9.5h-186q-13 0 -23 -9.5t-11 -22.5l-24 -320v-4q-1 -12 8 -20t21 -8h244q12 0 21 8t8 20zM1870 
73q0 -73 -46 -73h-704q13 0 22 9.5t8 22.5l-20 256q-1 13 -11 22.5t-23 9.5h-272q-13 0 -23 -9.5t-11 -22.5l-20 -256 q-1 -13 8 -22.5t22 -9.5h-704q-46 0 -46 73q0 54 26 116l417 1044q8 19 26 33t38 14h339q-13 0 -23 -9.5t-11 -22.5l-15 -192q-1 -14 8 -23t22 -9h166q13 0 22 9t8 23l-15 192q-1 13 -11 22.5t-23 9.5h339q20 0 38 -14t26 -33l417 -1044q26 -62 26 -116z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1280 192q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1536 192q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1664 416v-320q0 -40 -28 -68t-68 -28h-1472q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h465l135 -136 q58 -56 136 -56t136 56l136 136h464q40 0 68 -28t28 -68zM1339 985q17 -41 -14 -70l-448 -448q-18 -19 -45 -19t-45 19l-448 448q-31 29 -14 70q17 39 59 39h256v448q0 26 19 45t45 19h256q26 0 45 -19t19 -45v-448h256q42 0 59 -39z" /> +<glyph unicode="" d="M1120 608q0 -12 -10 -24l-319 -319q-11 -9 -23 -9t-23 9l-320 320q-15 16 -7 35q8 20 30 20h192v352q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-352h192q14 0 23 -9t9 -23zM768 1184q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273 t-73 273t-198 198t-273 73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1118 660q-8 -20 -30 -20h-192v-352q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v352h-192q-14 0 -23 9t-9 23q0 12 10 24l319 319q11 9 23 9t23 -9l320 -320q15 -16 7 -35zM768 1184q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198 t73 273t-73 273t-198 198t-273 73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1023 576h316q-1 3 -2.5 8t-2.5 8l-212 496h-708l-212 -496q-1 -2 -2.5 -8t-2.5 -8h316l95 -192h320zM1536 546v-482q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v482q0 62 25 123l238 552q10 25 36.5 
42t52.5 17h832q26 0 52.5 -17t36.5 -42l238 -552 q25 -61 25 -123z" /> +<glyph unicode="" d="M1184 640q0 -37 -32 -55l-544 -320q-15 -9 -32 -9q-16 0 -32 8q-32 19 -32 56v640q0 37 32 56q33 18 64 -1l544 -320q32 -18 32 -55zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640 q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1536 1280v-448q0 -26 -19 -45t-45 -19h-448q-42 0 -59 40q-17 39 14 69l138 138q-148 137 -349 137q-104 0 -198.5 -40.5t-163.5 -109.5t-109.5 -163.5t-40.5 -198.5t40.5 -198.5t109.5 -163.5t163.5 -109.5t198.5 -40.5q119 0 225 52t179 147q7 10 23 12q14 0 25 -9 l137 -138q9 -8 9.5 -20.5t-7.5 -22.5q-109 -132 -264 -204.5t-327 -72.5q-156 0 -298 61t-245 164t-164 245t-61 298t61 298t164 245t245 164t298 61q147 0 284.5 -55.5t244.5 -156.5l130 129q29 31 70 14q39 -17 39 -59z" /> +<glyph unicode="" d="M1511 480q0 -5 -1 -7q-64 -268 -268 -434.5t-478 -166.5q-146 0 -282.5 55t-243.5 157l-129 -129q-19 -19 -45 -19t-45 19t-19 45v448q0 26 19 45t45 19h448q26 0 45 -19t19 -45t-19 -45l-137 -137q71 -66 161 -102t187 -36q134 0 250 65t186 179q11 17 53 117 q8 23 30 23h192q13 0 22.5 -9.5t9.5 -22.5zM1536 1280v-448q0 -26 -19 -45t-45 -19h-448q-26 0 -45 19t-19 45t19 45l138 138q-148 137 -349 137q-134 0 -250 -65t-186 -179q-11 -17 -53 -117q-8 -23 -30 -23h-199q-13 0 -22.5 9.5t-9.5 22.5v7q65 268 270 434.5t480 166.5 q146 0 284 -55.5t245 -156.5l130 129q19 19 45 19t45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M384 352v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 608v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M384 864v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1536 352v-64q0 -13 
-9.5 -22.5t-22.5 -9.5h-960q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h960q13 0 22.5 -9.5t9.5 -22.5z M1536 608v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-960q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h960q13 0 22.5 -9.5t9.5 -22.5zM1536 864v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-960q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h960q13 0 22.5 -9.5 t9.5 -22.5zM1664 160v832q0 13 -9.5 22.5t-22.5 9.5h-1472q-13 0 -22.5 -9.5t-9.5 -22.5v-832q0 -13 9.5 -22.5t22.5 -9.5h1472q13 0 22.5 9.5t9.5 22.5zM1792 1248v-1088q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h1472q66 0 113 -47 t47 -113z" /> +<glyph unicode="" horiz-adv-x="1152" d="M320 768h512v192q0 106 -75 181t-181 75t-181 -75t-75 -181v-192zM1152 672v-576q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v576q0 40 28 68t68 28h32v192q0 184 132 316t316 132t316 -132t132 -316v-192h32q40 0 68 -28t28 -68z" /> +<glyph unicode="" horiz-adv-x="1792" d="M320 1280q0 -72 -64 -110v-1266q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v1266q-64 38 -64 110q0 53 37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1792 1216v-763q0 -25 -12.5 -38.5t-39.5 -27.5q-215 -116 -369 -116q-61 0 -123.5 22t-108.5 48 t-115.5 48t-142.5 22q-192 0 -464 -146q-17 -9 -33 -9q-26 0 -45 19t-19 45v742q0 32 31 55q21 14 79 43q236 120 421 120q107 0 200 -29t219 -88q38 -19 88 -19q54 0 117.5 21t110 47t88 47t54.5 21q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1664 650q0 -166 -60 -314l-20 -49l-185 -33q-22 -83 -90.5 -136.5t-156.5 -53.5v-32q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v576q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-32q71 0 130 -35.5t93 -95.5l68 12q29 95 29 193q0 148 -88 279t-236.5 209t-315.5 78 t-315.5 -78t-236.5 -209t-88 -279q0 -98 29 -193l68 -12q34 60 93 95.5t130 35.5v32q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-576q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v32q-88 0 -156.5 53.5t-90.5 136.5l-185 33l-20 49q-60 148 -60 314q0 151 67 291t179 242.5 t266 163.5t320 61t320 -61t266 -163.5t179 -242.5t67 -291z" /> +<glyph 
unicode="" horiz-adv-x="768" d="M768 1184v-1088q0 -26 -19 -45t-45 -19t-45 19l-333 333h-262q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h262l333 333q19 19 45 19t45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1152" d="M768 1184v-1088q0 -26 -19 -45t-45 -19t-45 19l-333 333h-262q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h262l333 333q19 19 45 19t45 -19t19 -45zM1152 640q0 -76 -42.5 -141.5t-112.5 -93.5q-10 -5 -25 -5q-26 0 -45 18.5t-19 45.5q0 21 12 35.5t29 25t34 23t29 35.5 t12 57t-12 57t-29 35.5t-34 23t-29 25t-12 35.5q0 27 19 45.5t45 18.5q15 0 25 -5q70 -27 112.5 -93t42.5 -142z" /> +<glyph unicode="" horiz-adv-x="1664" d="M768 1184v-1088q0 -26 -19 -45t-45 -19t-45 19l-333 333h-262q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h262l333 333q19 19 45 19t45 -19t19 -45zM1152 640q0 -76 -42.5 -141.5t-112.5 -93.5q-10 -5 -25 -5q-26 0 -45 18.5t-19 45.5q0 21 12 35.5t29 25t34 23t29 35.5 t12 57t-12 57t-29 35.5t-34 23t-29 25t-12 35.5q0 27 19 45.5t45 18.5q15 0 25 -5q70 -27 112.5 -93t42.5 -142zM1408 640q0 -153 -85 -282.5t-225 -188.5q-13 -5 -25 -5q-27 0 -46 19t-19 45q0 39 39 59q56 29 76 44q74 54 115.5 135.5t41.5 173.5t-41.5 173.5 t-115.5 135.5q-20 15 -76 44q-39 20 -39 59q0 26 19 45t45 19q13 0 26 -5q140 -59 225 -188.5t85 -282.5zM1664 640q0 -230 -127 -422.5t-338 -283.5q-13 -5 -26 -5q-26 0 -45 19t-19 45q0 36 39 59q7 4 22.5 10.5t22.5 10.5q46 25 82 51q123 91 192 227t69 289t-69 289 t-192 227q-36 26 -82 51q-7 4 -22.5 10.5t-22.5 10.5q-39 23 -39 59q0 26 19 45t45 19q13 0 26 -5q211 -91 338 -283.5t127 -422.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M384 384v-128h-128v128h128zM384 1152v-128h-128v128h128zM1152 1152v-128h-128v128h128zM128 129h384v383h-384v-383zM128 896h384v384h-384v-384zM896 896h384v384h-384v-384zM640 640v-640h-640v640h640zM1152 128v-128h-128v128h128zM1408 128v-128h-128v128h128z M1408 640v-384h-384v128h-128v-384h-128v640h384v-128h128v128h128zM640 1408v-640h-640v640h640zM1408 1408v-640h-640v640h640z" /> +<glyph unicode="" horiz-adv-x="1792" d="M63 0h-63v1408h63v-1408zM126 
1h-32v1407h32v-1407zM220 1h-31v1407h31v-1407zM377 1h-31v1407h31v-1407zM534 1h-62v1407h62v-1407zM660 1h-31v1407h31v-1407zM723 1h-31v1407h31v-1407zM786 1h-31v1407h31v-1407zM943 1h-63v1407h63v-1407zM1100 1h-63v1407h63v-1407z M1226 1h-63v1407h63v-1407zM1352 1h-63v1407h63v-1407zM1446 1h-63v1407h63v-1407zM1635 1h-94v1407h94v-1407zM1698 1h-32v1407h32v-1407zM1792 0h-63v1408h63v-1408z" /> +<glyph unicode="" d="M448 1088q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1515 512q0 -53 -37 -90l-491 -492q-39 -37 -91 -37q-53 0 -90 37l-715 716q-38 37 -64.5 101t-26.5 117v416q0 52 38 90t90 38h416q53 0 117 -26.5t102 -64.5 l715 -714q37 -39 37 -91z" /> +<glyph unicode="" horiz-adv-x="1920" d="M448 1088q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1515 512q0 -53 -37 -90l-491 -492q-39 -37 -91 -37q-53 0 -90 37l-715 716q-38 37 -64.5 101t-26.5 117v416q0 52 38 90t90 38h416q53 0 117 -26.5t102 -64.5 l715 -714q37 -39 37 -91zM1899 512q0 -53 -37 -90l-491 -492q-39 -37 -91 -37q-36 0 -59 14t-53 45l470 470q37 37 37 90q0 52 -37 91l-715 714q-38 38 -102 64.5t-117 26.5h224q53 0 117 -26.5t102 -64.5l715 -714q37 -39 37 -91z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1639 1058q40 -57 18 -129l-275 -906q-19 -64 -76.5 -107.5t-122.5 -43.5h-923q-77 0 -148.5 53.5t-99.5 131.5q-24 67 -2 127q0 4 3 27t4 37q1 8 -3 21.5t-3 19.5q2 11 8 21t16.5 23.5t16.5 23.5q23 38 45 91.5t30 91.5q3 10 0.5 30t-0.5 28q3 11 17 28t17 23 q21 36 42 92t25 90q1 9 -2.5 32t0.5 28q4 13 22 30.5t22 22.5q19 26 42.5 84.5t27.5 96.5q1 8 -3 25.5t-2 26.5q2 8 9 18t18 23t17 21q8 12 16.5 30.5t15 35t16 36t19.5 32t26.5 23.5t36 11.5t47.5 -5.5l-1 -3q38 9 51 9h761q74 0 114 -56t18 -130l-274 -906 q-36 -119 -71.5 -153.5t-128.5 -34.5h-869q-27 0 -38 -15q-11 -16 -1 -43q24 -70 144 -70h923q29 0 56 15.5t35 41.5l300 987q7 22 5 57q38 -15 59 -43zM575 1056q-4 -13 2 -22.5t20 -9.5h608q13 0 25.5 9.5t16.5 22.5l21 64q4 13 -2 22.5t-20 9.5h-608q-13 0 -25.5 -9.5 t-16.5 -22.5zM492 
800q-4 -13 2 -22.5t20 -9.5h608q13 0 25.5 9.5t16.5 22.5l21 64q4 13 -2 22.5t-20 9.5h-608q-13 0 -25.5 -9.5t-16.5 -22.5z" /> +<glyph unicode="" horiz-adv-x="1280" d="M1164 1408q23 0 44 -9q33 -13 52.5 -41t19.5 -62v-1289q0 -34 -19.5 -62t-52.5 -41q-19 -8 -44 -8q-48 0 -83 32l-441 424l-441 -424q-36 -33 -83 -33q-23 0 -44 9q-33 13 -52.5 41t-19.5 62v1289q0 34 19.5 62t52.5 41q21 9 44 9h1048z" /> +<glyph unicode="" horiz-adv-x="1664" d="M384 0h896v256h-896v-256zM384 640h896v384h-160q-40 0 -68 28t-28 68v160h-640v-640zM1536 576q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1664 576v-416q0 -13 -9.5 -22.5t-22.5 -9.5h-224v-160q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68 v160h-224q-13 0 -22.5 9.5t-9.5 22.5v416q0 79 56.5 135.5t135.5 56.5h64v544q0 40 28 68t68 28h672q40 0 88 -20t76 -48l152 -152q28 -28 48 -76t20 -88v-256h64q79 0 135.5 -56.5t56.5 -135.5z" /> +<glyph unicode="" horiz-adv-x="1920" d="M960 864q119 0 203.5 -84.5t84.5 -203.5t-84.5 -203.5t-203.5 -84.5t-203.5 84.5t-84.5 203.5t84.5 203.5t203.5 84.5zM1664 1280q106 0 181 -75t75 -181v-896q0 -106 -75 -181t-181 -75h-1408q-106 0 -181 75t-75 181v896q0 106 75 181t181 75h224l51 136 q19 49 69.5 84.5t103.5 35.5h512q53 0 103.5 -35.5t69.5 -84.5l51 -136h224zM960 128q185 0 316.5 131.5t131.5 316.5t-131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M725 977l-170 -450q73 -1 153.5 -2t119 -1.5t52.5 -0.5l29 2q-32 95 -92 241q-53 132 -92 211zM21 -128h-21l2 79q22 7 80 18q89 16 110 31q20 16 48 68l237 616l280 724h75h53l11 -21l205 -480q103 -242 124 -297q39 -102 96 -235q26 -58 65 -164q24 -67 65 -149 q22 -49 35 -57q22 -19 69 -23q47 -6 103 -27q6 -39 6 -57q0 -14 -1 -26q-80 0 -192 8q-93 8 -189 8q-79 0 -135 -2l-200 -11l-58 -2q0 45 4 78l131 28q56 13 68 23q12 12 12 27t-6 32l-47 114l-92 228l-450 2q-29 -65 -104 -274q-23 -64 -23 -84q0 -31 17 -43 q26 -21 103 -32q3 0 13.5 -2t30 -5t40.5 -6q1 -28 1 -58q0 -17 -2 -27q-66 0 -349 20l-48 -8q-81 -14 -167 -14z" /> +<glyph 
unicode="" horiz-adv-x="1408" d="M555 15q76 -32 140 -32q131 0 216 41t122 113q38 70 38 181q0 114 -41 180q-58 94 -141 126q-80 32 -247 32q-74 0 -101 -10v-144l-1 -173l3 -270q0 -15 12 -44zM541 761q43 -7 109 -7q175 0 264 65t89 224q0 112 -85 187q-84 75 -255 75q-52 0 -130 -13q0 -44 2 -77 q7 -122 6 -279l-1 -98q0 -43 1 -77zM0 -128l2 94q45 9 68 12q77 12 123 31q17 27 21 51q9 66 9 194l-2 497q-5 256 -9 404q-1 87 -11 109q-1 4 -12 12q-18 12 -69 15q-30 2 -114 13l-4 83l260 6l380 13l45 1q5 0 14 0.5t14 0.5q1 0 21.5 -0.5t40.5 -0.5h74q88 0 191 -27 q43 -13 96 -39q57 -29 102 -76q44 -47 65 -104t21 -122q0 -70 -32 -128t-95 -105q-26 -20 -150 -77q177 -41 267 -146q92 -106 92 -236q0 -76 -29 -161q-21 -62 -71 -117q-66 -72 -140 -108q-73 -36 -203 -60q-82 -15 -198 -11l-197 4q-84 2 -298 -11q-33 -3 -272 -11z" /> +<glyph unicode="" horiz-adv-x="1024" d="M0 -126l17 85q4 1 77 20q76 19 116 39q29 37 41 101l27 139l56 268l12 64q8 44 17 84.5t16 67t12.5 46.5t9 30.5t3.5 11.5l29 157l16 63l22 135l8 50v38q-41 22 -144 28q-28 2 -38 4l19 103l317 -14q39 -2 73 -2q66 0 214 9q33 2 68 4.5t36 2.5q-2 -19 -6 -38 q-7 -29 -13 -51q-55 -19 -109 -31q-64 -16 -101 -31q-12 -31 -24 -88q-9 -44 -13 -82q-44 -199 -66 -306l-61 -311l-38 -158l-43 -235l-12 -45q-2 -7 1 -27q64 -15 119 -21q36 -5 66 -10q-1 -29 -7 -58q-7 -31 -9 -41q-18 0 -23 -1q-24 -2 -42 -2q-9 0 -28 3q-19 4 -145 17 l-198 2q-41 1 -174 -11q-74 -7 -98 -9z" /> +<glyph unicode="" horiz-adv-x="1792" d="M81 1407l54 -27q20 -5 211 -5h130l19 3l115 1l215 -1h293l34 -2q14 -1 28 7t21 16l7 8l42 1q15 0 28 -1v-104.5t1 -131.5l1 -100l-1 -58q0 -32 -4 -51q-39 -15 -68 -18q-25 43 -54 128q-8 24 -15.5 62.5t-11.5 65.5t-6 29q-13 15 -27 19q-7 2 -42.5 2t-103.5 -1t-111 -1 q-34 0 -67 -5q-10 -97 -8 -136l1 -152v-332l3 -359l-1 -147q-1 -46 11 -85q49 -25 89 -32q2 0 18 -5t44 -13t43 -12q30 -8 50 -18q5 -45 5 -50q0 -10 -3 -29q-14 -1 -34 -1q-110 0 -187 10q-72 8 -238 8q-88 0 -233 -14q-48 -4 -70 -4q-2 22 -2 26l-1 26v9q21 33 79 49 q139 38 159 50q9 21 12 56q8 192 6 433l-5 428q-1 62 -0.5 118.5t0.5 102.5t-2 57t-6 15q-6 5 
-14 6q-38 6 -148 6q-43 0 -100 -13.5t-73 -24.5q-13 -9 -22 -33t-22 -75t-24 -84q-6 -19 -19.5 -32t-20.5 -13q-44 27 -56 44v297v86zM1744 128q33 0 42 -18.5t-11 -44.5 l-126 -162q-20 -26 -49 -26t-49 26l-126 162q-20 26 -11 44.5t42 18.5h80v1024h-80q-33 0 -42 18.5t11 44.5l126 162q20 26 49 26t49 -26l126 -162q20 -26 11 -44.5t-42 -18.5h-80v-1024h80z" /> +<glyph unicode="" d="M81 1407l54 -27q20 -5 211 -5h130l19 3l115 1l446 -1h318l34 -2q14 -1 28 7t21 16l7 8l42 1q15 0 28 -1v-104.5t1 -131.5l1 -100l-1 -58q0 -32 -4 -51q-39 -15 -68 -18q-25 43 -54 128q-8 24 -15.5 62.5t-11.5 65.5t-6 29q-13 15 -27 19q-7 2 -58.5 2t-138.5 -1t-128 -1 q-94 0 -127 -5q-10 -97 -8 -136l1 -152v52l3 -359l-1 -147q-1 -46 11 -85q49 -25 89 -32q2 0 18 -5t44 -13t43 -12q30 -8 50 -18q5 -45 5 -50q0 -10 -3 -29q-14 -1 -34 -1q-110 0 -187 10q-72 8 -238 8q-82 0 -233 -13q-45 -5 -70 -5q-2 22 -2 26l-1 26v9q21 33 79 49 q139 38 159 50q9 21 12 56q6 137 6 433l-5 44q0 265 -2 278q-2 11 -6 15q-6 5 -14 6q-38 6 -148 6q-50 0 -168.5 -14t-132.5 -24q-13 -9 -22 -33t-22 -75t-24 -84q-6 -19 -19.5 -32t-20.5 -13q-44 27 -56 44v297v86zM1505 113q26 -20 26 -49t-26 -49l-162 -126 q-26 -20 -44.5 -11t-18.5 42v80h-1024v-80q0 -33 -18.5 -42t-44.5 11l-162 126q-26 20 -26 49t26 49l162 126q26 20 44.5 11t18.5 -42v-80h1024v80q0 33 18.5 42t44.5 -11z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 192v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1408 576v-128q0 -26 -19 -45t-45 -19h-1280q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1280q26 0 45 -19t19 -45zM1664 960v-128q0 -26 -19 -45 t-45 -19h-1536q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1536q26 0 45 -19t19 -45zM1280 1344v-128q0 -26 -19 -45t-45 -19h-1152q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1152q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 192v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1408 576v-128q0 -26 -19 -45t-45 -19h-896q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h896q26 0 45 
-19t19 -45zM1664 960v-128q0 -26 -19 -45t-45 -19 h-1408q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1408q26 0 45 -19t19 -45zM1280 1344v-128q0 -26 -19 -45t-45 -19h-640q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h640q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 192v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 576v-128q0 -26 -19 -45t-45 -19h-1280q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1280q26 0 45 -19t19 -45zM1792 960v-128q0 -26 -19 -45 t-45 -19h-1536q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1536q26 0 45 -19t19 -45zM1792 1344v-128q0 -26 -19 -45t-45 -19h-1152q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1152q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 192v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 576v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 960v-128q0 -26 -19 -45 t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 1344v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M256 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-192q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h192q13 0 22.5 -9.5t9.5 -22.5zM256 608v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-192q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h192q13 0 22.5 -9.5 t9.5 -22.5zM256 992v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-192q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h192q13 0 22.5 -9.5t9.5 -22.5zM1792 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1344 q13 0 22.5 -9.5t9.5 -22.5zM256 1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-192q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h192q13 0 22.5 -9.5t9.5 -22.5zM1792 608v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5 t22.5 9.5h1344q13 0 
22.5 -9.5t9.5 -22.5zM1792 992v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1344q13 0 22.5 -9.5t9.5 -22.5zM1792 1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v192 q0 13 9.5 22.5t22.5 9.5h1344q13 0 22.5 -9.5t9.5 -22.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M384 992v-576q0 -13 -9.5 -22.5t-22.5 -9.5q-14 0 -23 9l-288 288q-9 9 -9 23t9 23l288 288q9 9 23 9q13 0 22.5 -9.5t9.5 -22.5zM1792 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1728q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1728q13 0 22.5 -9.5 t9.5 -22.5zM1792 608v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1088q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1088q13 0 22.5 -9.5t9.5 -22.5zM1792 992v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1088q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1088 q13 0 22.5 -9.5t9.5 -22.5zM1792 1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1728q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1728q13 0 22.5 -9.5t9.5 -22.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M352 704q0 -14 -9 -23l-288 -288q-9 -9 -23 -9q-13 0 -22.5 9.5t-9.5 22.5v576q0 13 9.5 22.5t22.5 9.5q14 0 23 -9l288 -288q9 -9 9 -23zM1792 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1728q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1728q13 0 22.5 -9.5 t9.5 -22.5zM1792 608v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1088q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1088q13 0 22.5 -9.5t9.5 -22.5zM1792 992v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1088q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1088 q13 0 22.5 -9.5t9.5 -22.5zM1792 1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1728q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1728q13 0 22.5 -9.5t9.5 -22.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 1184v-1088q0 -42 -39 -59q-13 -5 -25 -5q-27 0 -45 19l-403 403v-166q0 -119 -84.5 -203.5t-203.5 -84.5h-704q-119 0 -203.5 84.5t-84.5 203.5v704q0 119 84.5 203.5t203.5 84.5h704q119 0 203.5 -84.5t84.5 -203.5v-165l403 402q18 19 45 19q12 0 25 -5 q39 -17 39 -59z" 
/> +<glyph unicode="" horiz-adv-x="1920" d="M640 960q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1664 576v-448h-1408v192l320 320l160 -160l512 512zM1760 1280h-1600q-13 0 -22.5 -9.5t-9.5 -22.5v-1216q0 -13 9.5 -22.5t22.5 -9.5h1600q13 0 22.5 9.5t9.5 22.5v1216 q0 13 -9.5 22.5t-22.5 9.5zM1920 1248v-1216q0 -66 -47 -113t-113 -47h-1600q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1600q66 0 113 -47t47 -113z" /> +<glyph unicode="" d="M363 0l91 91l-235 235l-91 -91v-107h128v-128h107zM886 928q0 22 -22 22q-10 0 -17 -7l-542 -542q-7 -7 -7 -17q0 -22 22 -22q10 0 17 7l542 542q7 7 7 17zM832 1120l416 -416l-832 -832h-416v416zM1515 1024q0 -53 -37 -90l-166 -166l-416 416l166 165q36 38 90 38 q53 0 91 -38l235 -234q37 -39 37 -91z" /> +<glyph unicode="" horiz-adv-x="1024" d="M768 896q0 106 -75 181t-181 75t-181 -75t-75 -181t75 -181t181 -75t181 75t75 181zM1024 896q0 -109 -33 -179l-364 -774q-16 -33 -47.5 -52t-67.5 -19t-67.5 19t-46.5 52l-365 774q-33 70 -33 179q0 212 150 362t362 150t362 -150t150 -362z" /> +<glyph unicode="" d="M768 96v1088q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1024" d="M512 384q0 36 -20 69q-1 1 -15.5 22.5t-25.5 38t-25 44t-21 50.5q-4 16 -21 16t-21 -16q-7 -23 -21 -50.5t-25 -44t-25.5 -38t-15.5 -22.5q-20 -33 -20 -69q0 -53 37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1024 512q0 -212 -150 -362t-362 -150t-362 150t-150 362 q0 145 81 275q6 9 62.5 90.5t101 151t99.5 178t83 201.5q9 30 34 47t51 17t51.5 -17t33.5 -47q28 -93 83 -201.5t99.5 -178t101 -151t62.5 -90.5q81 -127 81 -275z" /> +<glyph unicode="" horiz-adv-x="1792" d="M888 352l116 116l-152 152l-116 -116v-56h96v-96h56zM1328 1072q-16 16 -33 -1l-350 -350q-17 -17 -1 -33t33 1l350 350q17 17 1 33zM1408 478v-190q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 
84.5h832 q63 0 117 -25q15 -7 18 -23q3 -17 -9 -29l-49 -49q-14 -14 -32 -8q-23 6 -45 6h-832q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113v126q0 13 9 22l64 64q15 15 35 7t20 -29zM1312 1216l288 -288l-672 -672h-288v288zM1756 1084l-92 -92 l-288 288l92 92q28 28 68 28t68 -28l152 -152q28 -28 28 -68t-28 -68z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1408 547v-259q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h255v0q13 0 22.5 -9.5t9.5 -22.5q0 -27 -26 -32q-77 -26 -133 -60q-10 -4 -16 -4h-112q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832 q66 0 113 47t47 113v214q0 19 18 29q28 13 54 37q16 16 35 8q21 -9 21 -29zM1645 1043l-384 -384q-18 -19 -45 -19q-12 0 -25 5q-39 17 -39 59v192h-160q-323 0 -438 -131q-119 -137 -74 -473q3 -23 -20 -34q-8 -2 -12 -2q-16 0 -26 13q-10 14 -21 31t-39.5 68.5t-49.5 99.5 t-38.5 114t-17.5 122q0 49 3.5 91t14 90t28 88t47 81.5t68.5 74t94.5 61.5t124.5 48.5t159.5 30.5t196.5 11h160v192q0 42 39 59q13 5 25 5q26 0 45 -19l384 -384q19 -19 19 -45t-19 -45z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1408 606v-318q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h832q63 0 117 -25q15 -7 18 -23q3 -17 -9 -29l-49 -49q-10 -10 -23 -10q-3 0 -9 2q-23 6 -45 6h-832q-66 0 -113 -47t-47 -113v-832 q0 -66 47 -113t113 -47h832q66 0 113 47t47 113v254q0 13 9 22l64 64q10 10 23 10q6 0 12 -3q20 -8 20 -29zM1639 1095l-814 -814q-24 -24 -57 -24t-57 24l-430 430q-24 24 -24 57t24 57l110 110q24 24 57 24t57 -24l263 -263l647 647q24 24 57 24t57 -24l110 -110 q24 -24 24 -57t-24 -57z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 640q0 -26 -19 -45l-256 -256q-19 -19 -45 -19t-45 19t-19 45v128h-384v-384h128q26 0 45 -19t19 -45t-19 -45l-256 -256q-19 -19 -45 -19t-45 19l-256 256q-19 19 -19 45t19 45t45 19h128v384h-384v-128q0 -26 -19 -45t-45 -19t-45 19l-256 256q-19 19 -19 45 t19 45l256 256q19 19 45 19t45 -19t19 -45v-128h384v384h-128q-26 0 -45 19t-19 45t19 45l256 256q19 
19 45 19t45 -19l256 -256q19 -19 19 -45t-19 -45t-45 -19h-128v-384h384v128q0 26 19 45t45 19t45 -19l256 -256q19 -19 19 -45z" /> +<glyph unicode="" horiz-adv-x="1024" d="M979 1395q19 19 32 13t13 -32v-1472q0 -26 -13 -32t-32 13l-710 710q-9 9 -13 19v-678q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-678q4 11 13 19z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1747 1395q19 19 32 13t13 -32v-1472q0 -26 -13 -32t-32 13l-710 710q-9 9 -13 19v-710q0 -26 -13 -32t-32 13l-710 710q-9 9 -13 19v-678q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-678q4 11 13 19l710 710 q19 19 32 13t13 -32v-710q4 11 13 19z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1619 1395q19 19 32 13t13 -32v-1472q0 -26 -13 -32t-32 13l-710 710q-8 9 -13 19v-710q0 -26 -13 -32t-32 13l-710 710q-19 19 -19 45t19 45l710 710q19 19 32 13t13 -32v-710q5 11 13 19z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1384 609l-1328 -738q-23 -13 -39.5 -3t-16.5 36v1472q0 26 16.5 36t39.5 -3l1328 -738q23 -13 23 -31t-23 -31z" /> +<glyph unicode="" d="M1536 1344v-1408q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h512q26 0 45 -19t19 -45zM640 1344v-1408q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h512q26 0 45 -19t19 -45z" /> +<glyph unicode="" d="M1536 1344v-1408q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h1408q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1664" d="M45 -115q-19 -19 -32 -13t-13 32v1472q0 26 13 32t32 -13l710 -710q8 -8 13 -19v710q0 26 13 32t32 -13l710 -710q19 -19 19 -45t-19 -45l-710 -710q-19 -19 -32 -13t-13 32v710q-5 -10 -13 -19z" /> +<glyph unicode="" horiz-adv-x="1792" d="M45 -115q-19 -19 -32 -13t-13 32v1472q0 26 13 32t32 -13l710 -710q8 -8 13 -19v710q0 26 13 32t32 -13l710 -710q8 -8 13 -19v678q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-1408q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v678q-5 -10 -13 -19l-710 -710 q-19 -19 -32 -13t-13 32v710q-5 -10 
-13 -19z" /> +<glyph unicode="" horiz-adv-x="1024" d="M45 -115q-19 -19 -32 -13t-13 32v1472q0 26 13 32t32 -13l710 -710q8 -8 13 -19v678q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-1408q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v678q-5 -10 -13 -19z" /> +<glyph unicode="" horiz-adv-x="1538" d="M14 557l710 710q19 19 45 19t45 -19l710 -710q19 -19 13 -32t-32 -13h-1472q-26 0 -32 13t13 32zM1473 0h-1408q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h1408q26 0 45 -19t19 -45v-256q0 -26 -19 -45t-45 -19z" /> +<glyph unicode="" horiz-adv-x="1152" d="M742 -37l-652 651q-37 37 -37 90.5t37 90.5l652 651q37 37 90.5 37t90.5 -37l75 -75q37 -37 37 -90.5t-37 -90.5l-486 -486l486 -485q37 -38 37 -91t-37 -90l-75 -75q-37 -37 -90.5 -37t-90.5 37z" /> +<glyph unicode="" horiz-adv-x="1152" d="M1099 704q0 -52 -37 -91l-652 -651q-37 -37 -90 -37t-90 37l-76 75q-37 39 -37 91q0 53 37 90l486 486l-486 485q-37 39 -37 91q0 53 37 90l76 75q36 38 90 38t90 -38l652 -651q37 -37 37 -90z" /> +<glyph unicode="" d="M1216 576v128q0 26 -19 45t-45 19h-256v256q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-256h-256q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h256v-256q0 -26 19 -45t45 -19h128q26 0 45 19t19 45v256h256q26 0 45 19t19 45zM1536 640q0 -209 -103 -385.5 t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1216 576v128q0 26 -19 45t-45 19h-768q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h768q26 0 45 19t19 45zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5 t103 -385.5z" /> +<glyph unicode="" d="M1149 414q0 26 -19 45l-181 181l181 181q19 19 19 45q0 27 -19 46l-90 90q-19 19 -46 19q-26 0 -45 -19l-181 -181l-181 181q-19 19 -45 19q-27 0 -46 -19l-90 -90q-19 -19 -19 -46q0 -26 19 -45l181 -181l-181 -181q-19 -19 -19 -45q0 -27 19 -46l90 -90q19 -19 46 -19 q26 0 45 19l181 181l181 -181q19 -19 45 -19q27 0 46 19l90 90q19 19 19 
46zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1284 802q0 28 -18 46l-91 90q-19 19 -45 19t-45 -19l-408 -407l-226 226q-19 19 -45 19t-45 -19l-91 -90q-18 -18 -18 -46q0 -27 18 -45l362 -362q19 -19 45 -19q27 0 46 19l543 543q18 18 18 45zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103 t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M896 160v192q0 14 -9 23t-23 9h-192q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h192q14 0 23 9t9 23zM1152 832q0 88 -55.5 163t-138.5 116t-170 41q-243 0 -371 -213q-15 -24 8 -42l132 -100q7 -6 19 -6q16 0 25 12q53 68 86 92q34 24 86 24q48 0 85.5 -26t37.5 -59 q0 -38 -20 -61t-68 -45q-63 -28 -115.5 -86.5t-52.5 -125.5v-36q0 -14 9 -23t23 -9h192q14 0 23 9t9 23q0 19 21.5 49.5t54.5 49.5q32 18 49 28.5t46 35t44.5 48t28 60.5t12.5 81zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5 t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1024 160v160q0 14 -9 23t-23 9h-96v512q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-160q0 -14 9 -23t23 -9h96v-320h-96q-14 0 -23 -9t-9 -23v-160q0 -14 9 -23t23 -9h448q14 0 23 9t9 23zM896 1056v160q0 14 -9 23t-23 9h-192q-14 0 -23 -9t-9 -23v-160q0 -14 9 -23 t23 -9h192q14 0 23 9t9 23zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1197 512h-109q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h109q-32 108 -112.5 188.5t-188.5 112.5v-109q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v109q-108 -32 -188.5 -112.5t-112.5 -188.5h109q26 0 45 -19t19 -45v-128q0 -26 -19 -45t-45 -19h-109 q32 -108 112.5 -188.5t188.5 -112.5v109q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-109q108 32 188.5 112.5t112.5 188.5zM1536 704v-128q0 -26 
-19 -45t-45 -19h-143q-37 -161 -154.5 -278.5t-278.5 -154.5v-143q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v143 q-161 37 -278.5 154.5t-154.5 278.5h-143q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h143q37 161 154.5 278.5t278.5 154.5v143q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-143q161 -37 278.5 -154.5t154.5 -278.5h143q26 0 45 -19t19 -45z" /> +<glyph unicode="" d="M1097 457l-146 -146q-10 -10 -23 -10t-23 10l-137 137l-137 -137q-10 -10 -23 -10t-23 10l-146 146q-10 10 -10 23t10 23l137 137l-137 137q-10 10 -10 23t10 23l146 146q10 10 23 10t23 -10l137 -137l137 137q10 10 23 10t23 -10l146 -146q10 -10 10 -23t-10 -23 l-137 -137l137 -137q10 -10 10 -23t-10 -23zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5 t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1171 723l-422 -422q-19 -19 -45 -19t-45 19l-294 294q-19 19 -19 45t19 45l102 102q19 19 45 19t45 -19l147 -147l275 275q19 19 45 19t45 -19l102 -102q19 -19 19 -45t-19 -45zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198 t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1312 643q0 161 -87 295l-754 -753q137 -89 297 -89q111 0 211.5 43.5t173.5 116.5t116 174.5t43 212.5zM313 344l755 754q-135 91 -300 91q-148 0 -273 -73t-198 -199t-73 -274q0 -162 89 -299zM1536 643q0 -157 -61 -300t-163.5 -246t-245 -164t-298.5 -61t-298.5 61 t-245 164t-163.5 246t-61 300t61 299.5t163.5 245.5t245 164t298.5 61t298.5 -61t245 -164t163.5 -245.5t61 -299.5z" /> +<glyph unicode="" d="M1536 640v-128q0 -53 -32.5 -90.5t-84.5 -37.5h-704l293 -294q38 -36 38 -90t-38 -90l-75 -76q-37 -37 -90 -37q-52 0 -91 37l-651 652q-37 37 -37 90q0 52 37 91l651 650q38 38 91 38q52 0 90 -38l75 
-74q38 -38 38 -91t-38 -91l-293 -293h704q52 0 84.5 -37.5 t32.5 -90.5z" /> +<glyph unicode="" d="M1472 576q0 -54 -37 -91l-651 -651q-39 -37 -91 -37q-51 0 -90 37l-75 75q-38 38 -38 91t38 91l293 293h-704q-52 0 -84.5 37.5t-32.5 90.5v128q0 53 32.5 90.5t84.5 37.5h704l-293 294q-38 36 -38 90t38 90l75 75q38 38 90 38q53 0 91 -38l651 -651q37 -35 37 -90z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1611 565q0 -51 -37 -90l-75 -75q-38 -38 -91 -38q-54 0 -90 38l-294 293v-704q0 -52 -37.5 -84.5t-90.5 -32.5h-128q-53 0 -90.5 32.5t-37.5 84.5v704l-294 -293q-36 -38 -90 -38t-90 38l-75 75q-38 38 -38 90q0 53 38 91l651 651q35 37 90 37q54 0 91 -37l651 -651 q37 -39 37 -91z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1611 704q0 -53 -37 -90l-651 -652q-39 -37 -91 -37q-53 0 -90 37l-651 652q-38 36 -38 90q0 53 38 91l74 75q39 37 91 37q53 0 90 -37l294 -294v704q0 52 38 90t90 38h128q52 0 90 -38t38 -90v-704l294 294q37 37 90 37q52 0 91 -37l75 -75q37 -39 37 -91z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 896q0 -26 -19 -45l-512 -512q-19 -19 -45 -19t-45 19t-19 45v256h-224q-98 0 -175.5 -6t-154 -21.5t-133 -42.5t-105.5 -69.5t-80 -101t-48.5 -138.5t-17.5 -181q0 -55 5 -123q0 -6 2.5 -23.5t2.5 -26.5q0 -15 -8.5 -25t-23.5 -10q-16 0 -28 17q-7 9 -13 22 t-13.5 30t-10.5 24q-127 285 -127 451q0 199 53 333q162 403 875 403h224v256q0 26 19 45t45 19t45 -19l512 -512q19 -19 19 -45z" /> +<glyph unicode="" d="M755 480q0 -13 -10 -23l-332 -332l144 -144q19 -19 19 -45t-19 -45t-45 -19h-448q-26 0 -45 19t-19 45v448q0 26 19 45t45 19t45 -19l144 -144l332 332q10 10 23 10t23 -10l114 -114q10 -10 10 -23zM1536 1344v-448q0 -26 -19 -45t-45 -19t-45 19l-144 144l-332 -332 q-10 -10 -23 -10t-23 10l-114 114q-10 10 -10 23t10 23l332 332l-144 144q-19 19 -19 45t19 45t45 19h448q26 0 45 -19t19 -45z" /> +<glyph unicode="" d="M768 576v-448q0 -26 -19 -45t-45 -19t-45 19l-144 144l-332 -332q-10 -10 -23 -10t-23 10l-114 114q-10 10 -10 23t10 23l332 332l-144 144q-19 19 -19 45t19 45t45 19h448q26 0 45 -19t19 -45zM1523 1248q0 -13 -10 -23l-332 -332l144 -144q19 
-19 19 -45t-19 -45 t-45 -19h-448q-26 0 -45 19t-19 45v448q0 26 19 45t45 19t45 -19l144 -144l332 332q10 10 23 10t23 -10l114 -114q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1408 800v-192q0 -40 -28 -68t-68 -28h-416v-416q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v416h-416q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h416v416q0 40 28 68t68 28h192q40 0 68 -28t28 -68v-416h416q40 0 68 -28t28 -68z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1408 800v-192q0 -40 -28 -68t-68 -28h-1216q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h1216q40 0 68 -28t28 -68z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1482 486q46 -26 59.5 -77.5t-12.5 -97.5l-64 -110q-26 -46 -77.5 -59.5t-97.5 12.5l-266 153v-307q0 -52 -38 -90t-90 -38h-128q-52 0 -90 38t-38 90v307l-266 -153q-46 -26 -97.5 -12.5t-77.5 59.5l-64 110q-26 46 -12.5 97.5t59.5 77.5l266 154l-266 154 q-46 26 -59.5 77.5t12.5 97.5l64 110q26 46 77.5 59.5t97.5 -12.5l266 -153v307q0 52 38 90t90 38h128q52 0 90 -38t38 -90v-307l266 153q46 26 97.5 12.5t77.5 -59.5l64 -110q26 -46 12.5 -97.5t-59.5 -77.5l-266 -154z" /> +<glyph unicode="" d="M768 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5t-103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103zM896 161v190q0 14 -9 23.5t-22 9.5h-192q-13 0 -23 -10t-10 -23v-190q0 -13 10 -23t23 -10h192 q13 0 22 9.5t9 23.5zM894 505l18 621q0 12 -10 18q-10 8 -24 8h-220q-14 0 -24 -8q-10 -6 -10 -18l17 -621q0 -10 10 -17.5t24 -7.5h185q14 0 23.5 7.5t10.5 17.5z" /> +<glyph unicode="" d="M928 180v56v468v192h-320v-192v-468v-56q0 -25 18 -38.5t46 -13.5h192q28 0 46 13.5t18 38.5zM472 1024h195l-126 161q-26 31 -69 31q-40 0 -68 -28t-28 -68t28 -68t68 -28zM1160 1120q0 40 -28 68t-68 28q-43 0 -69 -31l-125 -161h194q40 0 68 28t28 68zM1536 864v-320 q0 -14 -9 -23t-23 -9h-96v-416q0 -40 -28 -68t-68 -28h-1088q-40 0 -68 28t-28 68v416h-96q-14 0 -23 9t-9 23v320q0 14 9 23t23 9h440q-93 0 -158.5 65.5t-65.5 158.5t65.5 158.5t158.5 65.5q107 0 168 -77l128 -165l128 165q61 77 168 77q93 0 158.5 
-65.5t65.5 -158.5 t-65.5 -158.5t-158.5 -65.5h440q14 0 23 -9t9 -23z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1280 832q0 26 -19 45t-45 19q-172 0 -318 -49.5t-259.5 -134t-235.5 -219.5q-19 -21 -19 -45q0 -26 19 -45t45 -19q24 0 45 19q27 24 74 71t67 66q137 124 268.5 176t313.5 52q26 0 45 19t19 45zM1792 1030q0 -95 -20 -193q-46 -224 -184.5 -383t-357.5 -268 q-214 -108 -438 -108q-148 0 -286 47q-15 5 -88 42t-96 37q-16 0 -39.5 -32t-45 -70t-52.5 -70t-60 -32q-30 0 -51 11t-31 24t-27 42q-2 4 -6 11t-5.5 10t-3 9.5t-1.5 13.5q0 35 31 73.5t68 65.5t68 56t31 48q0 4 -14 38t-16 44q-9 51 -9 104q0 115 43.5 220t119 184.5 t170.5 139t204 95.5q55 18 145 25.5t179.5 9t178.5 6t163.5 24t113.5 56.5l29.5 29.5t29.5 28t27 20t36.5 16t43.5 4.5q39 0 70.5 -46t47.5 -112t24 -124t8 -96z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1408 -160v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h1344q13 0 22.5 -9.5t9.5 -22.5zM1152 896q0 -78 -24.5 -144t-64 -112.5t-87.5 -88t-96 -77.5t-87.5 -72t-64 -81.5t-24.5 -96.5q0 -96 67 -224l-4 1l1 -1 q-90 41 -160 83t-138.5 100t-113.5 122.5t-72.5 150.5t-27.5 184q0 78 24.5 144t64 112.5t87.5 88t96 77.5t87.5 72t64 81.5t24.5 96.5q0 94 -66 224l3 -1l-1 1q90 -41 160 -83t138.5 -100t113.5 -122.5t72.5 -150.5t27.5 -184z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1664 576q-152 236 -381 353q61 -104 61 -225q0 -185 -131.5 -316.5t-316.5 -131.5t-316.5 131.5t-131.5 316.5q0 121 61 225q-229 -117 -381 -353q133 -205 333.5 -326.5t434.5 -121.5t434.5 121.5t333.5 326.5zM944 960q0 20 -14 34t-34 14q-125 0 -214.5 -89.5 t-89.5 -214.5q0 -20 14 -34t34 -14t34 14t14 34q0 86 61 147t147 61q20 0 34 14t14 34zM1792 576q0 -34 -20 -69q-140 -230 -376.5 -368.5t-499.5 -138.5t-499.5 139t-376.5 368q-20 35 -20 69t20 69q140 229 376.5 368t499.5 139t499.5 -139t376.5 -368q20 -35 20 -69z" /> +<glyph unicode="" horiz-adv-x="1792" d="M555 201l78 141q-87 63 -136 159t-49 203q0 121 61 225q-229 -117 -381 -353q167 -258 427 -375zM944 960q0 20 -14 34t-34 14q-125 0 -214.5 -89.5t-89.5 -214.5q0 
-20 14 -34t34 -14t34 14t14 34q0 86 61 147t147 61q20 0 34 14t14 34zM1307 1151q0 -7 -1 -9 q-105 -188 -315 -566t-316 -567l-49 -89q-10 -16 -28 -16q-12 0 -134 70q-16 10 -16 28q0 12 44 87q-143 65 -263.5 173t-208.5 245q-20 31 -20 69t20 69q153 235 380 371t496 136q89 0 180 -17l54 97q10 16 28 16q5 0 18 -6t31 -15.5t33 -18.5t31.5 -18.5t19.5 -11.5 q16 -10 16 -27zM1344 704q0 -139 -79 -253.5t-209 -164.5l280 502q8 -45 8 -84zM1792 576q0 -35 -20 -69q-39 -64 -109 -145q-150 -172 -347.5 -267t-419.5 -95l74 132q212 18 392.5 137t301.5 307q-115 179 -282 294l63 112q95 -64 182.5 -153t144.5 -184q20 -34 20 -69z " /> +<glyph unicode="" horiz-adv-x="1792" d="M1024 161v190q0 14 -9.5 23.5t-22.5 9.5h-192q-13 0 -22.5 -9.5t-9.5 -23.5v-190q0 -14 9.5 -23.5t22.5 -9.5h192q13 0 22.5 9.5t9.5 23.5zM1022 535l18 459q0 12 -10 19q-13 11 -24 11h-220q-11 0 -24 -11q-10 -7 -10 -21l17 -457q0 -10 10 -16.5t24 -6.5h185 q14 0 23.5 6.5t10.5 16.5zM1008 1469l768 -1408q35 -63 -2 -126q-17 -29 -46.5 -46t-63.5 -17h-1536q-34 0 -63.5 17t-46.5 46q-37 63 -2 126l768 1408q17 31 47 49t65 18t65 -18t47 -49z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1376 1376q44 -52 12 -148t-108 -172l-161 -161l160 -696q5 -19 -12 -33l-128 -96q-7 -6 -19 -6q-4 0 -7 1q-15 3 -21 16l-279 508l-259 -259l53 -194q5 -17 -8 -31l-96 -96q-9 -9 -23 -9h-2q-15 2 -24 13l-189 252l-252 189q-11 7 -13 23q-1 13 9 25l96 97q9 9 23 9 q6 0 8 -1l194 -53l259 259l-508 279q-14 8 -17 24q-2 16 9 27l128 128q14 13 30 8l665 -159l160 160q76 76 172 108t148 -12z" /> +<glyph unicode="" horiz-adv-x="1664" d="M128 -128h288v288h-288v-288zM480 -128h320v288h-320v-288zM128 224h288v320h-288v-320zM480 224h320v320h-320v-320zM128 608h288v288h-288v-288zM864 -128h320v288h-320v-288zM480 608h320v288h-320v-288zM1248 -128h288v288h-288v-288zM864 224h320v320h-320v-320z M512 1088v288q0 13 -9.5 22.5t-22.5 9.5h-64q-13 0 -22.5 -9.5t-9.5 -22.5v-288q0 -13 9.5 -22.5t22.5 -9.5h64q13 0 22.5 9.5t9.5 22.5zM1248 224h288v320h-288v-320zM864 608h320v288h-320v-288zM1248 608h288v288h-288v-288zM1280 1088v288q0 13 -9.5 
22.5t-22.5 9.5h-64 q-13 0 -22.5 -9.5t-9.5 -22.5v-288q0 -13 9.5 -22.5t22.5 -9.5h64q13 0 22.5 9.5t9.5 22.5zM1664 1152v-1280q0 -52 -38 -90t-90 -38h-1408q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h128v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h384v96q0 66 47 113t113 47 h64q66 0 113 -47t47 -113v-96h128q52 0 90 -38t38 -90z" /> +<glyph unicode="" horiz-adv-x="1792" d="M666 1055q-60 -92 -137 -273q-22 45 -37 72.5t-40.5 63.5t-51 56.5t-63 35t-81.5 14.5h-224q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h224q250 0 410 -225zM1792 256q0 -14 -9 -23l-320 -320q-9 -9 -23 -9q-13 0 -22.5 9.5t-9.5 22.5v192q-32 0 -85 -0.5t-81 -1t-73 1 t-71 5t-64 10.5t-63 18.5t-58 28.5t-59 40t-55 53.5t-56 69.5q59 93 136 273q22 -45 37 -72.5t40.5 -63.5t51 -56.5t63 -35t81.5 -14.5h256v192q0 14 9 23t23 9q12 0 24 -10l319 -319q9 -9 9 -23zM1792 1152q0 -14 -9 -23l-320 -320q-9 -9 -23 -9q-13 0 -22.5 9.5t-9.5 22.5 v192h-256q-48 0 -87 -15t-69 -45t-51 -61.5t-45 -77.5q-32 -62 -78 -171q-29 -66 -49.5 -111t-54 -105t-64 -100t-74 -83t-90 -68.5t-106.5 -42t-128 -16.5h-224q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h224q48 0 87 15t69 45t51 61.5t45 77.5q32 62 78 171q29 66 49.5 111 t54 105t64 100t74 83t90 68.5t106.5 42t128 16.5h256v192q0 14 9 23t23 9q12 0 24 -10l319 -319q9 -9 9 -23z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 640q0 -174 -120 -321.5t-326 -233t-450 -85.5q-70 0 -145 8q-198 -175 -460 -242q-49 -14 -114 -22q-17 -2 -30.5 9t-17.5 29v1q-3 4 -0.5 12t2 10t4.5 9.5l6 9t7 8.5t8 9q7 8 31 34.5t34.5 38t31 39.5t32.5 51t27 59t26 76q-157 89 -247.5 220t-90.5 281 q0 130 71 248.5t191 204.5t286 136.5t348 50.5q244 0 450 -85.5t326 -233t120 -321.5z" /> +<glyph unicode="" d="M1536 704v-128q0 -201 -98.5 -362t-274 -251.5t-395.5 -90.5t-395.5 90.5t-274 251.5t-98.5 362v128q0 26 19 45t45 19h384q26 0 45 -19t19 -45v-128q0 -52 23.5 -90t53.5 -57t71 -30t64 -13t44 -2t44 2t64 13t71 30t53.5 57t23.5 90v128q0 26 19 45t45 19h384 q26 0 45 -19t19 -45zM512 1344v-384q0 -26 -19 -45t-45 -19h-384q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h384q26 0 45 -19t19 
-45zM1536 1344v-384q0 -26 -19 -45t-45 -19h-384q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h384q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1611 320q0 -53 -37 -90l-75 -75q-38 -38 -91 -38q-54 0 -90 38l-486 485l-486 -485q-36 -38 -90 -38t-90 38l-75 75q-38 36 -38 90q0 53 38 91l651 651q37 37 90 37q52 0 91 -37l650 -651q38 -38 38 -91z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1611 832q0 -53 -37 -90l-651 -651q-38 -38 -91 -38q-54 0 -90 38l-651 651q-38 36 -38 90q0 53 38 91l74 75q39 37 91 37q53 0 90 -37l486 -486l486 486q37 37 90 37q52 0 91 -37l75 -75q37 -39 37 -91z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1280 32q0 -13 -9.5 -22.5t-22.5 -9.5h-960q-8 0 -13.5 2t-9 7t-5.5 8t-3 11.5t-1 11.5v13v11v160v416h-192q-26 0 -45 19t-19 45q0 24 15 41l320 384q19 22 49 22t49 -22l320 -384q15 -17 15 -41q0 -26 -19 -45t-45 -19h-192v-384h576q16 0 25 -11l160 -192q7 -11 7 -21 zM1920 448q0 -24 -15 -41l-320 -384q-20 -23 -49 -23t-49 23l-320 384q-15 17 -15 41q0 26 19 45t45 19h192v384h-576q-16 0 -25 12l-160 192q-7 9 -7 20q0 13 9.5 22.5t22.5 9.5h960q8 0 13.5 -2t9 -7t5.5 -8t3 -11.5t1 -11.5v-13v-11v-160v-416h192q26 0 45 -19t19 -45z " /> +<glyph unicode="" horiz-adv-x="1664" d="M640 0q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1536 0q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1664 1088v-512q0 -24 -16 -42.5t-41 -21.5 l-1044 -122q1 -7 4.5 -21.5t6 -26.5t2.5 -22q0 -16 -24 -64h920q26 0 45 -19t19 -45t-19 -45t-45 -19h-1024q-26 0 -45 19t-19 45q0 14 11 39.5t29.5 59.5t20.5 38l-177 823h-204q-26 0 -45 19t-19 45t19 45t45 19h256q16 0 28.5 -6.5t20 -15.5t13 -24.5t7.5 -26.5 t5.5 -29.5t4.5 -25.5h1201q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1664 928v-704q0 -92 -66 -158t-158 -66h-1216q-92 0 -158 66t-66 158v960q0 92 66 158t158 66h320q92 0 158 -66t66 -158v-32h672q92 0 158 -66t66 -158z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1879 584q0 -31 -31 -66l-336 -396q-43 -51 
-120.5 -86.5t-143.5 -35.5h-1088q-34 0 -60.5 13t-26.5 43q0 31 31 66l336 396q43 51 120.5 86.5t143.5 35.5h1088q34 0 60.5 -13t26.5 -43zM1536 928v-160h-832q-94 0 -197 -47.5t-164 -119.5l-337 -396l-5 -6q0 4 -0.5 12.5 t-0.5 12.5v960q0 92 66 158t158 66h320q92 0 158 -66t66 -158v-32h544q92 0 158 -66t66 -158z" /> +<glyph unicode="" horiz-adv-x="768" d="M704 1216q0 -26 -19 -45t-45 -19h-128v-1024h128q26 0 45 -19t19 -45t-19 -45l-256 -256q-19 -19 -45 -19t-45 19l-256 256q-19 19 -19 45t19 45t45 19h128v1024h-128q-26 0 -45 19t-19 45t19 45l256 256q19 19 45 19t45 -19l256 -256q19 -19 19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 640q0 -26 -19 -45l-256 -256q-19 -19 -45 -19t-45 19t-19 45v128h-1024v-128q0 -26 -19 -45t-45 -19t-45 19l-256 256q-19 19 -19 45t19 45l256 256q19 19 45 19t45 -19t19 -45v-128h1024v128q0 26 19 45t45 19t45 -19l256 -256q19 -19 19 -45z" /> +<glyph unicode="" horiz-adv-x="1920" d="M512 512v-384h-256v384h256zM896 1024v-896h-256v896h256zM1280 768v-640h-256v640h256zM1664 1152v-1024h-256v1024h256zM1792 32v1216q0 13 -9.5 22.5t-22.5 9.5h-1600q-13 0 -22.5 -9.5t-9.5 -22.5v-1216q0 -13 9.5 -22.5t22.5 -9.5h1600q13 0 22.5 9.5t9.5 22.5z M1920 1248v-1216q0 -66 -47 -113t-113 -47h-1600q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1600q66 0 113 -47t47 -113z" /> +<glyph unicode="" d="M1280 926q-56 -25 -121 -34q68 40 93 117q-65 -38 -134 -51q-61 66 -153 66q-87 0 -148.5 -61.5t-61.5 -148.5q0 -29 5 -48q-129 7 -242 65t-192 155q-29 -50 -29 -106q0 -114 91 -175q-47 1 -100 26v-2q0 -75 50 -133.5t123 -72.5q-29 -8 -51 -8q-13 0 -39 4 q21 -63 74.5 -104t121.5 -42q-116 -90 -261 -90q-26 0 -50 3q148 -94 322 -94q112 0 210 35.5t168 95t120.5 137t75 162t24.5 168.5q0 18 -1 27q63 45 105 109zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5 t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M1307 618l23 219h-198v109q0 49 15.5 68.5t71.5 19.5h110v219h-175q-152 0 -218 -72t-66 -213v-131h-131v-219h131v-635h262v635h175zM1536 
1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960 q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M928 704q0 14 -9 23t-23 9q-66 0 -113 -47t-47 -113q0 -14 9 -23t23 -9t23 9t9 23q0 40 28 68t68 28q14 0 23 9t9 23zM1152 574q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75t75 -181zM128 0h1536v128h-1536v-128zM1280 574q0 159 -112.5 271.5 t-271.5 112.5t-271.5 -112.5t-112.5 -271.5t112.5 -271.5t271.5 -112.5t271.5 112.5t112.5 271.5zM256 1216h384v128h-384v-128zM128 1024h1536v118v138h-828l-64 -128h-644v-128zM1792 1280v-1280q0 -53 -37.5 -90.5t-90.5 -37.5h-1536q-53 0 -90.5 37.5t-37.5 90.5v1280 q0 53 37.5 90.5t90.5 37.5h1536q53 0 90.5 -37.5t37.5 -90.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M832 1024q0 80 -56 136t-136 56t-136 -56t-56 -136q0 -42 19 -83q-41 19 -83 19q-80 0 -136 -56t-56 -136t56 -136t136 -56t136 56t56 136q0 42 -19 83q41 -19 83 -19q80 0 136 56t56 136zM1683 320q0 -17 -49 -66t-66 -49q-9 0 -28.5 16t-36.5 33t-38.5 40t-24.5 26 l-96 -96l220 -220q28 -28 28 -68q0 -42 -39 -81t-81 -39q-40 0 -68 28l-671 671q-176 -131 -365 -131q-163 0 -265.5 102.5t-102.5 265.5q0 160 95 313t248 248t313 95q163 0 265.5 -102.5t102.5 -265.5q0 -189 -131 -365l355 -355l96 96q-3 3 -26 24.5t-40 38.5t-33 36.5 t-16 28.5q0 17 49 66t66 49q13 0 23 -10q6 -6 46 -44.5t82 -79.5t86.5 -86t73 -78t28.5 -41z" /> +<glyph unicode="" horiz-adv-x="1920" d="M896 640q0 106 -75 181t-181 75t-181 -75t-75 -181t75 -181t181 -75t181 75t75 181zM1664 128q0 52 -38 90t-90 38t-90 -38t-38 -90q0 -53 37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1664 1152q0 52 -38 90t-90 38t-90 -38t-38 -90q0 -53 37.5 -90.5t90.5 -37.5 t90.5 37.5t37.5 90.5zM1280 731v-185q0 -10 -7 -19.5t-16 -10.5l-155 -24q-11 -35 -32 -76q34 -48 90 -115q7 -10 7 -20q0 -12 -7 -19q-23 -30 -82.5 -89.5t-78.5 -59.5q-11 0 -21 7l-115 90q-37 -19 -77 -31q-11 -108 -23 -155q-7 -24 -30 -24h-186q-11 0 -20 7.5t-10 17.5 l-23 153q-34 10 -75 31l-118 -89q-7 -7 -20 -7q-11 0 -21 8q-144 133 
-144 160q0 9 7 19q10 14 41 53t47 61q-23 44 -35 82l-152 24q-10 1 -17 9.5t-7 19.5v185q0 10 7 19.5t16 10.5l155 24q11 35 32 76q-34 48 -90 115q-7 11 -7 20q0 12 7 20q22 30 82 89t79 59q11 0 21 -7 l115 -90q34 18 77 32q11 108 23 154q7 24 30 24h186q11 0 20 -7.5t10 -17.5l23 -153q34 -10 75 -31l118 89q8 7 20 7q11 0 21 -8q144 -133 144 -160q0 -9 -7 -19q-12 -16 -42 -54t-45 -60q23 -48 34 -82l152 -23q10 -2 17 -10.5t7 -19.5zM1920 198v-140q0 -16 -149 -31 q-12 -27 -30 -52q51 -113 51 -138q0 -4 -4 -7q-122 -71 -124 -71q-8 0 -46 47t-52 68q-20 -2 -30 -2t-30 2q-14 -21 -52 -68t-46 -47q-2 0 -124 71q-4 3 -4 7q0 25 51 138q-18 25 -30 52q-149 15 -149 31v140q0 16 149 31q13 29 30 52q-51 113 -51 138q0 4 4 7q4 2 35 20 t59 34t30 16q8 0 46 -46.5t52 -67.5q20 2 30 2t30 -2q51 71 92 112l6 2q4 0 124 -70q4 -3 4 -7q0 -25 -51 -138q17 -23 30 -52q149 -15 149 -31zM1920 1222v-140q0 -16 -149 -31q-12 -27 -30 -52q51 -113 51 -138q0 -4 -4 -7q-122 -71 -124 -71q-8 0 -46 47t-52 68 q-20 -2 -30 -2t-30 2q-14 -21 -52 -68t-46 -47q-2 0 -124 71q-4 3 -4 7q0 25 51 138q-18 25 -30 52q-149 15 -149 31v140q0 16 149 31q13 29 30 52q-51 113 -51 138q0 4 4 7q4 2 35 20t59 34t30 16q8 0 46 -46.5t52 -67.5q20 2 30 2t30 -2q51 71 92 112l6 2q4 0 124 -70 q4 -3 4 -7q0 -25 -51 -138q17 -23 30 -52q149 -15 149 -31z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1408 768q0 -139 -94 -257t-256.5 -186.5t-353.5 -68.5q-86 0 -176 16q-124 -88 -278 -128q-36 -9 -86 -16h-3q-11 0 -20.5 8t-11.5 21q-1 3 -1 6.5t0.5 6.5t2 6l2.5 5t3.5 5.5t4 5t4.5 5t4 4.5q5 6 23 25t26 29.5t22.5 29t25 38.5t20.5 44q-124 72 -195 177t-71 224 q0 139 94 257t256.5 186.5t353.5 68.5t353.5 -68.5t256.5 -186.5t94 -257zM1792 512q0 -120 -71 -224.5t-195 -176.5q10 -24 20.5 -44t25 -38.5t22.5 -29t26 -29.5t23 -25q1 -1 4 -4.5t4.5 -5t4 -5t3.5 -5.5l2.5 -5t2 -6t0.5 -6.5t-1 -6.5q-3 -14 -13 -22t-22 -7 q-50 7 -86 16q-154 40 -278 128q-90 -16 -176 -16q-271 0 -472 132q58 -4 88 -4q161 0 309 45t264 129q125 92 192 212t67 254q0 77 -23 152q129 -71 204 -178t75 -230z" /> +<glyph unicode="" d="M256 192q0 26 -19 45t-45 19t-45 
-19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 768q0 51 -39 89.5t-89 38.5h-352q0 58 48 159.5t48 160.5q0 98 -32 145t-128 47q-26 -26 -38 -85t-30.5 -125.5t-59.5 -109.5q-22 -23 -77 -91q-4 -5 -23 -30t-31.5 -41t-34.5 -42.5 t-40 -44t-38.5 -35.5t-40 -27t-35.5 -9h-32v-640h32q13 0 31.5 -3t33 -6.5t38 -11t35 -11.5t35.5 -12.5t29 -10.5q211 -73 342 -73h121q192 0 192 167q0 26 -5 56q30 16 47.5 52.5t17.5 73.5t-18 69q53 50 53 119q0 25 -10 55.5t-25 47.5q32 1 53.5 47t21.5 81zM1536 769 q0 -89 -49 -163q9 -33 9 -69q0 -77 -38 -144q3 -21 3 -43q0 -101 -60 -178q1 -139 -85 -219.5t-227 -80.5h-36h-93q-96 0 -189.5 22.5t-216.5 65.5q-116 40 -138 40h-288q-53 0 -90.5 37.5t-37.5 90.5v640q0 53 37.5 90.5t90.5 37.5h274q36 24 137 155q58 75 107 128 q24 25 35.5 85.5t30.5 126.5t62 108q39 37 90 37q84 0 151 -32.5t102 -101.5t35 -186q0 -93 -48 -192h176q104 0 180 -76t76 -179z" /> +<glyph unicode="" d="M256 1088q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 512q0 35 -21.5 81t-53.5 47q15 17 25 47.5t10 55.5q0 69 -53 119q18 32 18 69t-17.5 73.5t-47.5 52.5q5 30 5 56q0 85 -49 126t-136 41h-128q-131 0 -342 -73q-5 -2 -29 -10.5 t-35.5 -12.5t-35 -11.5t-38 -11t-33 -6.5t-31.5 -3h-32v-640h32q16 0 35.5 -9t40 -27t38.5 -35.5t40 -44t34.5 -42.5t31.5 -41t23 -30q55 -68 77 -91q41 -43 59.5 -109.5t30.5 -125.5t38 -85q96 0 128 47t32 145q0 59 -48 160.5t-48 159.5h352q50 0 89 38.5t39 89.5z M1536 511q0 -103 -76 -179t-180 -76h-176q48 -99 48 -192q0 -118 -35 -186q-35 -69 -102 -101.5t-151 -32.5q-51 0 -90 37q-34 33 -54 82t-25.5 90.5t-17.5 84.5t-31 64q-48 50 -107 127q-101 131 -137 155h-274q-53 0 -90.5 37.5t-37.5 90.5v640q0 53 37.5 90.5t90.5 37.5 h288q22 0 138 40q128 44 223 66t200 22h112q140 0 226.5 -79t85.5 -216v-5q60 -77 60 -178q0 -22 -3 -43q38 -67 38 -144q0 -36 -9 -69q49 -74 49 -163z" /> +<glyph unicode="" horiz-adv-x="896" d="M832 1504v-1339l-449 -236q-22 -12 -40 -12q-21 0 -31.5 14.5t-10.5 35.5q0 6 2 20l86 500l-364 354q-25 27 -25 48q0 37 56 46l502 73l225 455q19 41 49 41z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1664 
940q0 81 -21.5 143t-55 98.5t-81.5 59.5t-94 31t-98 8t-112 -25.5t-110.5 -64t-86.5 -72t-60 -61.5q-18 -22 -49 -22t-49 22q-24 28 -60 61.5t-86.5 72t-110.5 64t-112 25.5t-98 -8t-94 -31t-81.5 -59.5t-55 -98.5t-21.5 -143q0 -168 187 -355l581 -560l580 559 q188 188 188 356zM1792 940q0 -221 -229 -450l-623 -600q-18 -18 -44 -18t-44 18l-624 602q-10 8 -27.5 26t-55.5 65.5t-68 97.5t-53.5 121t-23.5 138q0 220 127 344t351 124q62 0 126.5 -21.5t120 -58t95.5 -68.5t76 -68q36 36 76 68t95.5 68.5t120 58t126.5 21.5 q224 0 351 -124t127 -344z" /> +<glyph unicode="" horiz-adv-x="1664" d="M640 96q0 -4 1 -20t0.5 -26.5t-3 -23.5t-10 -19.5t-20.5 -6.5h-320q-119 0 -203.5 84.5t-84.5 203.5v704q0 119 84.5 203.5t203.5 84.5h320q13 0 22.5 -9.5t9.5 -22.5q0 -4 1 -20t0.5 -26.5t-3 -23.5t-10 -19.5t-20.5 -6.5h-320q-66 0 -113 -47t-47 -113v-704 q0 -66 47 -113t113 -47h288h11h13t11.5 -1t11.5 -3t8 -5.5t7 -9t2 -13.5zM1568 640q0 -26 -19 -45l-544 -544q-19 -19 -45 -19t-45 19t-19 45v288h-448q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h448v288q0 26 19 45t45 19t45 -19l544 -544q19 -19 19 -45z" /> +<glyph unicode="" d="M237 122h231v694h-231v-694zM483 1030q-1 52 -36 86t-93 34t-94.5 -34t-36.5 -86q0 -51 35.5 -85.5t92.5 -34.5h1q59 0 95 34.5t36 85.5zM1068 122h231v398q0 154 -73 233t-193 79q-136 0 -209 -117h2v101h-231q3 -66 0 -694h231v388q0 38 7 56q15 35 45 59.5t74 24.5 q116 0 116 -157v-371zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1152" d="M480 672v448q0 14 -9 23t-23 9t-23 -9t-9 -23v-448q0 -14 9 -23t23 -9t23 9t9 23zM1152 320q0 -26 -19 -45t-45 -19h-429l-51 -483q-2 -12 -10.5 -20.5t-20.5 -8.5h-1q-27 0 -32 27l-76 485h-404q-26 0 -45 19t-19 45q0 123 78.5 221.5t177.5 98.5v512q-52 0 -90 38 t-38 90t38 90t90 38h640q52 0 90 -38t38 -90t-38 -90t-90 -38v-512q99 0 177.5 -98.5t78.5 -221.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1408 608v-320q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 
84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h704q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-704q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113v320 q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM1792 1472v-512q0 -26 -19 -45t-45 -19t-45 19l-176 176l-652 -652q-10 -10 -23 -10t-23 10l-114 114q-10 10 -10 23t10 23l652 652l-176 176q-19 19 -19 45t19 45t45 19h512q26 0 45 -19t19 -45z" /> +<glyph unicode="" d="M1184 640q0 -26 -19 -45l-544 -544q-19 -19 -45 -19t-45 19t-19 45v288h-448q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h448v288q0 26 19 45t45 19t45 -19l544 -544q19 -19 19 -45zM1536 992v-704q0 -119 -84.5 -203.5t-203.5 -84.5h-320q-13 0 -22.5 9.5t-9.5 22.5 q0 4 -1 20t-0.5 26.5t3 23.5t10 19.5t20.5 6.5h320q66 0 113 47t47 113v704q0 66 -47 113t-113 47h-288h-11h-13t-11.5 1t-11.5 3t-8 5.5t-7 9t-2 13.5q0 4 -1 20t-0.5 26.5t3 23.5t10 19.5t20.5 6.5h320q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M458 653q-74 162 -74 371h-256v-96q0 -78 94.5 -162t235.5 -113zM1536 928v96h-256q0 -209 -74 -371q141 29 235.5 113t94.5 162zM1664 1056v-128q0 -71 -41.5 -143t-112 -130t-173 -97.5t-215.5 -44.5q-42 -54 -95 -95q-38 -34 -52.5 -72.5t-14.5 -89.5q0 -54 30.5 -91 t97.5 -37q75 0 133.5 -45.5t58.5 -114.5v-64q0 -14 -9 -23t-23 -9h-832q-14 0 -23 9t-9 23v64q0 69 58.5 114.5t133.5 45.5q67 0 97.5 37t30.5 91q0 51 -14.5 89.5t-52.5 72.5q-53 41 -95 95q-113 5 -215.5 44.5t-173 97.5t-112 130t-41.5 143v128q0 40 28 68t68 28h288v96 q0 66 47 113t113 47h576q66 0 113 -47t47 -113v-96h288q40 0 68 -28t28 -68z" /> +<glyph unicode="" d="M394 184q-8 -9 -20 3q-13 11 -4 19q8 9 20 -3q12 -11 4 -19zM352 245q9 -12 0 -19q-8 -6 -17 7t0 18q9 7 17 -6zM291 305q-5 -7 -13 -2q-10 5 -7 12q3 5 13 2q10 -5 7 -12zM322 271q-6 -7 -16 3q-9 11 -2 16q6 6 16 -3q9 -11 2 -16zM451 159q-4 -12 -19 -6q-17 4 -13 15 t19 7q16 -5 13 -16zM514 154q0 -11 -16 -11q-17 -2 -17 11q0 11 16 11q17 2 17 -11zM572 164q2 -10 -14 -14t-18 8t14 15q16 2 18 -9zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-224q-16 0 -24.5 1t-19.5 5t-16 
14.5t-5 27.5v239q0 97 -52 142q57 6 102.5 18t94 39 t81 66.5t53 105t20.5 150.5q0 121 -79 206q37 91 -8 204q-28 9 -81 -11t-92 -44l-38 -24q-93 26 -192 26t-192 -26q-16 11 -42.5 27t-83.5 38.5t-86 13.5q-44 -113 -7 -204q-79 -85 -79 -206q0 -85 20.5 -150t52.5 -105t80.5 -67t94 -39t102.5 -18q-40 -36 -49 -103 q-21 -10 -45 -15t-57 -5t-65.5 21.5t-55.5 62.5q-19 32 -48.5 52t-49.5 24l-20 3q-21 0 -29 -4.5t-5 -11.5t9 -14t13 -12l7 -5q22 -10 43.5 -38t31.5 -51l10 -23q13 -38 44 -61.5t67 -30t69.5 -7t55.5 3.5l23 4q0 -38 0.5 -103t0.5 -68q0 -22 -11 -33.5t-22 -13t-33 -1.5 h-224q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1280 64q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1536 64q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1664 288v-320q0 -40 -28 -68t-68 -28h-1472q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h427q21 -56 70.5 -92 t110.5 -36h256q61 0 110.5 36t70.5 92h427q40 0 68 -28t28 -68zM1339 936q-17 -40 -59 -40h-256v-448q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v448h-256q-42 0 -59 40q-17 39 14 69l448 448q18 19 45 19t45 -19l448 -448q31 -30 14 -69z" /> +<glyph unicode="" d="M1407 710q0 44 -7 113.5t-18 96.5q-12 30 -17 44t-9 36.5t-4 48.5q0 23 5 68.5t5 67.5q0 37 -10 55q-4 1 -13 1q-19 0 -58 -4.5t-59 -4.5q-60 0 -176 24t-175 24q-43 0 -94.5 -11.5t-85 -23.5t-89.5 -34q-137 -54 -202 -103q-96 -73 -159.5 -189.5t-88 -236t-24.5 -248.5 q0 -40 12.5 -120t12.5 -121q0 -23 -11 -66.5t-11 -65.5t12 -36.5t34 -14.5q24 0 72.5 11t73.5 11q57 0 169.5 -15.5t169.5 -15.5q181 0 284 36q129 45 235.5 152.5t166 245.5t59.5 275zM1535 712q0 -165 -70 -327.5t-196 -288t-281 -180.5q-124 -44 -326 -44 q-57 0 -170 14.5t-169 14.5q-24 0 -72.5 -14.5t-73.5 -14.5q-73 0 -123.5 55.5t-50.5 128.5q0 24 11 68t11 67q0 40 -12.5 120.5t-12.5 121.5q0 111 18 217.5t54.5 209.5t100.5 194t150 156q78 59 232 120q194 78 316 78q60 0 175.5 -24t173.5 -24q19 0 57 5t58 5 q81 0 118 -50.5t37 -134.5q0 -23 -5 -68t-5 -68q0 -10 1 -18.5t3 
-17t4 -13.5t6.5 -16t6.5 -17q16 -40 25 -118.5t9 -136.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1408 296q0 -27 -10 -70.5t-21 -68.5q-21 -50 -122 -106q-94 -51 -186 -51q-27 0 -52.5 3.5t-57.5 12.5t-47.5 14.5t-55.5 20.5t-49 18q-98 35 -175 83q-128 79 -264.5 215.5t-215.5 264.5q-48 77 -83 175q-3 9 -18 49t-20.5 55.5t-14.5 47.5t-12.5 57.5t-3.5 52.5 q0 92 51 186q56 101 106 122q25 11 68.5 21t70.5 10q14 0 21 -3q18 -6 53 -76q11 -19 30 -54t35 -63.5t31 -53.5q3 -4 17.5 -25t21.5 -35.5t7 -28.5q0 -20 -28.5 -50t-62 -55t-62 -53t-28.5 -46q0 -9 5 -22.5t8.5 -20.5t14 -24t11.5 -19q76 -137 174 -235t235 -174 q2 -1 19 -11.5t24 -14t20.5 -8.5t22.5 -5q18 0 46 28.5t53 62t55 62t50 28.5q14 0 28.5 -7t35.5 -21.5t25 -17.5q25 -15 53.5 -31t63.5 -35t54 -30q70 -35 76 -53q3 -7 3 -21z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1120 1280h-832q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113v832q0 66 -47 113t-113 47zM1408 1120v-832q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h832 q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1280" d="M1152 1280h-1024v-1242l423 406l89 85l89 -85l423 -406v1242zM1164 1408q23 0 44 -9q33 -13 52.5 -41t19.5 -62v-1289q0 -34 -19.5 -62t-52.5 -41q-19 -8 -44 -8q-48 0 -83 32l-441 424l-441 -424q-36 -33 -83 -33q-23 0 -44 9q-33 13 -52.5 41t-19.5 62v1289 q0 34 19.5 62t52.5 41q21 9 44 9h1048z" /> +<glyph unicode="" d="M1280 343q0 11 -2 16q-3 8 -38.5 29.5t-88.5 49.5l-53 29q-5 3 -19 13t-25 15t-21 5q-18 0 -47 -32.5t-57 -65.5t-44 -33q-7 0 -16.5 3.5t-15.5 6.5t-17 9.5t-14 8.5q-99 55 -170.5 126.5t-126.5 170.5q-2 3 -8.5 14t-9.5 17t-6.5 15.5t-3.5 16.5q0 13 20.5 33.5t45 38.5 t45 39.5t20.5 36.5q0 10 -5 21t-15 25t-13 19q-3 6 -15 28.5t-25 45.5t-26.5 47.5t-25 40.5t-16.5 18t-16 2q-48 0 -101 -22q-46 -21 -80 -94.5t-34 -130.5q0 -16 2.5 -34t5 -30.5t9 -33t10 -29.5t12.5 -33t11 -30q60 -164 216.5 -320.5t320.5 -216.5q6 -2 30 -11t33 -12.5 t29.5 -10t33 -9t30.5 -5t34 -2.5q57 0 130.5 34t94.5 80q22 53 22 
101zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1620 1128q-67 -98 -162 -167q1 -14 1 -42q0 -130 -38 -259.5t-115.5 -248.5t-184.5 -210.5t-258 -146t-323 -54.5q-271 0 -496 145q35 -4 78 -4q225 0 401 138q-105 2 -188 64.5t-114 159.5q33 -5 61 -5q43 0 85 11q-112 23 -185.5 111.5t-73.5 205.5v4q68 -38 146 -41 q-66 44 -105 115t-39 154q0 88 44 163q121 -149 294.5 -238.5t371.5 -99.5q-8 38 -8 74q0 134 94.5 228.5t228.5 94.5q140 0 236 -102q109 21 205 78q-37 -115 -142 -178q93 10 186 50z" /> +<glyph unicode="" horiz-adv-x="768" d="M511 980h257l-30 -284h-227v-824h-341v824h-170v284h170v171q0 182 86 275.5t283 93.5h227v-284h-142q-39 0 -62.5 -6.5t-34 -23.5t-13.5 -34.5t-3 -49.5v-142z" /> +<glyph unicode="" d="M1536 640q0 -251 -146.5 -451.5t-378.5 -277.5q-27 -5 -39.5 7t-12.5 30v211q0 97 -52 142q57 6 102.5 18t94 39t81 66.5t53 105t20.5 150.5q0 121 -79 206q37 91 -8 204q-28 9 -81 -11t-92 -44l-38 -24q-93 26 -192 26t-192 -26q-16 11 -42.5 27t-83.5 38.5t-86 13.5 q-44 -113 -7 -204q-79 -85 -79 -206q0 -85 20.5 -150t52.5 -105t80.5 -67t94 -39t102.5 -18q-40 -36 -49 -103q-21 -10 -45 -15t-57 -5t-65.5 21.5t-55.5 62.5q-19 32 -48.5 52t-49.5 24l-20 3q-21 0 -29 -4.5t-5 -11.5t9 -14t13 -12l7 -5q22 -10 43.5 -38t31.5 -51l10 -23 q13 -38 44 -61.5t67 -30t69.5 -7t55.5 3.5l23 4q0 -38 0.5 -89t0.5 -54q0 -18 -13 -30t-40 -7q-232 77 -378.5 277.5t-146.5 451.5q0 209 103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1664 960v-256q0 -26 -19 -45t-45 -19h-64q-26 0 -45 19t-19 45v256q0 106 -75 181t-181 75t-181 -75t-75 -181v-192h96q40 0 68 -28t28 -68v-576q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v576q0 40 28 68t68 28h672v192q0 185 131.5 316.5t316.5 131.5 t316.5 -131.5t131.5 -316.5z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1760 1408q66 0 113 -47t47 -113v-1216q0 -66 -47 -113t-113 -47h-1600q-66 0 -113 47t-47 
113v1216q0 66 47 113t113 47h1600zM160 1280q-13 0 -22.5 -9.5t-9.5 -22.5v-224h1664v224q0 13 -9.5 22.5t-22.5 9.5h-1600zM1760 0q13 0 22.5 9.5t9.5 22.5v608h-1664v-608 q0 -13 9.5 -22.5t22.5 -9.5h1600zM256 128v128h256v-128h-256zM640 128v128h384v-128h-384z" /> +<glyph unicode="" horiz-adv-x="1408" d="M384 192q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM896 69q2 -28 -17 -48q-18 -21 -47 -21h-135q-25 0 -43 16.5t-20 41.5q-22 229 -184.5 391.5t-391.5 184.5q-25 2 -41.5 20t-16.5 43v135q0 29 21 47q17 17 43 17h5q160 -13 306 -80.5 t259 -181.5q114 -113 181.5 -259t80.5 -306zM1408 67q2 -27 -18 -47q-18 -20 -46 -20h-143q-26 0 -44.5 17.5t-19.5 42.5q-12 215 -101 408.5t-231.5 336t-336 231.5t-408.5 102q-25 1 -42.5 19.5t-17.5 43.5v143q0 28 20 46q18 18 44 18h3q262 -13 501.5 -120t425.5 -294 q187 -186 294 -425.5t120 -501.5z" /> +<glyph unicode="" d="M1040 320q0 -33 -23.5 -56.5t-56.5 -23.5t-56.5 23.5t-23.5 56.5t23.5 56.5t56.5 23.5t56.5 -23.5t23.5 -56.5zM1296 320q0 -33 -23.5 -56.5t-56.5 -23.5t-56.5 23.5t-23.5 56.5t23.5 56.5t56.5 23.5t56.5 -23.5t23.5 -56.5zM1408 160v320q0 13 -9.5 22.5t-22.5 9.5 h-1216q-13 0 -22.5 -9.5t-9.5 -22.5v-320q0 -13 9.5 -22.5t22.5 -9.5h1216q13 0 22.5 9.5t9.5 22.5zM178 640h1180l-157 482q-4 13 -16 21.5t-26 8.5h-782q-14 0 -26 -8.5t-16 -21.5zM1536 480v-320q0 -66 -47 -113t-113 -47h-1216q-66 0 -113 47t-47 113v320q0 25 16 75 l197 606q17 53 63 86t101 33h782q55 0 101 -33t63 -86l197 -606q16 -50 16 -75z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1664 896q53 0 90.5 -37.5t37.5 -90.5t-37.5 -90.5t-90.5 -37.5v-384q0 -52 -38 -90t-90 -38q-417 347 -812 380q-58 -19 -91 -66t-31 -100.5t40 -92.5q-20 -33 -23 -65.5t6 -58t33.5 -55t48 -50t61.5 -50.5q-29 -58 -111.5 -83t-168.5 -11.5t-132 55.5q-7 23 -29.5 87.5 t-32 94.5t-23 89t-15 101t3.5 98.5t22 110.5h-122q-66 0 -113 47t-47 113v192q0 66 47 113t113 47h480q435 0 896 384q52 0 90 -38t38 -90v-384zM1536 292v954q-394 -302 -768 -343v-270q377 -42 768 -341z" /> +<glyph unicode="" horiz-adv-x="1664" d="M848 -160q0 16 -16 16q-59 0 
-101.5 42.5t-42.5 101.5q0 16 -16 16t-16 -16q0 -73 51.5 -124.5t124.5 -51.5q16 0 16 16zM183 128h1298q-164 181 -246.5 411.5t-82.5 484.5q0 256 -320 256t-320 -256q0 -254 -82.5 -484.5t-246.5 -411.5zM1664 128q0 -52 -38 -90t-90 -38 h-448q0 -106 -75 -181t-181 -75t-181 75t-75 181h-448q-52 0 -90 38t-38 90q190 161 287 397.5t97 498.5q0 165 96 262t264 117q-8 18 -8 37q0 40 28 68t68 28t68 -28t28 -68q0 -19 -8 -37q168 -20 264 -117t96 -262q0 -262 97 -498.5t287 -397.5z" /> +<glyph unicode="" d="M1376 640l138 -135q30 -28 20 -70q-12 -41 -52 -51l-188 -48l53 -186q12 -41 -19 -70q-29 -31 -70 -19l-186 53l-48 -188q-10 -40 -51 -52q-12 -2 -19 -2q-31 0 -51 22l-135 138l-135 -138q-28 -30 -70 -20q-41 11 -51 52l-48 188l-186 -53q-41 -12 -70 19q-31 29 -19 70 l53 186l-188 48q-40 10 -52 51q-10 42 20 70l138 135l-138 135q-30 28 -20 70q12 41 52 51l188 48l-53 186q-12 41 19 70q29 31 70 19l186 -53l48 188q10 41 51 51q41 12 70 -19l135 -139l135 139q29 30 70 19q41 -10 51 -51l48 -188l186 53q41 12 70 -19q31 -29 19 -70 l-53 -186l188 -48q40 -10 52 -51q10 -42 -20 -70z" /> +<glyph unicode="" horiz-adv-x="1792" d="M256 192q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1664 768q0 51 -39 89.5t-89 38.5h-576q0 20 15 48.5t33 55t33 68t15 84.5q0 67 -44.5 97.5t-115.5 30.5q-24 0 -90 -139q-24 -44 -37 -65q-40 -64 -112 -145q-71 -81 -101 -106 q-69 -57 -140 -57h-32v-640h32q72 0 167 -32t193.5 -64t179.5 -32q189 0 189 167q0 26 -5 56q30 16 47.5 52.5t17.5 73.5t-18 69q53 50 53 119q0 25 -10 55.5t-25 47.5h331q52 0 90 38t38 90zM1792 769q0 -105 -75.5 -181t-180.5 -76h-169q-4 -62 -37 -119q3 -21 3 -43 q0 -101 -60 -178q1 -139 -85 -219.5t-227 -80.5q-133 0 -322 69q-164 59 -223 59h-288q-53 0 -90.5 37.5t-37.5 90.5v640q0 53 37.5 90.5t90.5 37.5h288q10 0 21.5 4.5t23.5 14t22.5 18t24 22.5t20.5 21.5t19 21.5t14 17q65 74 100 129q13 21 33 62t37 72t40.5 63t55 49.5 t69.5 17.5q125 0 206.5 -67t81.5 -189q0 -68 -22 -128h374q104 0 180 -76t76 -179z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1376 128h32v640h-32q-35 0 -67.5 12t-62.5 37t-50 46t-49 
54q-2 3 -3.5 4.5t-4 4.5t-4.5 5q-72 81 -112 145q-14 22 -38 68q-1 3 -10.5 22.5t-18.5 36t-20 35.5t-21.5 30.5t-18.5 11.5q-71 0 -115.5 -30.5t-44.5 -97.5q0 -43 15 -84.5t33 -68t33 -55t15 -48.5h-576 q-50 0 -89 -38.5t-39 -89.5q0 -52 38 -90t90 -38h331q-15 -17 -25 -47.5t-10 -55.5q0 -69 53 -119q-18 -32 -18 -69t17.5 -73.5t47.5 -52.5q-4 -24 -4 -56q0 -85 48.5 -126t135.5 -41q84 0 183 32t194 64t167 32zM1664 192q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45 t45 -19t45 19t19 45zM1792 768v-640q0 -53 -37.5 -90.5t-90.5 -37.5h-288q-59 0 -223 -59q-190 -69 -317 -69q-142 0 -230 77.5t-87 217.5l1 5q-61 76 -61 178q0 22 3 43q-33 57 -37 119h-169q-105 0 -180.5 76t-75.5 181q0 103 76 179t180 76h374q-22 60 -22 128 q0 122 81.5 189t206.5 67q38 0 69.5 -17.5t55 -49.5t40.5 -63t37 -72t33 -62q35 -55 100 -129q2 -3 14 -17t19 -21.5t20.5 -21.5t24 -22.5t22.5 -18t23.5 -14t21.5 -4.5h288q53 0 90.5 -37.5t37.5 -90.5z" /> +<glyph unicode="" d="M1280 -64q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 700q0 189 -167 189q-26 0 -56 -5q-16 30 -52.5 47.5t-73.5 17.5t-69 -18q-50 53 -119 53q-25 0 -55.5 -10t-47.5 -25v331q0 52 -38 90t-90 38q-51 0 -89.5 -39t-38.5 -89v-576 q-20 0 -48.5 15t-55 33t-68 33t-84.5 15q-67 0 -97.5 -44.5t-30.5 -115.5q0 -24 139 -90q44 -24 65 -37q64 -40 145 -112q81 -71 106 -101q57 -69 57 -140v-32h640v32q0 72 32 167t64 193.5t32 179.5zM1536 705q0 -133 -69 -322q-59 -164 -59 -223v-288q0 -53 -37.5 -90.5 t-90.5 -37.5h-640q-53 0 -90.5 37.5t-37.5 90.5v288q0 10 -4.5 21.5t-14 23.5t-18 22.5t-22.5 24t-21.5 20.5t-21.5 19t-17 14q-74 65 -129 100q-21 13 -62 33t-72 37t-63 40.5t-49.5 55t-17.5 69.5q0 125 67 206.5t189 81.5q68 0 128 -22v374q0 104 76 180t179 76 q105 0 181 -75.5t76 -180.5v-169q62 -4 119 -37q21 3 43 3q101 0 178 -60q139 1 219.5 -85t80.5 -227z" /> +<glyph unicode="" d="M1408 576q0 84 -32 183t-64 194t-32 167v32h-640v-32q0 -35 -12 -67.5t-37 -62.5t-46 -50t-54 -49q-9 -8 -14 -12q-81 -72 -145 -112q-22 -14 -68 -38q-3 -1 -22.5 -10.5t-36 -18.5t-35.5 -20t-30.5 -21.5t-11.5 -18.5q0 -71 30.5 -115.5t97.5 
-44.5q43 0 84.5 15t68 33 t55 33t48.5 15v-576q0 -50 38.5 -89t89.5 -39q52 0 90 38t38 90v331q46 -35 103 -35q69 0 119 53q32 -18 69 -18t73.5 17.5t52.5 47.5q24 -4 56 -4q85 0 126 48.5t41 135.5zM1280 1344q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1536 580 q0 -142 -77.5 -230t-217.5 -87l-5 1q-76 -61 -178 -61q-22 0 -43 3q-54 -30 -119 -37v-169q0 -105 -76 -180.5t-181 -75.5q-103 0 -179 76t-76 180v374q-54 -22 -128 -22q-121 0 -188.5 81.5t-67.5 206.5q0 38 17.5 69.5t49.5 55t63 40.5t72 37t62 33q55 35 129 100 q3 2 17 14t21.5 19t21.5 20.5t22.5 24t18 22.5t14 23.5t4.5 21.5v288q0 53 37.5 90.5t90.5 37.5h640q53 0 90.5 -37.5t37.5 -90.5v-288q0 -59 59 -223q69 -190 69 -317z" /> +<glyph unicode="" d="M1280 576v128q0 26 -19 45t-45 19h-502l189 189q19 19 19 45t-19 45l-91 91q-18 18 -45 18t-45 -18l-362 -362l-91 -91q-18 -18 -18 -45t18 -45l91 -91l362 -362q18 -18 45 -18t45 18l91 91q18 18 18 45t-18 45l-189 189h502q26 0 45 19t19 45zM1536 640 q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1285 640q0 27 -18 45l-91 91l-362 362q-18 18 -45 18t-45 -18l-91 -91q-18 -18 -18 -45t18 -45l189 -189h-502q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h502l-189 -189q-19 -19 -19 -45t19 -45l91 -91q18 -18 45 -18t45 18l362 362l91 91q18 18 18 45zM1536 640 q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1284 641q0 27 -18 45l-362 362l-91 91q-18 18 -45 18t-45 -18l-91 -91l-362 -362q-18 -18 -18 -45t18 -45l91 -91q18 -18 45 -18t45 18l189 189v-502q0 -26 19 -45t45 -19h128q26 0 45 19t19 45v502l189 -189q19 -19 45 -19t45 19l91 91q18 18 18 45zM1536 640 q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1284 639q0 27 -18 45l-91 91q-18 18 -45 
18t-45 -18l-189 -189v502q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-502l-189 189q-19 19 -45 19t-45 -19l-91 -91q-18 -18 -18 -45t18 -45l362 -362l91 -91q18 -18 45 -18t45 18l91 91l362 362q18 18 18 45zM1536 640 q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M768 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5t-103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103zM1042 887q-2 -1 -9.5 -9.5t-13.5 -9.5q2 0 4.5 5t5 11t3.5 7q6 7 22 15q14 6 52 12q34 8 51 -11 q-2 2 9.5 13t14.5 12q3 2 15 4.5t15 7.5l2 22q-12 -1 -17.5 7t-6.5 21q0 -2 -6 -8q0 7 -4.5 8t-11.5 -1t-9 -1q-10 3 -15 7.5t-8 16.5t-4 15q-2 5 -9.5 10.5t-9.5 10.5q-1 2 -2.5 5.5t-3 6.5t-4 5.5t-5.5 2.5t-7 -5t-7.5 -10t-4.5 -5q-3 2 -6 1.5t-4.5 -1t-4.5 -3t-5 -3.5 q-3 -2 -8.5 -3t-8.5 -2q15 5 -1 11q-10 4 -16 3q9 4 7.5 12t-8.5 14h5q-1 4 -8.5 8.5t-17.5 8.5t-13 6q-8 5 -34 9.5t-33 0.5q-5 -6 -4.5 -10.5t4 -14t3.5 -12.5q1 -6 -5.5 -13t-6.5 -12q0 -7 14 -15.5t10 -21.5q-3 -8 -16 -16t-16 -12q-5 -8 -1.5 -18.5t10.5 -16.5 q2 -2 1.5 -4t-3.5 -4.5t-5.5 -4t-6.5 -3.5l-3 -2q-11 -5 -20.5 6t-13.5 26q-7 25 -16 30q-23 8 -29 -1q-5 13 -41 26q-25 9 -58 4q6 1 0 15q-7 15 -19 12q3 6 4 17.5t1 13.5q3 13 12 23q1 1 7 8.5t9.5 13.5t0.5 6q35 -4 50 11q5 5 11.5 17t10.5 17q9 6 14 5.5t14.5 -5.5 t14.5 -5q14 -1 15.5 11t-7.5 20q12 -1 3 17q-5 7 -8 9q-12 4 -27 -5q-8 -4 2 -8q-1 1 -9.5 -10.5t-16.5 -17.5t-16 5q-1 1 -5.5 13.5t-9.5 13.5q-8 0 -16 -15q3 8 -11 15t-24 8q19 12 -8 27q-7 4 -20.5 5t-19.5 -4q-5 -7 -5.5 -11.5t5 -8t10.5 -5.5t11.5 -4t8.5 -3 q14 -10 8 -14q-2 -1 -8.5 -3.5t-11.5 -4.5t-6 -4q-3 -4 0 -14t-2 -14q-5 5 -9 17.5t-7 16.5q7 -9 -25 -6l-10 1q-4 0 -16 -2t-20.5 -1t-13.5 8q-4 8 0 20q1 4 4 2q-4 3 -11 9.5t-10 8.5q-46 -15 -94 -41q6 -1 12 1q5 2 13 6.5t10 5.5q34 14 42 7l5 5q14 -16 20 -25 q-7 4 -30 1q-20 -6 -22 -12q7 -12 5 -18q-4 3 -11.5 10t-14.5 11t-15 5q-16 0 -22 -1q-146 -80 -235 -222q7 -7 12 -8q4 -1 5 
-9t2.5 -11t11.5 3q9 -8 3 -19q1 1 44 -27q19 -17 21 -21q3 -11 -10 -18q-1 2 -9 9t-9 4q-3 -5 0.5 -18.5t10.5 -12.5q-7 0 -9.5 -16t-2.5 -35.5 t-1 -23.5l2 -1q-3 -12 5.5 -34.5t21.5 -19.5q-13 -3 20 -43q6 -8 8 -9q3 -2 12 -7.5t15 -10t10 -10.5q4 -5 10 -22.5t14 -23.5q-2 -6 9.5 -20t10.5 -23q-1 0 -2.5 -1t-2.5 -1q3 -7 15.5 -14t15.5 -13q1 -3 2 -10t3 -11t8 -2q2 20 -24 62q-15 25 -17 29q-3 5 -5.5 15.5 t-4.5 14.5q2 0 6 -1.5t8.5 -3.5t7.5 -4t2 -3q-3 -7 2 -17.5t12 -18.5t17 -19t12 -13q6 -6 14 -19.5t0 -13.5q9 0 20 -10t17 -20q5 -8 8 -26t5 -24q2 -7 8.5 -13.5t12.5 -9.5l16 -8t13 -7q5 -2 18.5 -10.5t21.5 -11.5q10 -4 16 -4t14.5 2.5t13.5 3.5q15 2 29 -15t21 -21 q36 -19 55 -11q-2 -1 0.5 -7.5t8 -15.5t9 -14.5t5.5 -8.5q5 -6 18 -15t18 -15q6 4 7 9q-3 -8 7 -20t18 -10q14 3 14 32q-31 -15 -49 18q0 1 -2.5 5.5t-4 8.5t-2.5 8.5t0 7.5t5 3q9 0 10 3.5t-2 12.5t-4 13q-1 8 -11 20t-12 15q-5 -9 -16 -8t-16 9q0 -1 -1.5 -5.5t-1.5 -6.5 q-13 0 -15 1q1 3 2.5 17.5t3.5 22.5q1 4 5.5 12t7.5 14.5t4 12.5t-4.5 9.5t-17.5 2.5q-19 -1 -26 -20q-1 -3 -3 -10.5t-5 -11.5t-9 -7q-7 -3 -24 -2t-24 5q-13 8 -22.5 29t-9.5 37q0 10 2.5 26.5t3 25t-5.5 24.5q3 2 9 9.5t10 10.5q2 1 4.5 1.5t4.5 0t4 1.5t3 6q-1 1 -4 3 q-3 3 -4 3q7 -3 28.5 1.5t27.5 -1.5q15 -11 22 2q0 1 -2.5 9.5t-0.5 13.5q5 -27 29 -9q3 -3 15.5 -5t17.5 -5q3 -2 7 -5.5t5.5 -4.5t5 0.5t8.5 6.5q10 -14 12 -24q11 -40 19 -44q7 -3 11 -2t4.5 9.5t0 14t-1.5 12.5l-1 8v18l-1 8q-15 3 -18.5 12t1.5 18.5t15 18.5q1 1 8 3.5 t15.5 6.5t12.5 8q21 19 15 35q7 0 11 9q-1 0 -5 3t-7.5 5t-4.5 2q9 5 2 16q5 3 7.5 11t7.5 10q9 -12 21 -2q7 8 1 16q5 7 20.5 10.5t18.5 9.5q7 -2 8 2t1 12t3 12q4 5 15 9t13 5l17 11q3 4 0 4q18 -2 31 11q10 11 -6 20q3 6 -3 9.5t-15 5.5q3 1 11.5 0.5t10.5 1.5 q15 10 -7 16q-17 5 -43 -12zM879 10q206 36 351 189q-3 3 -12.5 4.5t-12.5 3.5q-18 7 -24 8q1 7 -2.5 13t-8 9t-12.5 8t-11 7q-2 2 -7 6t-7 5.5t-7.5 4.5t-8.5 2t-10 -1l-3 -1q-3 -1 -5.5 -2.5t-5.5 -3t-4 -3t0 -2.5q-21 17 -36 22q-5 1 -11 5.5t-10.5 7t-10 1.5t-11.5 -7 q-5 -5 -6 -15t-2 -13q-7 5 0 17.5t2 18.5q-3 6 -10.5 4.5t-12 -4.5t-11.5 -8.5t-9 -6.5t-8.5 -5.5t-8.5 -7.5q-3 
-4 -6 -12t-5 -11q-2 4 -11.5 6.5t-9.5 5.5q2 -10 4 -35t5 -38q7 -31 -12 -48q-27 -25 -29 -40q-4 -22 12 -26q0 -7 -8 -20.5t-7 -21.5q0 -6 2 -16z" /> +<glyph unicode="" horiz-adv-x="1664" d="M384 64q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1028 484l-682 -682q-37 -37 -90 -37q-52 0 -91 37l-106 108q-38 36 -38 90q0 53 38 91l681 681q39 -98 114.5 -173.5t173.5 -114.5zM1662 919q0 -39 -23 -106q-47 -134 -164.5 -217.5 t-258.5 -83.5q-185 0 -316.5 131.5t-131.5 316.5t131.5 316.5t316.5 131.5q58 0 121.5 -16.5t107.5 -46.5q16 -11 16 -28t-16 -28l-293 -169v-224l193 -107q5 3 79 48.5t135.5 81t70.5 35.5q15 0 23.5 -10t8.5 -25z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1024 128h640v128h-640v-128zM640 640h1024v128h-1024v-128zM1280 1152h384v128h-384v-128zM1792 320v-256q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 832v-256q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19 t-19 45v256q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 1344v-256q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h1664q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1403 1241q17 -41 -14 -70l-493 -493v-742q0 -42 -39 -59q-13 -5 -25 -5q-27 0 -45 19l-256 256q-19 19 -19 45v486l-493 493q-31 29 -14 70q17 39 59 39h1280q42 0 59 -39z" /> +<glyph unicode="" horiz-adv-x="1792" d="M640 1280h512v128h-512v-128zM1792 640v-480q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v480h672v-160q0 -26 19 -45t45 -19h320q26 0 45 19t19 45v160h672zM1024 640v-128h-256v128h256zM1792 1120v-384h-1792v384q0 66 47 113t113 47h352v160q0 40 28 68 t68 28h576q40 0 68 -28t28 -68v-160h352q66 0 113 -47t47 -113z" /> +<glyph unicode="" d="M1283 995l-355 -355l355 -355l144 144q29 31 70 14q39 -17 39 -59v-448q0 -26 -19 -45t-45 -19h-448q-42 0 -59 40q-17 39 14 69l144 144l-355 355l-355 -355l144 -144q31 -30 14 -69q-17 -40 -59 -40h-448q-26 0 -45 19t-19 45v448q0 42 40 59q39 17 69 -14l144 -144 l355 355l-355 355l-144 -144q-19 -19 -45 -19q-12 0 -24 5q-40 17 -40 59v448q0 
26 19 45t45 19h448q42 0 59 -40q17 -39 -14 -69l-144 -144l355 -355l355 355l-144 144q-31 30 -14 69q17 40 59 40h448q26 0 45 -19t19 -45v-448q0 -42 -39 -59q-13 -5 -25 -5q-26 0 -45 19z " /> +<glyph unicode="" horiz-adv-x="1920" d="M593 640q-162 -5 -265 -128h-134q-82 0 -138 40.5t-56 118.5q0 353 124 353q6 0 43.5 -21t97.5 -42.5t119 -21.5q67 0 133 23q-5 -37 -5 -66q0 -139 81 -256zM1664 3q0 -120 -73 -189.5t-194 -69.5h-874q-121 0 -194 69.5t-73 189.5q0 53 3.5 103.5t14 109t26.5 108.5 t43 97.5t62 81t85.5 53.5t111.5 20q10 0 43 -21.5t73 -48t107 -48t135 -21.5t135 21.5t107 48t73 48t43 21.5q61 0 111.5 -20t85.5 -53.5t62 -81t43 -97.5t26.5 -108.5t14 -109t3.5 -103.5zM640 1280q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75 t75 -181zM1344 896q0 -159 -112.5 -271.5t-271.5 -112.5t-271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5t271.5 -112.5t112.5 -271.5zM1920 671q0 -78 -56 -118.5t-138 -40.5h-134q-103 123 -265 128q81 117 81 256q0 29 -5 66q66 -23 133 -23q59 0 119 21.5t97.5 42.5 t43.5 21q124 0 124 -353zM1792 1280q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75t75 -181z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1456 320q0 40 -28 68l-208 208q-28 28 -68 28q-42 0 -72 -32q3 -3 19 -18.5t21.5 -21.5t15 -19t13 -25.5t3.5 -27.5q0 -40 -28 -68t-68 -28q-15 0 -27.5 3.5t-25.5 13t-19 15t-21.5 21.5t-18.5 19q-33 -31 -33 -73q0 -40 28 -68l206 -207q27 -27 68 -27q40 0 68 26 l147 146q28 28 28 67zM753 1025q0 40 -28 68l-206 207q-28 28 -68 28q-39 0 -68 -27l-147 -146q-28 -28 -28 -67q0 -40 28 -68l208 -208q27 -27 68 -27q42 0 72 31q-3 3 -19 18.5t-21.5 21.5t-15 19t-13 25.5t-3.5 27.5q0 40 28 68t68 28q15 0 27.5 -3.5t25.5 -13t19 -15 t21.5 -21.5t18.5 -19q33 31 33 73zM1648 320q0 -120 -85 -203l-147 -146q-83 -83 -203 -83q-121 0 -204 85l-206 207q-83 83 -83 203q0 123 88 209l-88 88q-86 -88 -208 -88q-120 0 -204 84l-208 208q-84 84 -84 204t85 203l147 146q83 83 203 83q121 0 204 -85l206 -207 q83 -83 83 -203q0 -123 -88 -209l88 -88q86 88 208 88q120 0 204 -84l208 -208q84 -84 84 -204z" /> +<glyph unicode="" 
horiz-adv-x="1920" d="M1920 384q0 -159 -112.5 -271.5t-271.5 -112.5h-1088q-185 0 -316.5 131.5t-131.5 316.5q0 132 71 241.5t187 163.5q-2 28 -2 43q0 212 150 362t362 150q158 0 286.5 -88t187.5 -230q70 62 166 62q106 0 181 -75t75 -181q0 -75 -41 -138q129 -30 213 -134.5t84 -239.5z " /> +<glyph unicode="" horiz-adv-x="1664" d="M1527 88q56 -89 21.5 -152.5t-140.5 -63.5h-1152q-106 0 -140.5 63.5t21.5 152.5l503 793v399h-64q-26 0 -45 19t-19 45t19 45t45 19h512q26 0 45 -19t19 -45t-19 -45t-45 -19h-64v-399zM748 813l-272 -429h712l-272 429l-20 31v37v399h-128v-399v-37z" /> +<glyph unicode="" horiz-adv-x="1792" d="M960 640q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1260 576l507 -398q28 -20 25 -56q-5 -35 -35 -51l-128 -64q-13 -7 -29 -7q-17 0 -31 8l-690 387l-110 -66q-8 -4 -12 -5q14 -49 10 -97q-7 -77 -56 -147.5t-132 -123.5q-132 -84 -277 -84 q-136 0 -222 78q-90 84 -79 207q7 76 56 147t131 124q132 84 278 84q83 0 151 -31q9 13 22 22l122 73l-122 73q-13 9 -22 22q-68 -31 -151 -31q-146 0 -278 84q-82 53 -131 124t-56 147q-5 59 15.5 113t63.5 93q85 79 222 79q145 0 277 -84q83 -52 132 -123t56 -148 q4 -48 -10 -97q4 -1 12 -5l110 -66l690 387q14 8 31 8q16 0 29 -7l128 -64q30 -16 35 -51q3 -36 -25 -56zM579 836q46 42 21 108t-106 117q-92 59 -192 59q-74 0 -113 -36q-46 -42 -21 -108t106 -117q92 -59 192 -59q74 0 113 36zM494 91q81 51 106 117t-21 108 q-39 36 -113 36q-100 0 -192 -59q-81 -51 -106 -117t21 -108q39 -36 113 -36q100 0 192 59zM672 704l96 -58v11q0 36 33 56l14 8l-79 47l-26 -26q-3 -3 -10 -11t-12 -12q-2 -2 -4 -3.5t-3 -2.5zM896 480l96 -32l736 576l-128 64l-768 -431v-113l-160 -96l9 -8q2 -2 7 -6 q4 -4 11 -12t11 -12l26 -26zM1600 64l128 64l-520 408l-177 -138q-2 -3 -13 -7z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1696 1152q40 0 68 -28t28 -68v-1216q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v288h-544q-40 0 -68 28t-28 68v672q0 40 20 88t48 76l408 408q28 28 76 48t88 20h416q40 0 68 -28t28 -68v-328q68 40 128 40h416zM1152 939l-299 -299h299v299zM512 1323l-299 -299 h299v299zM708 676l316 
316v416h-384v-416q0 -40 -28 -68t-68 -28h-416v-640h512v256q0 40 20 88t48 76zM1664 -128v1152h-384v-416q0 -40 -28 -68t-68 -28h-416v-640h896z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1404 151q0 -117 -79 -196t-196 -79q-135 0 -235 100l-777 776q-113 115 -113 271q0 159 110 270t269 111q158 0 273 -113l605 -606q10 -10 10 -22q0 -16 -30.5 -46.5t-46.5 -30.5q-13 0 -23 10l-606 607q-79 77 -181 77q-106 0 -179 -75t-73 -181q0 -105 76 -181 l776 -777q63 -63 145 -63q64 0 106 42t42 106q0 82 -63 145l-581 581q-26 24 -60 24q-29 0 -48 -19t-19 -48q0 -32 25 -59l410 -410q10 -10 10 -22q0 -16 -31 -47t-47 -31q-12 0 -22 10l-410 410q-63 61 -63 149q0 82 57 139t139 57q88 0 149 -63l581 -581q100 -98 100 -235 z" /> +<glyph unicode="" d="M384 0h768v384h-768v-384zM1280 0h128v896q0 14 -10 38.5t-20 34.5l-281 281q-10 10 -34 20t-39 10v-416q0 -40 -28 -68t-68 -28h-576q-40 0 -68 28t-28 68v416h-128v-1280h128v416q0 40 28 68t68 28h832q40 0 68 -28t28 -68v-416zM896 928v320q0 13 -9.5 22.5t-22.5 9.5 h-192q-13 0 -22.5 -9.5t-9.5 -22.5v-320q0 -13 9.5 -22.5t22.5 -9.5h192q13 0 22.5 9.5t9.5 22.5zM1536 896v-928q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1344q0 40 28 68t68 28h928q40 0 88 -20t76 -48l280 -280q28 -28 48 -76t20 -88z" /> +<glyph unicode="" d="M1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M1536 192v-128q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1408q26 0 45 -19t19 -45zM1536 704v-128q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1408q26 0 45 -19t19 -45zM1536 1216v-128q0 -26 -19 -45 t-45 -19h-1408q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1408q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M384 128q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM384 640q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1792 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216q-13 0 -22.5 
9.5t-9.5 22.5v192q0 13 9.5 22.5 t22.5 9.5h1216q13 0 22.5 -9.5t9.5 -22.5zM384 1152q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1792 736v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1216q13 0 22.5 -9.5t9.5 -22.5z M1792 1248v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1216q13 0 22.5 -9.5t9.5 -22.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M381 -84q0 -80 -54.5 -126t-135.5 -46q-106 0 -172 66l57 88q49 -45 106 -45q29 0 50.5 14.5t21.5 42.5q0 64 -105 56l-26 56q8 10 32.5 43.5t42.5 54t37 38.5v1q-16 0 -48.5 -1t-48.5 -1v-53h-106v152h333v-88l-95 -115q51 -12 81 -49t30 -88zM383 543v-159h-362 q-6 36 -6 54q0 51 23.5 93t56.5 68t66 47.5t56.5 43.5t23.5 45q0 25 -14.5 38.5t-39.5 13.5q-46 0 -81 -58l-85 59q24 51 71.5 79.5t105.5 28.5q73 0 123 -41.5t50 -112.5q0 -50 -34 -91.5t-75 -64.5t-75.5 -50.5t-35.5 -52.5h127v60h105zM1792 224v-192q0 -13 -9.5 -22.5 t-22.5 -9.5h-1216q-13 0 -22.5 9.5t-9.5 22.5v192q0 14 9 23t23 9h1216q13 0 22.5 -9.5t9.5 -22.5zM384 1123v-99h-335v99h107q0 41 0.5 122t0.5 121v12h-2q-8 -17 -50 -54l-71 76l136 127h106v-404h108zM1792 736v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216q-13 0 -22.5 9.5 t-9.5 22.5v192q0 14 9 23t23 9h1216q13 0 22.5 -9.5t9.5 -22.5zM1792 1248v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1216q13 0 22.5 -9.5t9.5 -22.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1760 640q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-1728q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h1728zM483 704q-28 35 -51 80q-48 97 -48 188q0 181 134 309q133 127 393 127q50 0 167 -19q66 -12 177 -48q10 -38 21 -118q14 -123 14 -183q0 -18 -5 -45l-12 -3l-84 6 l-14 2q-50 149 -103 205q-88 91 -210 91q-114 0 -182 -59q-67 -58 -67 -146q0 -73 66 -140t279 -129q69 -20 173 -66q58 -28 95 -52h-743zM990 448h411q7 -39 7 -92q0 -111 -41 -212q-23 -55 -71 -104q-37 -35 -109 -81q-80 -48 -153 -66q-80 -21 -203 -21q-114 0 -195 23 l-140 40q-57 16 -72 28q-8 8 -8 22v13q0 
108 -2 156q-1 30 0 68l2 37v44l102 2q15 -34 30 -71t22.5 -56t12.5 -27q35 -57 80 -94q43 -36 105 -57q59 -22 132 -22q64 0 139 27q77 26 122 86q47 61 47 129q0 84 -81 157q-34 29 -137 71z" /> +<glyph unicode="" d="M48 1313q-37 2 -45 4l-3 88q13 1 40 1q60 0 112 -4q132 -7 166 -7q86 0 168 3q116 4 146 5q56 0 86 2l-1 -14l2 -64v-9q-60 -9 -124 -9q-60 0 -79 -25q-13 -14 -13 -132q0 -13 0.5 -32.5t0.5 -25.5l1 -229l14 -280q6 -124 51 -202q35 -59 96 -92q88 -47 177 -47 q104 0 191 28q56 18 99 51q48 36 65 64q36 56 53 114q21 73 21 229q0 79 -3.5 128t-11 122.5t-13.5 159.5l-4 59q-5 67 -24 88q-34 35 -77 34l-100 -2l-14 3l2 86h84l205 -10q76 -3 196 10l18 -2q6 -38 6 -51q0 -7 -4 -31q-45 -12 -84 -13q-73 -11 -79 -17q-15 -15 -15 -41 q0 -7 1.5 -27t1.5 -31q8 -19 22 -396q6 -195 -15 -304q-15 -76 -41 -122q-38 -65 -112 -123q-75 -57 -182 -89q-109 -33 -255 -33q-167 0 -284 46q-119 47 -179 122q-61 76 -83 195q-16 80 -16 237v333q0 188 -17 213q-25 36 -147 39zM1536 -96v64q0 14 -9 23t-23 9h-1472 q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h1472q14 0 23 9t9 23z" /> +<glyph unicode="" horiz-adv-x="1664" d="M512 160v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM512 544v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1024 160v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23 v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM512 928v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1024 544v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1536 160v192 q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1024 928v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1536 544v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192 q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1536 928v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1664 1248v-1088q0 -66 -47 -113t-113 -47h-1344q-66 
0 -113 47t-47 113v1088q0 66 47 113t113 47h1344q66 0 113 -47t47 -113 z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1190 955l293 293l-107 107l-293 -293zM1637 1248q0 -27 -18 -45l-1286 -1286q-18 -18 -45 -18t-45 18l-198 198q-18 18 -18 45t18 45l1286 1286q18 18 45 18t45 -18l198 -198q18 -18 18 -45zM286 1438l98 -30l-98 -30l-30 -98l-30 98l-98 30l98 30l30 98zM636 1276 l196 -60l-196 -60l-60 -196l-60 196l-196 60l196 60l60 196zM1566 798l98 -30l-98 -30l-30 -98l-30 98l-98 30l98 30l30 98zM926 1438l98 -30l-98 -30l-30 -98l-30 98l-98 30l98 30l30 98z" /> +<glyph unicode="" horiz-adv-x="1792" d="M640 128q0 52 -38 90t-90 38t-90 -38t-38 -90t38 -90t90 -38t90 38t38 90zM256 640h384v256h-158q-13 0 -22 -9l-195 -195q-9 -9 -9 -22v-30zM1536 128q0 52 -38 90t-90 38t-90 -38t-38 -90t38 -90t90 -38t90 38t38 90zM1792 1216v-1024q0 -15 -4 -26.5t-13.5 -18.5 t-16.5 -11.5t-23.5 -6t-22.5 -2t-25.5 0t-22.5 0.5q0 -106 -75 -181t-181 -75t-181 75t-75 181h-384q0 -106 -75 -181t-181 -75t-181 75t-75 181h-64q-3 0 -22.5 -0.5t-25.5 0t-22.5 2t-23.5 6t-16.5 11.5t-13.5 18.5t-4 26.5q0 26 19 45t45 19v320q0 8 -0.5 35t0 38 t2.5 34.5t6.5 37t14 30.5t22.5 30l198 198q19 19 50.5 32t58.5 13h160v192q0 26 19 45t45 19h1024q26 0 45 -19t19 -45z" /> +<glyph unicode="" d="M1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103q-111 0 -218 32q59 93 78 164q9 34 54 211q20 -39 73 -67.5t114 -28.5q121 0 216 68.5t147 188.5t52 270q0 114 -59.5 214t-172.5 163t-255 63q-105 0 -196 -29t-154.5 -77t-109 -110.5t-67 -129.5t-21.5 -134 q0 -104 40 -183t117 -111q30 -12 38 20q2 7 8 31t8 30q6 23 -11 43q-51 61 -51 151q0 151 104.5 259.5t273.5 108.5q151 0 235.5 -82t84.5 -213q0 -170 -68.5 -289t-175.5 -119q-61 0 -98 43.5t-23 104.5q8 35 26.5 93.5t30 103t11.5 75.5q0 50 -27 83t-77 33 q-62 0 -105 -57t-43 -142q0 -73 25 -122l-99 -418q-17 -70 -13 -177q-206 91 -333 281t-127 423q0 209 103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1248 1408q119 0 203.5 -84.5t84.5 -203.5v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-725q85 122 108 
210q9 34 53 209q21 -39 73.5 -67t112.5 -28q181 0 295.5 147.5t114.5 373.5q0 84 -35 162.5t-96.5 139t-152.5 97t-197 36.5q-104 0 -194.5 -28.5t-153 -76.5 t-107.5 -109.5t-66.5 -128t-21.5 -132.5q0 -102 39.5 -180t116.5 -110q13 -5 23.5 0t14.5 19q10 44 15 61q6 23 -11 42q-50 62 -50 150q0 150 103.5 256.5t270.5 106.5q149 0 232.5 -81t83.5 -210q0 -168 -67.5 -286t-173.5 -118q-60 0 -97 43.5t-23 103.5q8 34 26.5 92.5 t29.5 102t11 74.5q0 49 -26.5 81.5t-75.5 32.5q-61 0 -103.5 -56.5t-42.5 -139.5q0 -72 24 -121l-98 -414q-24 -100 -7 -254h-183q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960z" /> +<glyph unicode="" d="M678 -57q0 -38 -10 -71h-380q-95 0 -171.5 56.5t-103.5 147.5q24 45 69 77.5t100 49.5t107 24t107 7q32 0 49 -2q6 -4 30.5 -21t33 -23t31 -23t32 -25.5t27.5 -25.5t26.5 -29.5t21 -30.5t17.5 -34.5t9.5 -36t4.5 -40.5zM385 294q-234 -7 -385 -85v433q103 -118 273 -118 q32 0 70 5q-21 -61 -21 -86q0 -67 63 -149zM558 805q0 -100 -43.5 -160.5t-140.5 -60.5q-51 0 -97 26t-78 67.5t-56 93.5t-35.5 104t-11.5 99q0 96 51.5 165t144.5 69q66 0 119 -41t84 -104t47 -130t16 -128zM1536 896v-736q0 -119 -84.5 -203.5t-203.5 -84.5h-468 q39 73 39 157q0 66 -22 122.5t-55.5 93t-72 71t-72 59.5t-55.5 54.5t-22 59.5q0 36 23 68t56 61.5t65.5 64.5t55.5 93t23 131t-26.5 145.5t-75.5 118.5q-6 6 -14 11t-12.5 7.5t-10 9.5t-10.5 17h135l135 64h-437q-138 0 -244.5 -38.5t-182.5 -133.5q0 126 81 213t207 87h960 q119 0 203.5 -84.5t84.5 -203.5v-96h-256v256h-128v-256h-256v-128h256v-256h128v256h256z" /> +<glyph unicode="" horiz-adv-x="1664" d="M876 71q0 21 -4.5 40.5t-9.5 36t-17.5 34.5t-21 30.5t-26.5 29.5t-27.5 25.5t-32 25.5t-31 23t-33 23t-30.5 21q-17 2 -50 2q-54 0 -106 -7t-108 -25t-98 -46t-69 -75t-27 -107q0 -68 35.5 -121.5t93 -84t120.5 -45.5t127 -15q59 0 112.5 12.5t100.5 39t74.5 73.5 t27.5 110zM756 933q0 60 -16.5 127.5t-47 130.5t-84 104t-119.5 41q-93 0 -144 -69t-51 -165q0 -47 11.5 -99t35.5 -104t56 -93.5t78 -67.5t97 -26q97 0 140.5 60.5t43.5 160.5zM625 1408h437l-135 -79h-135q71 -45 110 -126t39 -169q0 -74 -23 -131.5t-56 -92.5t-66 
-64.5 t-56 -61t-23 -67.5q0 -26 16.5 -51t43 -48t58.5 -48t64 -55.5t58.5 -66t43 -85t16.5 -106.5q0 -160 -140 -282q-152 -131 -420 -131q-59 0 -119.5 10t-122 33.5t-108.5 58t-77 89t-30 121.5q0 61 37 135q32 64 96 110.5t145 71t155 36t150 13.5q-64 83 -64 149q0 12 2 23.5 t5 19.5t8 21.5t7 21.5q-40 -5 -70 -5q-149 0 -255.5 98t-106.5 246q0 140 95 250.5t234 141.5q94 20 187 20zM1664 1152v-128h-256v-256h-128v256h-256v128h256v256h128v-256h256z" /> +<glyph unicode="" horiz-adv-x="1920" d="M768 384h384v96h-128v448h-114l-148 -137l77 -80q42 37 55 57h2v-288h-128v-96zM1280 640q0 -70 -21 -142t-59.5 -134t-101.5 -101t-138 -39t-138 39t-101.5 101t-59.5 134t-21 142t21 142t59.5 134t101.5 101t138 39t138 -39t101.5 -101t59.5 -134t21 -142zM1792 384 v512q-106 0 -181 75t-75 181h-1152q0 -106 -75 -181t-181 -75v-512q106 0 181 -75t75 -181h1152q0 106 75 181t181 75zM1920 1216v-1152q0 -26 -19 -45t-45 -19h-1792q-26 0 -45 19t-19 45v1152q0 26 19 45t45 19h1792q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1024" d="M1024 832q0 -26 -19 -45l-448 -448q-19 -19 -45 -19t-45 19l-448 448q-19 19 -19 45t19 45t45 19h896q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1024" d="M1024 320q0 -26 -19 -45t-45 -19h-896q-26 0 -45 19t-19 45t19 45l448 448q19 19 45 19t45 -19l448 -448q19 -19 19 -45z" /> +<glyph unicode="" horiz-adv-x="640" d="M640 1088v-896q0 -26 -19 -45t-45 -19t-45 19l-448 448q-19 19 -19 45t19 45l448 448q19 19 45 19t45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="640" d="M576 640q0 -26 -19 -45l-448 -448q-19 -19 -45 -19t-45 19t-19 45v896q0 26 19 45t45 19t45 -19l448 -448q19 -19 19 -45z" /> +<glyph unicode="" horiz-adv-x="1664" d="M160 0h608v1152h-640v-1120q0 -13 9.5 -22.5t22.5 -9.5zM1536 32v1120h-640v-1152h608q13 0 22.5 9.5t9.5 22.5zM1664 1248v-1216q0 -66 -47 -113t-113 -47h-1344q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1344q66 0 113 -47t47 -113z" /> +<glyph unicode="" horiz-adv-x="1024" d="M1024 448q0 -26 -19 -45l-448 -448q-19 -19 -45 -19t-45 19l-448 448q-19 19 -19 45t19 45t45 19h896q26 0 
45 -19t19 -45zM1024 832q0 -26 -19 -45t-45 -19h-896q-26 0 -45 19t-19 45t19 45l448 448q19 19 45 19t45 -19l448 -448q19 -19 19 -45z" /> +<glyph unicode="" horiz-adv-x="1024" d="M1024 448q0 -26 -19 -45l-448 -448q-19 -19 -45 -19t-45 19l-448 448q-19 19 -19 45t19 45t45 19h896q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1024" d="M1024 832q0 -26 -19 -45t-45 -19h-896q-26 0 -45 19t-19 45t19 45l448 448q19 19 45 19t45 -19l448 -448q19 -19 19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 826v-794q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v794q44 -49 101 -87q362 -246 497 -345q57 -42 92.5 -65.5t94.5 -48t110 -24.5h1h1q51 0 110 24.5t94.5 48t92.5 65.5q170 123 498 345q57 39 100 87zM1792 1120q0 -79 -49 -151t-122 -123 q-376 -261 -468 -325q-10 -7 -42.5 -30.5t-54 -38t-52 -32.5t-57.5 -27t-50 -9h-1h-1q-23 0 -50 9t-57.5 27t-52 32.5t-54 38t-42.5 30.5q-91 64 -262 182.5t-205 142.5q-62 42 -117 115.5t-55 136.5q0 78 41.5 130t118.5 52h1472q65 0 112.5 -47t47.5 -113z" /> +<glyph unicode="" d="M349 911v-991h-330v991h330zM370 1217q1 -73 -50.5 -122t-135.5 -49h-2q-82 0 -132 49t-50 122q0 74 51.5 122.5t134.5 48.5t133 -48.5t51 -122.5zM1536 488v-568h-329v530q0 105 -40.5 164.5t-126.5 59.5q-63 0 -105.5 -34.5t-63.5 -85.5q-11 -30 -11 -81v-553h-329 q2 399 2 647t-1 296l-1 48h329v-144h-2q20 32 41 56t56.5 52t87 43.5t114.5 15.5q171 0 275 -113.5t104 -332.5z" /> +<glyph unicode="" d="M1536 640q0 -156 -61 -298t-164 -245t-245 -164t-298 -61q-172 0 -327 72.5t-264 204.5q-7 10 -6.5 22.5t8.5 20.5l137 138q10 9 25 9q16 -2 23 -12q73 -95 179 -147t225 -52q104 0 198.5 40.5t163.5 109.5t109.5 163.5t40.5 198.5t-40.5 198.5t-109.5 163.5 t-163.5 109.5t-198.5 40.5q-98 0 -188 -35.5t-160 -101.5l137 -138q31 -30 14 -69q-17 -40 -59 -40h-448q-26 0 -45 19t-19 45v448q0 42 40 59q39 17 69 -14l130 -129q107 101 244.5 156.5t284.5 55.5q156 0 298 -61t245 -164t164 -245t61 -298z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1771 0q0 -53 -37 -90l-107 -108q-39 -37 -91 -37q-53 0 -90 37l-363 364q-38 36 -38 90q0 53 43 96l-256 
256l-126 -126q-14 -14 -34 -14t-34 14q2 -2 12.5 -12t12.5 -13t10 -11.5t10 -13.5t6 -13.5t5.5 -16.5t1.5 -18q0 -38 -28 -68q-3 -3 -16.5 -18t-19 -20.5 t-18.5 -16.5t-22 -15.5t-22 -9t-26 -4.5q-40 0 -68 28l-408 408q-28 28 -28 68q0 13 4.5 26t9 22t15.5 22t16.5 18.5t20.5 19t18 16.5q30 28 68 28q10 0 18 -1.5t16.5 -5.5t13.5 -6t13.5 -10t11.5 -10t13 -12.5t12 -12.5q-14 14 -14 34t14 34l348 348q14 14 34 14t34 -14 q-2 2 -12.5 12t-12.5 13t-10 11.5t-10 13.5t-6 13.5t-5.5 16.5t-1.5 18q0 38 28 68q3 3 16.5 18t19 20.5t18.5 16.5t22 15.5t22 9t26 4.5q40 0 68 -28l408 -408q28 -28 28 -68q0 -13 -4.5 -26t-9 -22t-15.5 -22t-16.5 -18.5t-20.5 -19t-18 -16.5q-30 -28 -68 -28 q-10 0 -18 1.5t-16.5 5.5t-13.5 6t-13.5 10t-11.5 10t-13 12.5t-12 12.5q14 -14 14 -34t-14 -34l-126 -126l256 -256q43 43 96 43q52 0 91 -37l363 -363q37 -39 37 -91z" /> +<glyph unicode="" horiz-adv-x="1792" d="M384 384q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM576 832q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1004 351l101 382q6 26 -7.5 48.5t-38.5 29.5 t-48 -6.5t-30 -39.5l-101 -382q-60 -5 -107 -43.5t-63 -98.5q-20 -77 20 -146t117 -89t146 20t89 117q16 60 -6 117t-72 91zM1664 384q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1024 1024q0 53 -37.5 90.5 t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1472 832q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1792 384q0 -261 -141 -483q-19 -29 -54 -29h-1402q-35 0 -54 29 q-141 221 -141 483q0 182 71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" /> +<glyph unicode="" horiz-adv-x="1792" d="M896 1152q-204 0 -381.5 -69.5t-282 -187.5t-104.5 -255q0 -112 71.5 -213.5t201.5 -175.5l87 -50l-27 -96q-24 -91 -70 -172q152 63 275 171l43 38l57 -6q69 -8 130 -8q204 0 381.5 69.5t282 187.5t104.5 255t-104.5 255t-282 187.5t-381.5 69.5zM1792 640 q0 -174 -120 -321.5t-326 -233t-450 
-85.5q-70 0 -145 8q-198 -175 -460 -242q-49 -14 -114 -22h-5q-15 0 -27 10.5t-16 27.5v1q-3 4 -0.5 12t2 10t4.5 9.5l6 9t7 8.5t8 9q7 8 31 34.5t34.5 38t31 39.5t32.5 51t27 59t26 76q-157 89 -247.5 220t-90.5 281q0 174 120 321.5 t326 233t450 85.5t450 -85.5t326 -233t120 -321.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M704 1152q-153 0 -286 -52t-211.5 -141t-78.5 -191q0 -82 53 -158t149 -132l97 -56l-35 -84q34 20 62 39l44 31l53 -10q78 -14 153 -14q153 0 286 52t211.5 141t78.5 191t-78.5 191t-211.5 141t-286 52zM704 1280q191 0 353.5 -68.5t256.5 -186.5t94 -257t-94 -257 t-256.5 -186.5t-353.5 -68.5q-86 0 -176 16q-124 -88 -278 -128q-36 -9 -86 -16h-3q-11 0 -20.5 8t-11.5 21q-1 3 -1 6.5t0.5 6.5t2 6l2.5 5t3.5 5.5t4 5t4.5 5t4 4.5q5 6 23 25t26 29.5t22.5 29t25 38.5t20.5 44q-124 72 -195 177t-71 224q0 139 94 257t256.5 186.5 t353.5 68.5zM1526 111q10 -24 20.5 -44t25 -38.5t22.5 -29t26 -29.5t23 -25q1 -1 4 -4.5t4.5 -5t4 -5t3.5 -5.5l2.5 -5t2 -6t0.5 -6.5t-1 -6.5q-3 -14 -13 -22t-22 -7q-50 7 -86 16q-154 40 -278 128q-90 -16 -176 -16q-271 0 -472 132q58 -4 88 -4q161 0 309 45t264 129 q125 92 192 212t67 254q0 77 -23 152q129 -71 204 -178t75 -230q0 -120 -71 -224.5t-195 -176.5z" /> +<glyph unicode="" horiz-adv-x="896" d="M885 970q18 -20 7 -44l-540 -1157q-13 -25 -42 -25q-4 0 -14 2q-17 5 -25.5 19t-4.5 30l197 808l-406 -101q-4 -1 -12 -1q-18 0 -31 11q-18 15 -13 39l201 825q4 14 16 23t28 9h328q19 0 32 -12.5t13 -29.5q0 -8 -5 -18l-171 -463l396 98q8 2 12 2q19 0 34 -15z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 288v-320q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h96v192h-512v-192h96q40 0 68 -28t28 -68v-320q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h96v192h-512v-192h96q40 0 68 -28t28 -68v-320 q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h96v192q0 52 38 90t90 38h512v192h-96q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h320q40 0 68 -28t28 -68v-320q0 -40 -28 -68t-68 -28h-96v-192h512q52 0 90 -38t38 -90v-192h96q40 0 68 -28t28 -68 z" /> +<glyph 
unicode="" horiz-adv-x="1664" d="M896 708v-580q0 -104 -76 -180t-180 -76t-180 76t-76 180q0 26 19 45t45 19t45 -19t19 -45q0 -50 39 -89t89 -39t89 39t39 89v580q33 11 64 11t64 -11zM1664 681q0 -13 -9.5 -22.5t-22.5 -9.5q-11 0 -23 10q-49 46 -93 69t-102 23q-68 0 -128 -37t-103 -97 q-7 -10 -17.5 -28t-14.5 -24q-11 -17 -28 -17q-18 0 -29 17q-4 6 -14.5 24t-17.5 28q-43 60 -102.5 97t-127.5 37t-127.5 -37t-102.5 -97q-7 -10 -17.5 -28t-14.5 -24q-11 -17 -29 -17q-17 0 -28 17q-4 6 -14.5 24t-17.5 28q-43 60 -103 97t-128 37q-58 0 -102 -23t-93 -69 q-12 -10 -23 -10q-13 0 -22.5 9.5t-9.5 22.5q0 5 1 7q45 183 172.5 319.5t298 204.5t360.5 68q140 0 274.5 -40t246.5 -113.5t194.5 -187t115.5 -251.5q1 -2 1 -7zM896 1408v-98q-42 2 -64 2t-64 -2v98q0 26 19 45t45 19t45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M768 -128h896v640h-416q-40 0 -68 28t-28 68v416h-384v-1152zM1024 1312v64q0 13 -9.5 22.5t-22.5 9.5h-704q-13 0 -22.5 -9.5t-9.5 -22.5v-64q0 -13 9.5 -22.5t22.5 -9.5h704q13 0 22.5 9.5t9.5 22.5zM1280 640h299l-299 299v-299zM1792 512v-672q0 -40 -28 -68t-68 -28 h-960q-40 0 -68 28t-28 68v160h-544q-40 0 -68 28t-28 68v1344q0 40 28 68t68 28h1088q40 0 68 -28t28 -68v-328q21 -13 36 -28l408 -408q28 -28 48 -76t20 -88z" /> +<glyph unicode="" horiz-adv-x="1024" d="M736 960q0 -13 -9.5 -22.5t-22.5 -9.5t-22.5 9.5t-9.5 22.5q0 46 -54 71t-106 25q-13 0 -22.5 9.5t-9.5 22.5t9.5 22.5t22.5 9.5q50 0 99.5 -16t87 -54t37.5 -90zM896 960q0 72 -34.5 134t-90 101.5t-123 62t-136.5 22.5t-136.5 -22.5t-123 -62t-90 -101.5t-34.5 -134 q0 -101 68 -180q10 -11 30.5 -33t30.5 -33q128 -153 141 -298h228q13 145 141 298q10 11 30.5 33t30.5 33q68 79 68 180zM1024 960q0 -155 -103 -268q-45 -49 -74.5 -87t-59.5 -95.5t-34 -107.5q47 -28 47 -82q0 -37 -25 -64q25 -27 25 -64q0 -52 -45 -81q13 -23 13 -47 q0 -46 -31.5 -71t-77.5 -25q-20 -44 -60 -70t-87 -26t-87 26t-60 70q-46 0 -77.5 25t-31.5 71q0 24 13 47q-45 29 -45 81q0 37 25 64q-25 27 -25 64q0 54 47 82q-4 50 -34 107.5t-59.5 95.5t-74.5 87q-103 113 -103 268q0 99 44.5 184.5t117 142t164 89t186.5 32.5 t186.5 
-32.5t164 -89t117 -142t44.5 -184.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 352v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5q-12 0 -24 10l-319 320q-9 9 -9 22q0 14 9 23l320 320q9 9 23 9q13 0 22.5 -9.5t9.5 -22.5v-192h1376q13 0 22.5 -9.5t9.5 -22.5zM1792 896q0 -14 -9 -23l-320 -320q-9 -9 -23 -9 q-13 0 -22.5 9.5t-9.5 22.5v192h-1376q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1376v192q0 14 9 23t23 9q12 0 24 -10l319 -319q9 -9 9 -23z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1280 608q0 14 -9 23t-23 9h-224v352q0 13 -9.5 22.5t-22.5 9.5h-192q-13 0 -22.5 -9.5t-9.5 -22.5v-352h-224q-13 0 -22.5 -9.5t-9.5 -22.5q0 -14 9 -23l352 -352q9 -9 23 -9t23 9l351 351q10 12 10 24zM1920 384q0 -159 -112.5 -271.5t-271.5 -112.5h-1088 q-185 0 -316.5 131.5t-131.5 316.5q0 130 70 240t188 165q-2 30 -2 43q0 212 150 362t362 150q156 0 285.5 -87t188.5 -231q71 62 166 62q106 0 181 -75t75 -181q0 -76 -41 -138q130 -31 213.5 -135.5t83.5 -238.5z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1280 672q0 14 -9 23l-352 352q-9 9 -23 9t-23 -9l-351 -351q-10 -12 -10 -24q0 -14 9 -23t23 -9h224v-352q0 -13 9.5 -22.5t22.5 -9.5h192q13 0 22.5 9.5t9.5 22.5v352h224q13 0 22.5 9.5t9.5 22.5zM1920 384q0 -159 -112.5 -271.5t-271.5 -112.5h-1088 q-185 0 -316.5 131.5t-131.5 316.5q0 130 70 240t188 165q-2 30 -2 43q0 212 150 362t362 150q156 0 285.5 -87t188.5 -231q71 62 166 62q106 0 181 -75t75 -181q0 -76 -41 -138q130 -31 213.5 -135.5t83.5 -238.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M384 192q0 -26 -19 -45t-45 -19t-45 19t-19 45t19 45t45 19t45 -19t19 -45zM1408 131q0 -121 -73 -190t-194 -69h-874q-121 0 -194 69t-73 190q0 68 5.5 131t24 138t47.5 132.5t81 103t120 60.5q-22 -52 -22 -120v-203q-58 -20 -93 -70t-35 -111q0 -80 56 -136t136 -56 t136 56t56 136q0 61 -35.5 111t-92.5 70v203q0 62 25 93q132 -104 295 -104t295 104q25 -31 25 -93v-64q-106 0 -181 -75t-75 -181v-89q-32 -29 -32 -71q0 -40 28 -68t68 -28t68 28t28 68q0 42 -32 71v89q0 52 38 90t90 38t90 -38t38 -90v-89q-32 -29 -32 -71q0 -40 28 -68 t68 
-28t68 28t28 68q0 42 -32 71v89q0 68 -34.5 127.5t-93.5 93.5q0 10 0.5 42.5t0 48t-2.5 41.5t-7 47t-13 40q68 -15 120 -60.5t81 -103t47.5 -132.5t24 -138t5.5 -131zM1088 1024q0 -159 -112.5 -271.5t-271.5 -112.5t-271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5 t271.5 -112.5t112.5 -271.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1280 832q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 832q0 -62 -35.5 -111t-92.5 -70v-395q0 -159 -131.5 -271.5t-316.5 -112.5t-316.5 112.5t-131.5 271.5v132q-164 20 -274 128t-110 252v512q0 26 19 45t45 19q6 0 16 -2q17 30 47 48 t65 18q53 0 90.5 -37.5t37.5 -90.5t-37.5 -90.5t-90.5 -37.5q-33 0 -64 18v-402q0 -106 94 -181t226 -75t226 75t94 181v402q-31 -18 -64 -18q-53 0 -90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5q35 0 65 -18t47 -48q10 2 16 2q26 0 45 -19t19 -45v-512q0 -144 -110 -252 t-274 -128v-132q0 -106 94 -181t226 -75t226 75t94 181v395q-57 21 -92.5 70t-35.5 111q0 80 56 136t136 56t136 -56t56 -136z" /> +<glyph unicode="" horiz-adv-x="1792" d="M640 1152h512v128h-512v-128zM288 1152v-1280h-64q-92 0 -158 66t-66 158v832q0 92 66 158t158 66h64zM1408 1152v-1280h-1024v1280h128v160q0 40 28 68t68 28h576q40 0 68 -28t28 -68v-160h128zM1792 928v-832q0 -92 -66 -158t-158 -66h-64v1280h64q92 0 158 -66 t66 -158z" /> +<glyph unicode="" horiz-adv-x="1664" d="M848 -160q0 16 -16 16q-59 0 -101.5 42.5t-42.5 101.5q0 16 -16 16t-16 -16q0 -73 51.5 -124.5t124.5 -51.5q16 0 16 16zM1664 128q0 -52 -38 -90t-90 -38h-448q0 -106 -75 -181t-181 -75t-181 75t-75 181h-448q-52 0 -90 38t-38 90q190 161 287 397.5t97 498.5 q0 165 96 262t264 117q-8 18 -8 37q0 40 28 68t68 28t68 -28t28 -68q0 -19 -8 -37q168 -20 264 -117t96 -262q0 -262 97 -498.5t287 -397.5z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1664 896q0 80 -56 136t-136 56h-64v-384h64q80 0 136 56t56 136zM0 128h1792q0 -106 -75 -181t-181 -75h-1280q-106 0 -181 75t-75 181zM1856 896q0 -159 -112.5 -271.5t-271.5 -112.5h-64v-32q0 -92 -66 -158t-158 -66h-704q-92 0 -158 66t-66 158v736q0 26 19 45 t45 19h1152q159 0 271.5 -112.5t112.5 
-271.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M640 1472v-640q0 -61 -35.5 -111t-92.5 -70v-779q0 -52 -38 -90t-90 -38h-128q-52 0 -90 38t-38 90v779q-57 20 -92.5 70t-35.5 111v640q0 26 19 45t45 19t45 -19t19 -45v-416q0 -26 19 -45t45 -19t45 19t19 45v416q0 26 19 45t45 19t45 -19t19 -45v-416q0 -26 19 -45 t45 -19t45 19t19 45v416q0 26 19 45t45 19t45 -19t19 -45zM1408 1472v-1600q0 -52 -38 -90t-90 -38h-128q-52 0 -90 38t-38 90v512h-224q-13 0 -22.5 9.5t-9.5 22.5v800q0 132 94 226t226 94h256q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1280" d="M1024 352v-64q0 -14 -9 -23t-23 -9h-704q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h704q14 0 23 -9t9 -23zM1024 608v-64q0 -14 -9 -23t-23 -9h-704q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h704q14 0 23 -9t9 -23zM128 0h1024v768h-416q-40 0 -68 28t-28 68v416h-512v-1280z M768 896h376q-10 29 -22 41l-313 313q-12 12 -41 22v-376zM1280 864v-896q0 -40 -28 -68t-68 -28h-1088q-40 0 -68 28t-28 68v1344q0 40 28 68t68 28h640q40 0 88 -20t76 -48l312 -312q28 -28 48 -76t20 -88z" /> +<glyph unicode="" horiz-adv-x="1408" d="M384 224v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M640 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M1152 224v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM896 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M640 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 992v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 
9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M1152 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM896 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M640 992v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 1248v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M1152 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM896 992v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M640 1248v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1152 992v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M896 1248v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1152 1248v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M896 -128h384v1536h-1152v-1536h384v224q0 13 9.5 22.5t22.5 9.5h320q13 0 22.5 -9.5t9.5 -22.5v-224zM1408 1472v-1664q0 -26 -19 -45t-45 -19h-1280q-26 0 -45 19t-19 45v1664q0 26 19 45t45 19h1280q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1408" d="M384 224v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M640 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 
9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M1152 224v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM896 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M640 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1152 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M896 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1152 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z M896 -128h384v1152h-256v-32q0 -40 -28 -68t-68 -28h-448q-40 0 -68 28t-28 68v32h-256v-1152h384v224q0 13 9.5 22.5t22.5 9.5h320q13 0 22.5 -9.5t9.5 -22.5v-224zM896 1056v320q0 13 -9.5 22.5t-22.5 9.5h-64q-13 0 -22.5 -9.5t-9.5 -22.5v-96h-128v96q0 13 -9.5 22.5 t-22.5 9.5h-64q-13 0 -22.5 -9.5t-9.5 -22.5v-320q0 -13 9.5 -22.5t22.5 -9.5h64q13 0 22.5 9.5t9.5 22.5v96h128v-96q0 -13 9.5 -22.5t22.5 -9.5h64q13 0 22.5 9.5t9.5 22.5zM1408 1088v-1280q0 -26 -19 -45t-45 -19h-1280q-26 0 -45 19t-19 45v1280q0 26 19 45t45 19h320 v288q0 40 28 68t68 28h448q40 0 68 -28t28 -68v-288h320q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1920" d="M640 128q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM256 640h384v256h-158q-14 -2 -22 -9l-195 -195q-7 -12 -9 -22v-30zM1536 128q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5 t90.5 37.5t37.5 90.5zM1664 800v192q0 14 -9 23t-23 9h-224v224q0 14 -9 23t-23 9h-192q-14 0 -23 -9t-9 -23v-224h-224q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h224v-224q0 -14 9 -23t23 -9h192q14 0 23 9t9 23v224h224q14 0 23 9t9 23zM1920 1344v-1152 q0 -26 -19 -45t-45 -19h-192q0 -106 -75 -181t-181 -75t-181 
75t-75 181h-384q0 -106 -75 -181t-181 -75t-181 75t-75 181h-128q-26 0 -45 19t-19 45t19 45t45 19v416q0 26 13 58t32 51l198 198q19 19 51 32t58 13h160v320q0 26 19 45t45 19h1152q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1280 416v192q0 14 -9 23t-23 9h-224v224q0 14 -9 23t-23 9h-192q-14 0 -23 -9t-9 -23v-224h-224q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h224v-224q0 -14 9 -23t23 -9h192q14 0 23 9t9 23v224h224q14 0 23 9t9 23zM640 1152h512v128h-512v-128zM256 1152v-1280h-32 q-92 0 -158 66t-66 158v832q0 92 66 158t158 66h32zM1440 1152v-1280h-1088v1280h160v160q0 40 28 68t68 28h576q40 0 68 -28t28 -68v-160h160zM1792 928v-832q0 -92 -66 -158t-158 -66h-32v1280h32q92 0 158 -66t66 -158z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1920 576q-1 -32 -288 -96l-352 -32l-224 -64h-64l-293 -352h69q26 0 45 -4.5t19 -11.5t-19 -11.5t-45 -4.5h-96h-160h-64v32h64v416h-160l-192 -224h-96l-32 32v192h32v32h128v8l-192 24v128l192 24v8h-128v32h-32v192l32 32h96l192 -224h160v416h-64v32h64h160h96 q26 0 45 -4.5t19 -11.5t-19 -11.5t-45 -4.5h-69l293 -352h64l224 -64l352 -32q261 -58 287 -93z" /> +<glyph unicode="" horiz-adv-x="1664" d="M640 640v384h-256v-256q0 -53 37.5 -90.5t90.5 -37.5h128zM1664 192v-192h-1152v192l128 192h-128q-159 0 -271.5 112.5t-112.5 271.5v320l-64 64l32 128h480l32 128h960l32 -192l-64 -32v-800z" /> +<glyph unicode="" d="M1280 192v896q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-320h-512v320q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-896q0 -26 19 -45t45 -19h128q26 0 45 19t19 45v320h512v-320q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1536 1120v-960 q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M1280 576v128q0 26 -19 45t-45 19h-320v320q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-320h-320q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h320v-320q0 -26 19 -45t45 -19h128q26 0 45 19t19 45v320h320q26 0 45 19t19 45zM1536 1120v-960 q0 -119 -84.5 -203.5t-203.5 
-84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1024" d="M627 160q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l50 -50q10 -10 10 -23t-10 -23l-393 -393l393 -393q10 -10 10 -23zM1011 160q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23 t10 23l466 466q10 10 23 10t23 -10l50 -50q10 -10 10 -23t-10 -23l-393 -393l393 -393q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="1024" d="M595 576q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23zM979 576q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23 l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="1152" d="M1075 224q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-393 393l-393 -393q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l466 -466q10 -10 10 -23zM1075 608q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-393 393l-393 -393 q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l466 -466q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="1152" d="M1075 672q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l393 -393l393 393q10 10 23 10t23 -10l50 -50q10 -10 10 -23zM1075 1056q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23 t10 23l50 50q10 10 23 10t23 -10l393 -393l393 393q10 10 23 10t23 -10l50 -50q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="640" d="M627 992q0 -13 -10 -23l-393 -393l393 -393q10 -10 10 -23t-10 -23l-50 -50q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l50 -50q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="640" d="M595 576q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 
-10 23t10 23l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="1152" d="M1075 352q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-393 393l-393 -393q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l466 -466q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="1152" d="M1075 800q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l393 -393l393 393q10 10 23 10t23 -10l50 -50q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1792 544v832q0 13 -9.5 22.5t-22.5 9.5h-1600q-13 0 -22.5 -9.5t-9.5 -22.5v-832q0 -13 9.5 -22.5t22.5 -9.5h1600q13 0 22.5 9.5t9.5 22.5zM1920 1376v-1088q0 -66 -47 -113t-113 -47h-544q0 -37 16 -77.5t32 -71t16 -43.5q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19 t-19 45q0 14 16 44t32 70t16 78h-544q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h1600q66 0 113 -47t47 -113z" /> +<glyph unicode="" horiz-adv-x="1920" d="M416 256q-66 0 -113 47t-47 113v704q0 66 47 113t113 47h1088q66 0 113 -47t47 -113v-704q0 -66 -47 -113t-113 -47h-1088zM384 1120v-704q0 -13 9.5 -22.5t22.5 -9.5h1088q13 0 22.5 9.5t9.5 22.5v704q0 13 -9.5 22.5t-22.5 9.5h-1088q-13 0 -22.5 -9.5t-9.5 -22.5z M1760 192h160v-96q0 -40 -47 -68t-113 -28h-1600q-66 0 -113 28t-47 68v96h160h1600zM1040 96q16 0 16 16t-16 16h-160q-16 0 -16 -16t16 -16h160z" /> +<glyph unicode="" horiz-adv-x="1152" d="M640 128q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1024 288v960q0 13 -9.5 22.5t-22.5 9.5h-832q-13 0 -22.5 -9.5t-9.5 -22.5v-960q0 -13 9.5 -22.5t22.5 -9.5h832q13 0 22.5 9.5t9.5 22.5zM1152 1248v-1088q0 -66 -47 -113t-113 -47h-832 q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h832q66 0 113 -47t47 -113z" /> +<glyph unicode="" horiz-adv-x="768" d="M464 128q0 33 -23.5 56.5t-56.5 23.5t-56.5 -23.5t-23.5 -56.5t23.5 -56.5t56.5 -23.5t56.5 23.5t23.5 56.5zM672 288v704q0 13 -9.5 22.5t-22.5 9.5h-512q-13 0 -22.5 -9.5t-9.5 -22.5v-704q0 -13 9.5 -22.5t22.5 -9.5h512q13 0 22.5 
9.5t9.5 22.5zM480 1136 q0 16 -16 16h-160q-16 0 -16 -16t16 -16h160q16 0 16 16zM768 1152v-1024q0 -52 -38 -90t-90 -38h-512q-52 0 -90 38t-38 90v1024q0 52 38 90t90 38h512q52 0 90 -38t38 -90z" /> +<glyph unicode="" d="M768 1184q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273t-73 273t-198 198t-273 73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103 t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M768 576v-384q0 -80 -56 -136t-136 -56h-384q-80 0 -136 56t-56 136v704q0 104 40.5 198.5t109.5 163.5t163.5 109.5t198.5 40.5h64q26 0 45 -19t19 -45v-128q0 -26 -19 -45t-45 -19h-64q-106 0 -181 -75t-75 -181v-32q0 -40 28 -68t68 -28h224q80 0 136 -56t56 -136z M1664 576v-384q0 -80 -56 -136t-136 -56h-384q-80 0 -136 56t-56 136v704q0 104 40.5 198.5t109.5 163.5t163.5 109.5t198.5 40.5h64q26 0 45 -19t19 -45v-128q0 -26 -19 -45t-45 -19h-64q-106 0 -181 -75t-75 -181v-32q0 -40 28 -68t68 -28h224q80 0 136 -56t56 -136z" /> +<glyph unicode="" horiz-adv-x="1664" d="M768 1216v-704q0 -104 -40.5 -198.5t-109.5 -163.5t-163.5 -109.5t-198.5 -40.5h-64q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h64q106 0 181 75t75 181v32q0 40 -28 68t-68 28h-224q-80 0 -136 56t-56 136v384q0 80 56 136t136 56h384q80 0 136 -56t56 -136zM1664 1216 v-704q0 -104 -40.5 -198.5t-109.5 -163.5t-163.5 -109.5t-198.5 -40.5h-64q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h64q106 0 181 75t75 181v32q0 40 -28 68t-68 28h-224q-80 0 -136 56t-56 136v384q0 80 56 136t136 56h384q80 0 136 -56t56 -136z" /> +<glyph unicode="" horiz-adv-x="1568" d="M496 192q0 -60 -42.5 -102t-101.5 -42q-60 0 -102 42t-42 102t42 102t102 42q59 0 101.5 -42t42.5 -102zM928 0q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM320 640q0 -66 -47 -113t-113 -47t-113 47t-47 113 t47 113t113 47t113 -47t47 -113zM1360 192q0 -46 -33 -79t-79 -33t-79 33t-33 79t33 79t79 33t79 -33t33 -79zM528 1088q0 -73 -51.5 -124.5t-124.5 
-51.5t-124.5 51.5t-51.5 124.5t51.5 124.5t124.5 51.5t124.5 -51.5t51.5 -124.5zM992 1280q0 -80 -56 -136t-136 -56 t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1536 640q0 -40 -28 -68t-68 -28t-68 28t-28 68t28 68t68 28t68 -28t28 -68zM1328 1088q0 -33 -23.5 -56.5t-56.5 -23.5t-56.5 23.5t-23.5 56.5t23.5 56.5t56.5 23.5t56.5 -23.5t23.5 -56.5z" /> +<glyph unicode="" d="M1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 416q0 -166 -127 -451q-3 -7 -10.5 -24t-13.5 -30t-13 -22q-12 -17 -28 -17q-15 0 -23.5 10t-8.5 25q0 9 2.5 26.5t2.5 23.5q5 68 5 123q0 101 -17.5 181t-48.5 138.5t-80 101t-105.5 69.5t-133 42.5t-154 21.5t-175.5 6h-224v-256q0 -26 -19 -45t-45 -19t-45 19 l-512 512q-19 19 -19 45t19 45l512 512q19 19 45 19t45 -19t19 -45v-256h224q713 0 875 -403q53 -134 53 -333z" /> +<glyph unicode="" horiz-adv-x="1664" d="M640 320q0 -40 -12.5 -82t-43 -76t-72.5 -34t-72.5 34t-43 76t-12.5 82t12.5 82t43 76t72.5 34t72.5 -34t43 -76t12.5 -82zM1280 320q0 -40 -12.5 -82t-43 -76t-72.5 -34t-72.5 34t-43 76t-12.5 82t12.5 82t43 76t72.5 34t72.5 -34t43 -76t12.5 -82zM1440 320 q0 120 -69 204t-187 84q-41 0 -195 -21q-71 -11 -157 -11t-157 11q-152 21 -195 21q-118 0 -187 -84t-69 -204q0 -88 32 -153.5t81 -103t122 -60t140 -29.5t149 -7h168q82 0 149 7t140 29.5t122 60t81 103t32 153.5zM1664 496q0 -207 -61 -331q-38 -77 -105.5 -133t-141 -86 t-170 -47.5t-171.5 -22t-167 -4.5q-78 0 -142 3t-147.5 12.5t-152.5 30t-137 51.5t-121 81t-86 115q-62 123 -62 331q0 237 136 396q-27 82 -27 170q0 116 51 218q108 0 190 -39.5t189 -123.5q147 35 309 35q148 0 280 -32q105 82 187 121t189 39q51 -102 51 -218 q0 -87 -27 -168q136 -160 136 -398z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1536 224v704q0 40 -28 68t-68 28h-704q-40 0 -68 28t-28 68v64q0 40 -28 68t-68 28h-320q-40 0 -68 -28t-28 -68v-960q0 -40 28 -68t68 -28h1216q40 0 68 28t28 68zM1664 928v-704q0 -92 -66 -158t-158 -66h-1216q-92 0 -158 
66t-66 158v960q0 92 66 158t158 66h320 q92 0 158 -66t66 -158v-32h672q92 0 158 -66t66 -158z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1781 605q0 35 -53 35h-1088q-40 0 -85.5 -21.5t-71.5 -52.5l-294 -363q-18 -24 -18 -40q0 -35 53 -35h1088q40 0 86 22t71 53l294 363q18 22 18 39zM640 768h768v160q0 40 -28 68t-68 28h-576q-40 0 -68 28t-28 68v64q0 40 -28 68t-68 28h-320q-40 0 -68 -28t-28 -68 v-853l256 315q44 53 116 87.5t140 34.5zM1909 605q0 -62 -46 -120l-295 -363q-43 -53 -116 -87.5t-140 -34.5h-1088q-92 0 -158 66t-66 158v960q0 92 66 158t158 66h320q92 0 158 -66t66 -158v-32h544q92 0 158 -66t66 -158v-160h192q54 0 99 -24.5t67 -70.5q15 -32 15 -68z " /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" d="M1134 461q-37 -121 -138 -195t-228 -74t-228 74t-138 195q-8 25 4 48.5t38 31.5q25 8 48.5 -4t31.5 -38q25 -80 92.5 -129.5t151.5 -49.5t151.5 49.5t92.5 129.5q8 26 32 38t49 4t37 -31.5t4 -48.5zM640 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5 t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1152 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1408 640q0 130 -51 248.5t-136.5 204t-204 136.5t-248.5 51t-248.5 -51t-204 -136.5t-136.5 -204t-51 -248.5 t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1134 307q8 -25 -4 -48.5t-37 -31.5t-49 4t-32 38q-25 80 -92.5 129.5t-151.5 49.5t-151.5 -49.5t-92.5 -129.5q-8 -26 -31.5 -38t-48.5 -4q-26 8 -38 31.5t-4 48.5q37 121 138 195t228 74t228 -74t138 -195zM640 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5 t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1152 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1408 640q0 130 -51 248.5t-136.5 204t-204 136.5t-248.5 51t-248.5 -51t-204 
-136.5t-136.5 -204 t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1152 448q0 -26 -19 -45t-45 -19h-640q-26 0 -45 19t-19 45t19 45t45 19h640q26 0 45 -19t19 -45zM640 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1152 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5 t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1408 640q0 130 -51 248.5t-136.5 204t-204 136.5t-248.5 51t-248.5 -51t-204 -136.5t-136.5 -204t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5zM1536 640 q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1920" d="M832 448v128q0 14 -9 23t-23 9h-192v192q0 14 -9 23t-23 9h-128q-14 0 -23 -9t-9 -23v-192h-192q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h192v-192q0 -14 9 -23t23 -9h128q14 0 23 9t9 23v192h192q14 0 23 9t9 23zM1408 384q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5 t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1664 640q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1920 512q0 -212 -150 -362t-362 -150q-192 0 -338 128h-220q-146 -128 -338 -128q-212 0 -362 150 t-150 362t150 362t362 150h896q212 0 362 -150t150 -362z" /> +<glyph unicode="" horiz-adv-x="1920" d="M384 368v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM512 624v-96q0 -16 -16 -16h-224q-16 0 -16 16v96q0 16 16 16h224q16 0 16 -16zM384 880v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1408 368v-96q0 -16 -16 -16 h-864q-16 0 -16 16v96q0 16 16 16h864q16 0 16 -16zM768 624v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM640 880v-96q0 -16 -16 -16h-96q-16 0 
-16 16v96q0 16 16 16h96q16 0 16 -16zM1024 624v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16 h96q16 0 16 -16zM896 880v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1280 624v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1664 368v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1152 880v-96 q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1408 880v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1664 880v-352q0 -16 -16 -16h-224q-16 0 -16 16v96q0 16 16 16h112v240q0 16 16 16h96q16 0 16 -16zM1792 128v896h-1664v-896 h1664zM1920 1024v-896q0 -53 -37.5 -90.5t-90.5 -37.5h-1664q-53 0 -90.5 37.5t-37.5 90.5v896q0 53 37.5 90.5t90.5 37.5h1664q53 0 90.5 -37.5t37.5 -90.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1664 491v616q-169 -91 -306 -91q-82 0 -145 32q-100 49 -184 76.5t-178 27.5q-173 0 -403 -127v-599q245 113 433 113q55 0 103.5 -7.5t98 -26t77 -31t82.5 -39.5l28 -14q44 -22 101 -22q120 0 293 92zM320 1280q0 -35 -17.5 -64t-46.5 -46v-1266q0 -14 -9 -23t-23 -9 h-64q-14 0 -23 9t-9 23v1266q-29 17 -46.5 46t-17.5 64q0 53 37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1792 1216v-763q0 -39 -35 -57q-10 -5 -17 -9q-218 -116 -369 -116q-88 0 -158 35l-28 14q-64 33 -99 48t-91 29t-114 14q-102 0 -235.5 -44t-228.5 -102 q-15 -9 -33 -9q-16 0 -32 8q-32 19 -32 56v742q0 35 31 55q35 21 78.5 42.5t114 52t152.5 49.5t155 19q112 0 209 -31t209 -86q38 -19 89 -19q122 0 310 112q22 12 31 17q31 16 62 -2q31 -20 31 -55z" /> +<glyph unicode="" horiz-adv-x="1792" d="M832 536v192q-181 -16 -384 -117v-185q205 96 384 110zM832 954v197q-172 -8 -384 -126v-189q215 111 384 118zM1664 491v184q-235 -116 -384 -71v224q-20 6 -39 15q-5 3 -33 17t-34.5 17t-31.5 15t-34.5 15.5t-32.5 13t-36 12.5t-35 8.5t-39.5 7.5t-39.5 4t-44 2 q-23 0 -49 -3v-222h19q102 0 192.5 -29t197.5 -82q19 -9 39 -15v-188q42 -17 91 -17q120 0 293 92zM1664 918v189q-169 -91 -306 -91q-45 0 -78 8v-196q148 -42 384 90zM320 1280q0 -35 -17.5 -64t-46.5 -46v-1266q0 -14 -9 -23t-23 -9h-64q-14 0 -23 
9t-9 23v1266 q-29 17 -46.5 46t-17.5 64q0 53 37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1792 1216v-763q0 -39 -35 -57q-10 -5 -17 -9q-218 -116 -369 -116q-88 0 -158 35l-28 14q-64 33 -99 48t-91 29t-114 14q-102 0 -235.5 -44t-228.5 -102q-15 -9 -33 -9q-16 0 -32 8 q-32 19 -32 56v742q0 35 31 55q35 21 78.5 42.5t114 52t152.5 49.5t155 19q112 0 209 -31t209 -86q38 -19 89 -19q122 0 310 112q22 12 31 17q31 16 62 -2q31 -20 31 -55z" /> +<glyph unicode="" horiz-adv-x="1664" d="M585 553l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23t-10 -23zM1664 96v-64q0 -14 -9 -23t-23 -9h-960q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h960q14 0 23 -9 t9 -23z" /> +<glyph unicode="" horiz-adv-x="1920" d="M617 137l-50 -50q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l50 -50q10 -10 10 -23t-10 -23l-393 -393l393 -393q10 -10 10 -23t-10 -23zM1208 1204l-373 -1291q-4 -13 -15.5 -19.5t-23.5 -2.5l-62 17q-13 4 -19.5 15.5t-2.5 24.5 l373 1291q4 13 15.5 19.5t23.5 2.5l62 -17q13 -4 19.5 -15.5t2.5 -24.5zM1865 553l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23t-10 -23z" /> +<glyph unicode="" horiz-adv-x="1792" d="M640 454v-70q0 -42 -39 -59q-13 -5 -25 -5q-27 0 -45 19l-512 512q-19 19 -19 45t19 45l512 512q29 31 70 14q39 -17 39 -59v-69l-397 -398q-19 -19 -19 -45t19 -45zM1792 416q0 -58 -17 -133.5t-38.5 -138t-48 -125t-40.5 -90.5l-20 -40q-8 -17 -28 -17q-6 0 -9 1 q-25 8 -23 34q43 400 -106 565q-64 71 -170.5 110.5t-267.5 52.5v-251q0 -42 -39 -59q-13 -5 -25 -5q-27 0 -45 19l-512 512q-19 19 -19 45t19 45l512 512q29 31 70 14q39 -17 39 -59v-262q411 -28 599 -221q169 -173 169 -509z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1186 579l257 250l-356 52l-66 10l-30 60l-159 322v-963l59 -31l318 -168l-60 355l-12 66zM1638 841l-363 -354l86 -500q5 -33 -6 -51.5t-34 -18.5q-17 0 -40 12l-449 236l-449 -236q-23 -12 -40 -12q-23 0 -34 
18.5t-6 51.5l86 500l-364 354q-32 32 -23 59.5t54 34.5 l502 73l225 455q20 41 49 41q28 0 49 -41l225 -455l502 -73q45 -7 54 -34.5t-24 -59.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1401 1187l-640 -1280q-17 -35 -57 -35q-5 0 -15 2q-22 5 -35.5 22.5t-13.5 39.5v576h-576q-22 0 -39.5 13.5t-22.5 35.5t4 42t29 30l1280 640q13 7 29 7q27 0 45 -19q15 -14 18.5 -34.5t-6.5 -39.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M557 256h595v595zM512 301l595 595h-595v-595zM1664 224v-192q0 -14 -9 -23t-23 -9h-224v-224q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v224h-864q-14 0 -23 9t-9 23v864h-224q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h224v224q0 14 9 23t23 9h192q14 0 23 -9t9 -23 v-224h851l246 247q10 9 23 9t23 -9q9 -10 9 -23t-9 -23l-247 -246v-851h224q14 0 23 -9t9 -23z" /> +<glyph unicode="" horiz-adv-x="1024" d="M288 64q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM288 1216q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM928 1088q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM1024 1088q0 -52 -26 -96.5t-70 -69.5 q-2 -287 -226 -414q-68 -38 -203 -81q-128 -40 -169.5 -71t-41.5 -100v-26q44 -25 70 -69.5t26 -96.5q0 -80 -56 -136t-136 -56t-136 56t-56 136q0 52 26 96.5t70 69.5v820q-44 25 -70 69.5t-26 96.5q0 80 56 136t136 56t136 -56t56 -136q0 -52 -26 -96.5t-70 -69.5v-497 q54 26 154 57q55 17 87.5 29.5t70.5 31t59 39.5t40.5 51t28 69.5t8.5 91.5q-44 25 -70 69.5t-26 96.5q0 80 56 136t136 56t136 -56t56 -136z" /> +<glyph unicode="" horiz-adv-x="1664" d="M439 265l-256 -256q-10 -9 -23 -9q-12 0 -23 9q-9 10 -9 23t9 23l256 256q10 9 23 9t23 -9q9 -10 9 -23t-9 -23zM608 224v-320q0 -14 -9 -23t-23 -9t-23 9t-9 23v320q0 14 9 23t23 9t23 -9t9 -23zM384 448q0 -14 -9 -23t-23 -9h-320q-14 0 -23 9t-9 23t9 23t23 9h320 q14 0 23 -9t9 -23zM1648 320q0 -120 -85 -203l-147 -146q-83 -83 -203 -83q-121 0 -204 85l-334 335q-21 21 -42 56l239 18l273 -274q27 -27 68 -27.5t68 26.5l147 146q28 28 28 67q0 40 -28 68l-274 275l18 239q35 -21 56 -42l336 -336q84 -86 84 -204zM1031 1044l-239 -18 l-273 274q-28 28 
-68 28q-39 0 -68 -27l-147 -146q-28 -28 -28 -67q0 -40 28 -68l274 -274l-18 -240q-35 21 -56 42l-336 336q-84 86 -84 204q0 120 85 203l147 146q83 83 203 83q121 0 204 -85l334 -335q21 -21 42 -56zM1664 960q0 -14 -9 -23t-23 -9h-320q-14 0 -23 9 t-9 23t9 23t23 9h320q14 0 23 -9t9 -23zM1120 1504v-320q0 -14 -9 -23t-23 -9t-23 9t-9 23v320q0 14 9 23t23 9t23 -9t9 -23zM1527 1353l-256 -256q-11 -9 -23 -9t-23 9q-9 10 -9 23t9 23l256 256q10 9 23 9t23 -9q9 -10 9 -23t-9 -23z" /> +<glyph unicode="" horiz-adv-x="1024" d="M704 280v-240q0 -16 -12 -28t-28 -12h-240q-16 0 -28 12t-12 28v240q0 16 12 28t28 12h240q16 0 28 -12t12 -28zM1020 880q0 -54 -15.5 -101t-35 -76.5t-55 -59.5t-57.5 -43.5t-61 -35.5q-41 -23 -68.5 -65t-27.5 -67q0 -17 -12 -32.5t-28 -15.5h-240q-15 0 -25.5 18.5 t-10.5 37.5v45q0 83 65 156.5t143 108.5q59 27 84 56t25 76q0 42 -46.5 74t-107.5 32q-65 0 -108 -29q-35 -25 -107 -115q-13 -16 -31 -16q-12 0 -25 8l-164 125q-13 10 -15.5 25t5.5 28q160 266 464 266q80 0 161 -31t146 -83t106 -127.5t41 -158.5z" /> +<glyph unicode="" horiz-adv-x="640" d="M640 192v-128q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h64v384h-64q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h384q26 0 45 -19t19 -45v-576h64q26 0 45 -19t19 -45zM512 1344v-192q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v192 q0 26 19 45t45 19h256q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="640" d="M512 288v-224q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v224q0 26 19 45t45 19h256q26 0 45 -19t19 -45zM542 1344l-28 -768q-1 -26 -20.5 -45t-45.5 -19h-256q-26 0 -45.5 19t-20.5 45l-28 768q-1 26 17.5 45t44.5 19h320q26 0 44.5 -19t17.5 -45z" /> +<glyph unicode="" d="M897 167v-167h-248l-159 252l-24 42q-8 9 -11 21h-3l-9 -21q-10 -20 -25 -44l-155 -250h-258v167h128l197 291l-185 272h-137v168h276l139 -228q2 -4 23 -42q8 -9 11 -21h3q3 9 11 21l25 42l140 228h257v-168h-125l-184 -267l204 -296h109zM1534 846v-206h-514l-3 27 q-4 28 -4 46q0 64 26 117t65 86.5t84 65t84 54.5t65 54t26 64q0 38 -29.5 62.5t-70.5 24.5q-51 0 -97 -39q-14 -11 -36 -38l-105 
92q26 37 63 66q83 65 188 65q110 0 178 -59.5t68 -158.5q0 -56 -24.5 -103t-62 -76.5t-81.5 -58.5t-82 -50.5t-65.5 -51.5t-30.5 -63h232v80 h126z" /> +<glyph unicode="" d="M897 167v-167h-248l-159 252l-24 42q-8 9 -11 21h-3l-9 -21q-10 -20 -25 -44l-155 -250h-258v167h128l197 291l-185 272h-137v168h276l139 -228q2 -4 23 -42q8 -9 11 -21h3q3 9 11 21l25 42l140 228h257v-168h-125l-184 -267l204 -296h109zM1536 -50v-206h-514l-4 27 q-3 45 -3 46q0 64 26 117t65 86.5t84 65t84 54.5t65 54t26 64q0 38 -29.5 62.5t-70.5 24.5q-51 0 -97 -39q-14 -11 -36 -38l-105 92q26 37 63 66q80 65 188 65q110 0 178 -59.5t68 -158.5q0 -66 -34.5 -118.5t-84 -86t-99.5 -62.5t-87 -63t-41 -73h232v80h126z" /> +<glyph unicode="" horiz-adv-x="1920" d="M896 128l336 384h-768l-336 -384h768zM1909 1205q15 -34 9.5 -71.5t-30.5 -65.5l-896 -1024q-38 -44 -96 -44h-768q-38 0 -69.5 20.5t-47.5 54.5q-15 34 -9.5 71.5t30.5 65.5l896 1024q38 44 96 44h768q38 0 69.5 -20.5t47.5 -54.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1664 438q0 -81 -44.5 -135t-123.5 -54q-41 0 -77.5 17.5t-59 38t-56.5 38t-71 17.5q-110 0 -110 -124q0 -39 16 -115t15 -115v-5q-22 0 -33 -1q-34 -3 -97.5 -11.5t-115.5 -13.5t-98 -5q-61 0 -103 26.5t-42 83.5q0 37 17.5 71t38 56.5t38 59t17.5 77.5q0 79 -54 123.5 t-135 44.5q-84 0 -143 -45.5t-59 -127.5q0 -43 15 -83t33.5 -64.5t33.5 -53t15 -50.5q0 -45 -46 -89q-37 -35 -117 -35q-95 0 -245 24q-9 2 -27.5 4t-27.5 4l-13 2q-1 0 -3 1q-2 0 -2 1v1024q2 -1 17.5 -3.5t34 -5t21.5 -3.5q150 -24 245 -24q80 0 117 35q46 44 46 89 q0 22 -15 50.5t-33.5 53t-33.5 64.5t-15 83q0 82 59 127.5t144 45.5q80 0 134 -44.5t54 -123.5q0 -41 -17.5 -77.5t-38 -59t-38 -56.5t-17.5 -71q0 -57 42 -83.5t103 -26.5q64 0 180 15t163 17v-2q-1 -2 -3.5 -17.5t-5 -34t-3.5 -21.5q-24 -150 -24 -245q0 -80 35 -117 q44 -46 89 -46q22 0 50.5 15t53 33.5t64.5 33.5t83 15q82 0 127.5 -59t45.5 -143z" /> +<glyph unicode="" horiz-adv-x="1152" d="M1152 832v-128q0 -221 -147.5 -384.5t-364.5 -187.5v-132h256q26 0 45 -19t19 -45t-19 -45t-45 -19h-640q-26 0 -45 19t-19 45t19 45t45 19h256v132q-217 24 -364.5 
187.5t-147.5 384.5v128q0 26 19 45t45 19t45 -19t19 -45v-128q0 -185 131.5 -316.5t316.5 -131.5 t316.5 131.5t131.5 316.5v128q0 26 19 45t45 19t45 -19t19 -45zM896 1216v-512q0 -132 -94 -226t-226 -94t-226 94t-94 226v512q0 132 94 226t226 94t226 -94t94 -226z" /> +<glyph unicode="" horiz-adv-x="1408" d="M271 591l-101 -101q-42 103 -42 214v128q0 26 19 45t45 19t45 -19t19 -45v-128q0 -53 15 -113zM1385 1193l-361 -361v-128q0 -132 -94 -226t-226 -94q-55 0 -109 19l-96 -96q97 -51 205 -51q185 0 316.5 131.5t131.5 316.5v128q0 26 19 45t45 19t45 -19t19 -45v-128 q0 -221 -147.5 -384.5t-364.5 -187.5v-132h256q26 0 45 -19t19 -45t-19 -45t-45 -19h-640q-26 0 -45 19t-19 45t19 45t45 19h256v132q-125 13 -235 81l-254 -254q-10 -10 -23 -10t-23 10l-82 82q-10 10 -10 23t10 23l1234 1234q10 10 23 10t23 -10l82 -82q10 -10 10 -23 t-10 -23zM1005 1325l-621 -621v512q0 132 94 226t226 94q102 0 184.5 -59t116.5 -152z" /> +<glyph unicode="" horiz-adv-x="1280" d="M1088 576v640h-448v-1137q119 63 213 137q235 184 235 360zM1280 1344v-768q0 -86 -33.5 -170.5t-83 -150t-118 -127.5t-126.5 -103t-121 -77.5t-89.5 -49.5t-42.5 -20q-12 -6 -26 -6t-26 6q-16 7 -42.5 20t-89.5 49.5t-121 77.5t-126.5 103t-118 127.5t-83 150 t-33.5 170.5v768q0 26 19 45t45 19h1152q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1664" d="M128 -128h1408v1024h-1408v-1024zM512 1088v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1280 1088v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1664 1152v-1280 q0 -52 -38 -90t-90 -38h-1408q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h128v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h384v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h128q52 0 90 -38t38 -90z" /> +<glyph unicode="" horiz-adv-x="1408" d="M512 1344q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 1376v-320q0 -16 -12 -25q-8 -7 -20 -7q-4 0 -7 1l-448 96q-11 2 -18 11t-7 20h-256v-102q111 -23 183.5 -111t72.5 -203v-800q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 
45v800 q0 106 62.5 190.5t161.5 114.5v111h-32q-59 0 -115 -23.5t-91.5 -53t-66 -66.5t-40.5 -53.5t-14 -24.5q-17 -35 -57 -35q-16 0 -29 7q-23 12 -31.5 37t3.5 49q5 10 14.5 26t37.5 53.5t60.5 70t85 67t108.5 52.5q-25 42 -25 86q0 66 47 113t113 47t113 -47t47 -113 q0 -33 -14 -64h302q0 11 7 20t18 11l448 96q3 1 7 1q12 0 20 -7q12 -9 12 -25z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1440 1088q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM1664 1376q0 -249 -75.5 -430.5t-253.5 -360.5q-81 -80 -195 -176l-20 -379q-2 -16 -16 -26l-384 -224q-7 -4 -16 -4q-12 0 -23 9l-64 64q-13 14 -8 32l85 276l-281 281l-276 -85q-3 -1 -9 -1 q-14 0 -23 9l-64 64q-17 19 -5 39l224 384q10 14 26 16l379 20q96 114 176 195q188 187 358 258t431 71q14 0 24 -9.5t10 -22.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1745 763l-164 -763h-334l178 832q13 56 -15 88q-27 33 -83 33h-169l-204 -953h-334l204 953h-286l-204 -953h-334l204 953l-153 327h1276q101 0 189.5 -40.5t147.5 -113.5q60 -73 81 -168.5t0 -194.5z" /> +<glyph unicode="" d="M909 141l102 102q19 19 19 45t-19 45l-307 307l307 307q19 19 19 45t-19 45l-102 102q-19 19 -45 19t-45 -19l-454 -454q-19 -19 -19 -45t19 -45l454 -454q19 -19 45 -19t45 19zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5 t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M717 141l454 454q19 19 19 45t-19 45l-454 454q-19 19 -45 19t-45 -19l-102 -102q-19 -19 -19 -45t19 -45l307 -307l-307 -307q-19 -19 -19 -45t19 -45l102 -102q19 -19 45 -19t45 19zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5 t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1165 397l102 102q19 19 19 45t-19 45l-454 454q-19 19 -45 19t-45 -19l-454 -454q-19 -19 -19 -45t19 -45l102 -102q19 -19 45 -19t45 19l307 307l307 -307q19 -19 45 -19t45 19zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5 t-103 385.5t103 385.5t279.5 279.5t385.5 
103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M813 237l454 454q19 19 19 45t-19 45l-102 102q-19 19 -45 19t-45 -19l-307 -307l-307 307q-19 19 -45 19t-45 -19l-102 -102q-19 -19 -19 -45t19 -45l454 -454q19 -19 45 -19t45 19zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5 t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1130 939l16 175h-884l47 -534h612l-22 -228l-197 -53l-196 53l-13 140h-175l22 -278l362 -100h4v1l359 99l50 544h-644l-15 181h674zM0 1408h1408l-128 -1438l-578 -162l-574 162z" /> +<glyph unicode="" horiz-adv-x="1792" d="M275 1408h1505l-266 -1333l-804 -267l-698 267l71 356h297l-29 -147l422 -161l486 161l68 339h-1208l58 297h1209l38 191h-1208z" /> +<glyph unicode="" horiz-adv-x="1792" d="M960 1280q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1792 352v-352q0 -22 -20 -30q-8 -2 -12 -2q-13 0 -23 9l-93 93q-119 -143 -318.5 -226.5t-429.5 -83.5t-429.5 83.5t-318.5 226.5l-93 -93q-9 -9 -23 -9q-4 0 -12 2q-20 8 -20 30v352 q0 14 9 23t23 9h352q22 0 30 -20q8 -19 -7 -35l-100 -100q67 -91 189.5 -153.5t271.5 -82.5v647h-192q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h192v163q-58 34 -93 92.5t-35 128.5q0 106 75 181t181 75t181 -75t75 -181q0 -70 -35 -128.5t-93 -92.5v-163h192q26 0 45 -19 t19 -45v-128q0 -26 -19 -45t-45 -19h-192v-647q149 20 271.5 82.5t189.5 153.5l-100 100q-15 16 -7 35q8 20 30 20h352q14 0 23 -9t9 -23z" /> +<glyph unicode="" horiz-adv-x="1152" d="M1056 768q40 0 68 -28t28 -68v-576q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v576q0 40 28 68t68 28h32v320q0 185 131.5 316.5t316.5 131.5t316.5 -131.5t131.5 -316.5q0 -26 -19 -45t-45 -19h-64q-26 0 -45 19t-19 45q0 106 -75 181t-181 75t-181 -75t-75 -181 v-320h736z" /> +<glyph unicode="" d="M1024 640q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75t75 -181zM1152 640q0 159 -112.5 271.5t-271.5 112.5t-271.5 -112.5t-112.5 -271.5t112.5 -271.5t271.5 -112.5t271.5 112.5t112.5 271.5zM1280 640q0 
-212 -150 -362t-362 -150t-362 150 t-150 362t150 362t362 150t362 -150t150 -362zM1408 640q0 130 -51 248.5t-136.5 204t-204 136.5t-248.5 51t-248.5 -51t-204 -136.5t-136.5 -204t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5zM1536 640 q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M384 800v-192q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68zM896 800v-192q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68zM1408 800v-192q0 -40 -28 -68t-68 -28h-192 q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68z" /> +<glyph unicode="" horiz-adv-x="384" d="M384 288v-192q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68zM384 800v-192q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68zM384 1312v-192q0 -40 -28 -68t-68 -28h-192 q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68z" /> +<glyph unicode="" d="M512 256q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM863 162q-13 232 -177 396t-396 177q-14 1 -24 -9t-10 -23v-128q0 -13 8.5 -22t21.5 -10q154 -11 264 -121t121 -264q1 -13 10 -21.5t22 -8.5h128q13 0 23 10 t9 24zM1247 161q-5 154 -56 297.5t-139.5 260t-205 205t-260 139.5t-297.5 56q-14 1 -23 -9q-10 -10 -10 -23v-128q0 -13 9 -22t22 -10q204 -7 378 -111.5t278.5 -278.5t111.5 -378q1 -13 10 -22t22 -9h128q13 0 23 10q11 9 9 23zM1536 1120v-960q0 -119 -84.5 -203.5 t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M768 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5t-103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103zM1152 585q32 18 32 55t-32 
55l-544 320q-31 19 -64 1q-32 -19 -32 -56v-640q0 -37 32 -56 q16 -8 32 -8q17 0 32 9z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1024 1084l316 -316l-572 -572l-316 316zM813 105l618 618q19 19 19 45t-19 45l-362 362q-18 18 -45 18t-45 -18l-618 -618q-19 -19 -19 -45t19 -45l362 -362q18 -18 45 -18t45 18zM1702 742l-907 -908q-37 -37 -90.5 -37t-90.5 37l-126 126q56 56 56 136t-56 136 t-136 56t-136 -56l-125 126q-37 37 -37 90.5t37 90.5l907 906q37 37 90.5 37t90.5 -37l125 -125q-56 -56 -56 -136t56 -136t136 -56t136 56l126 -125q37 -37 37 -90.5t-37 -90.5z" /> +<glyph unicode="" d="M1280 576v128q0 26 -19 45t-45 19h-896q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h896q26 0 45 19t19 45zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5 t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1152 736v-64q0 -14 -9 -23t-23 -9h-832q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h832q14 0 23 -9t9 -23zM1280 288v832q0 66 -47 113t-113 47h-832q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113zM1408 1120v-832q0 -119 -84.5 -203.5 t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h832q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1024" d="M1018 933q-18 -37 -58 -37h-192v-864q0 -14 -9 -23t-23 -9h-704q-21 0 -29 18q-8 20 4 35l160 192q9 11 25 11h320v640h-192q-40 0 -58 37q-17 37 9 68l320 384q18 22 49 22t49 -22l320 -384q27 -32 9 -68z" /> +<glyph unicode="" horiz-adv-x="1024" d="M32 1280h704q13 0 22.5 -9.5t9.5 -23.5v-863h192q40 0 58 -37t-9 -69l-320 -384q-18 -22 -49 -22t-49 22l-320 384q-26 31 -9 69q18 37 58 37h192v640h-320q-14 0 -25 11l-160 192q-13 14 -4 34q9 19 29 19z" /> +<glyph unicode="" d="M685 237l614 614q19 19 19 45t-19 45l-102 102q-19 19 -45 19t-45 -19l-467 -467l-211 211q-19 19 -45 19t-45 -19l-102 -102q-19 -19 -19 -45t19 -45l358 -358q19 -19 45 -19t45 19zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5 t-84.5 
203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M404 428l152 -152l-52 -52h-56v96h-96v56zM818 818q14 -13 -3 -30l-291 -291q-17 -17 -30 -3q-14 13 3 30l291 291q17 17 30 3zM544 128l544 544l-288 288l-544 -544v-288h288zM1152 736l92 92q28 28 28 68t-28 68l-152 152q-28 28 -68 28t-68 -28l-92 -92zM1536 1120 v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M1280 608v480q0 26 -19 45t-45 19h-480q-42 0 -59 -39q-17 -41 14 -70l144 -144l-534 -534q-19 -19 -19 -45t19 -45l102 -102q19 -19 45 -19t45 19l534 534l144 -144q18 -19 45 -19q12 0 25 5q39 17 39 59zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960 q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M1005 435l352 352q19 19 19 45t-19 45l-352 352q-30 31 -69 14q-40 -17 -40 -59v-160q-119 0 -216 -19.5t-162.5 -51t-114 -79t-76.5 -95.5t-44.5 -109t-21.5 -111.5t-5 -110.5q0 -181 167 -404q10 -12 25 -12q7 0 13 3q22 9 19 33q-44 354 62 473q46 52 130 75.5 t224 23.5v-160q0 -42 40 -59q12 -5 24 -5q26 0 45 19zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M640 448l256 128l-256 128v-256zM1024 1039v-542l-512 -256v542zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103 t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1145 861q18 -35 -5 -66l-320 -448q-19 -27 -52 -27t-52 27l-320 448q-23 31 -5 66q17 35 57 35h640q40 0 57 -35zM1280 160v960q0 13 -9.5 22.5t-22.5 9.5h-960q-13 0 -22.5 -9.5t-9.5 -22.5v-960q0 -13 9.5 -22.5t22.5 -9.5h960q13 0 22.5 9.5t9.5 22.5zM1536 1120 v-960q0 -119 -84.5 
-203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M1145 419q-17 -35 -57 -35h-640q-40 0 -57 35q-18 35 5 66l320 448q19 27 52 27t52 -27l320 -448q23 -31 5 -66zM1280 160v960q0 13 -9.5 22.5t-22.5 9.5h-960q-13 0 -22.5 -9.5t-9.5 -22.5v-960q0 -13 9.5 -22.5t22.5 -9.5h960q13 0 22.5 9.5t9.5 22.5zM1536 1120v-960 q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M1088 640q0 -33 -27 -52l-448 -320q-31 -23 -66 -5q-35 17 -35 57v640q0 40 35 57q35 18 66 -5l448 -320q27 -19 27 -52zM1280 160v960q0 14 -9 23t-23 9h-960q-14 0 -23 -9t-9 -23v-960q0 -14 9 -23t23 -9h960q14 0 23 9t9 23zM1536 1120v-960q0 -119 -84.5 -203.5 t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1024" d="M976 229l35 -159q3 -12 -3 -22.5t-17 -14.5l-5 -1q-4 -2 -10.5 -3.5t-16 -4.5t-21.5 -5.5t-25.5 -5t-30 -5t-33.5 -4.5t-36.5 -3t-38.5 -1q-234 0 -409 130.5t-238 351.5h-95q-13 0 -22.5 9.5t-9.5 22.5v113q0 13 9.5 22.5t22.5 9.5h66q-2 57 1 105h-67q-14 0 -23 9 t-9 23v114q0 14 9 23t23 9h98q67 210 243.5 338t400.5 128q102 0 194 -23q11 -3 20 -15q6 -11 3 -24l-43 -159q-3 -13 -14 -19.5t-24 -2.5l-4 1q-4 1 -11.5 2.5l-17.5 3.5t-22.5 3.5t-26 3t-29 2.5t-29.5 1q-126 0 -226 -64t-150 -176h468q16 0 25 -12q10 -12 7 -26 l-24 -114q-5 -26 -32 -26h-488q-3 -37 0 -105h459q15 0 25 -12q9 -12 6 -27l-24 -112q-2 -11 -11 -18.5t-20 -7.5h-387q48 -117 149.5 -185.5t228.5 -68.5q18 0 36 1.5t33.5 3.5t29.5 4.5t24.5 5t18.5 4.5l12 3l5 2q13 5 26 -2q12 -7 15 -21z" /> +<glyph unicode="" horiz-adv-x="1024" d="M1020 399v-367q0 -14 -9 -23t-23 -9h-956q-14 0 -23 9t-9 23v150q0 13 9.5 22.5t22.5 9.5h97v383h-95q-14 0 -23 9.5t-9 22.5v131q0 14 9 23t23 9h95v223q0 171 123.5 282t314.5 111q185 0 335 -125q9 -8 10 -20.5t-7 -22.5l-103 -127q-9 -11 -22 -12q-13 -2 -23 7 q-5 5 -26 
19t-69 32t-93 18q-85 0 -137 -47t-52 -123v-215h305q13 0 22.5 -9t9.5 -23v-131q0 -13 -9.5 -22.5t-22.5 -9.5h-305v-379h414v181q0 13 9 22.5t23 9.5h162q14 0 23 -9.5t9 -22.5z" /> +<glyph unicode="" horiz-adv-x="1024" d="M978 351q0 -153 -99.5 -263.5t-258.5 -136.5v-175q0 -14 -9 -23t-23 -9h-135q-13 0 -22.5 9.5t-9.5 22.5v175q-66 9 -127.5 31t-101.5 44.5t-74 48t-46.5 37.5t-17.5 18q-17 21 -2 41l103 135q7 10 23 12q15 2 24 -9l2 -2q113 -99 243 -125q37 -8 74 -8q81 0 142.5 43 t61.5 122q0 28 -15 53t-33.5 42t-58.5 37.5t-66 32t-80 32.5q-39 16 -61.5 25t-61.5 26.5t-62.5 31t-56.5 35.5t-53.5 42.5t-43.5 49t-35.5 58t-21 66.5t-8.5 78q0 138 98 242t255 134v180q0 13 9.5 22.5t22.5 9.5h135q14 0 23 -9t9 -23v-176q57 -6 110.5 -23t87 -33.5 t63.5 -37.5t39 -29t15 -14q17 -18 5 -38l-81 -146q-8 -15 -23 -16q-14 -3 -27 7q-3 3 -14.5 12t-39 26.5t-58.5 32t-74.5 26t-85.5 11.5q-95 0 -155 -43t-60 -111q0 -26 8.5 -48t29.5 -41.5t39.5 -33t56 -31t60.5 -27t70 -27.5q53 -20 81 -31.5t76 -35t75.5 -42.5t62 -50 t53 -63.5t31.5 -76.5t13 -94z" /> +<glyph unicode="" horiz-adv-x="898" d="M898 1066v-102q0 -14 -9 -23t-23 -9h-168q-23 -144 -129 -234t-276 -110q167 -178 459 -536q14 -16 4 -34q-8 -18 -29 -18h-195q-16 0 -25 12q-306 367 -498 571q-9 9 -9 22v127q0 13 9.5 22.5t22.5 9.5h112q132 0 212.5 43t102.5 125h-427q-14 0 -23 9t-9 23v102 q0 14 9 23t23 9h413q-57 113 -268 113h-145q-13 0 -22.5 9.5t-9.5 22.5v133q0 14 9 23t23 9h832q14 0 23 -9t9 -23v-102q0 -14 -9 -23t-23 -9h-233q47 -61 64 -144h171q14 0 23 -9t9 -23z" /> +<glyph unicode="" horiz-adv-x="1027" d="M603 0h-172q-13 0 -22.5 9t-9.5 23v330h-288q-13 0 -22.5 9t-9.5 23v103q0 13 9.5 22.5t22.5 9.5h288v85h-288q-13 0 -22.5 9t-9.5 23v104q0 13 9.5 22.5t22.5 9.5h214l-321 578q-8 16 0 32q10 16 28 16h194q19 0 29 -18l215 -425q19 -38 56 -125q10 24 30.5 68t27.5 61 l191 420q8 19 29 19h191q17 0 27 -16q9 -14 1 -31l-313 -579h215q13 0 22.5 -9.5t9.5 -22.5v-104q0 -14 -9.5 -23t-22.5 -9h-290v-85h290q13 0 22.5 -9.5t9.5 -22.5v-103q0 -14 -9.5 -23t-22.5 -9h-290v-330q0 -13 -9.5 -22.5t-22.5 -9.5z" /> +<glyph 
unicode="" horiz-adv-x="1280" d="M1043 971q0 100 -65 162t-171 62h-320v-448h320q106 0 171 62t65 162zM1280 971q0 -193 -126.5 -315t-326.5 -122h-340v-118h505q14 0 23 -9t9 -23v-128q0 -14 -9 -23t-23 -9h-505v-192q0 -14 -9.5 -23t-22.5 -9h-167q-14 0 -23 9t-9 23v192h-224q-14 0 -23 9t-9 23v128 q0 14 9 23t23 9h224v118h-224q-14 0 -23 9t-9 23v149q0 13 9 22.5t23 9.5h224v629q0 14 9 23t23 9h539q200 0 326.5 -122t126.5 -315z" /> +<glyph unicode="" horiz-adv-x="1792" d="M514 341l81 299h-159l75 -300q1 -1 1 -3t1 -3q0 1 0.5 3.5t0.5 3.5zM630 768l35 128h-292l32 -128h225zM822 768h139l-35 128h-70zM1271 340l78 300h-162l81 -299q0 -1 0.5 -3.5t1.5 -3.5q0 1 0.5 3t0.5 3zM1382 768l33 128h-297l34 -128h230zM1792 736v-64q0 -14 -9 -23 t-23 -9h-213l-164 -616q-7 -24 -31 -24h-159q-24 0 -31 24l-166 616h-209l-167 -616q-7 -24 -31 -24h-159q-11 0 -19.5 7t-10.5 17l-160 616h-208q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h175l-33 128h-142q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h109l-89 344q-5 15 5 28 q10 12 26 12h137q26 0 31 -24l90 -360h359l97 360q7 24 31 24h126q24 0 31 -24l98 -360h365l93 360q5 24 31 24h137q16 0 26 -12q10 -13 5 -28l-91 -344h111q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-145l-34 -128h179q14 0 23 -9t9 -23z" /> +<glyph unicode="" horiz-adv-x="1280" d="M1167 896q18 -182 -131 -258q117 -28 175 -103t45 -214q-7 -71 -32.5 -125t-64.5 -89t-97 -58.5t-121.5 -34.5t-145.5 -15v-255h-154v251q-80 0 -122 1v-252h-154v255q-18 0 -54 0.5t-55 0.5h-200l31 183h111q50 0 58 51v402h16q-6 1 -16 1v287q-13 68 -89 68h-111v164 l212 -1q64 0 97 1v252h154v-247q82 2 122 2v245h154v-252q79 -7 140 -22.5t113 -45t82.5 -78t36.5 -114.5zM952 351q0 36 -15 64t-37 46t-57.5 30.5t-65.5 18.5t-74 9t-69 3t-64.5 -1t-47.5 -1v-338q8 0 37 -0.5t48 -0.5t53 1.5t58.5 4t57 8.5t55.5 14t47.5 21t39.5 30 t24.5 40t9.5 51zM881 827q0 33 -12.5 58.5t-30.5 42t-48 28t-55 16.5t-61.5 8t-58 2.5t-54 -1t-39.5 -0.5v-307q5 0 34.5 -0.5t46.5 0t50 2t55 5.5t51.5 11t48.5 18.5t37 27t27 38.5t9 51z" /> +<glyph unicode="" horiz-adv-x="1280" d="M1280 768v-800q0 -40 -28 -68t-68 -28h-1088q-40 0 
-68 28t-28 68v1344q0 40 28 68t68 28h544v-544q0 -40 28 -68t68 -28h544zM1277 896h-509v509q82 -15 132 -65l312 -312q50 -50 65 -132z" /> +<glyph unicode="" horiz-adv-x="1280" d="M1024 160v64q0 14 -9 23t-23 9h-704q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h704q14 0 23 9t9 23zM1024 416v64q0 14 -9 23t-23 9h-704q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h704q14 0 23 9t9 23zM1280 768v-800q0 -40 -28 -68t-68 -28h-1088q-40 0 -68 28 t-28 68v1344q0 40 28 68t68 28h544v-544q0 -40 28 -68t68 -28h544zM1277 896h-509v509q82 -15 132 -65l312 -312q50 -50 65 -132z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1191 1128h177l-72 218l-12 47q-2 16 -2 20h-4l-3 -20q0 -1 -3.5 -18t-7.5 -29zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9t9 -23zM1572 -23 v-233h-584v90l369 529q12 18 21 27l11 9v3q-2 0 -6.5 -0.5t-7.5 -0.5q-12 -3 -30 -3h-232v-115h-120v229h567v-89l-369 -530q-6 -8 -21 -26l-11 -11v-2l14 2q9 2 30 2h248v119h121zM1661 874v-106h-288v106h75l-47 144h-243l-47 -144h75v-106h-287v106h70l230 662h162 l230 -662h70z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1191 104h177l-72 218l-12 47q-2 16 -2 20h-4l-3 -20q0 -1 -3.5 -18t-7.5 -29zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9t9 -23zM1661 -150 v-106h-288v106h75l-47 144h-243l-47 -144h75v-106h-287v106h70l230 662h162l230 -662h70zM1572 1001v-233h-584v90l369 529q12 18 21 27l11 9v3q-2 0 -6.5 -0.5t-7.5 -0.5q-12 -3 -30 -3h-232v-115h-120v229h567v-89l-369 -530q-6 -8 -21 -26l-11 -10v-3l14 3q9 1 30 1h248 v119h121z" /> +<glyph unicode="" horiz-adv-x="1792" d="M736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9t9 -23zM1792 -32v-192q0 -14 -9 -23t-23 -9h-832q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h832 q14 0 23 -9t9 -23zM1600 480v-192q0 -14 -9 -23t-23 
-9h-640q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h640q14 0 23 -9t9 -23zM1408 992v-192q0 -14 -9 -23t-23 -9h-448q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h448q14 0 23 -9t9 -23zM1216 1504v-192q0 -14 -9 -23t-23 -9h-256 q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h256q14 0 23 -9t9 -23z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1216 -32v-192q0 -14 -9 -23t-23 -9h-256q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h256q14 0 23 -9t9 -23zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192 q14 0 23 -9t9 -23zM1408 480v-192q0 -14 -9 -23t-23 -9h-448q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h448q14 0 23 -9t9 -23zM1600 992v-192q0 -14 -9 -23t-23 -9h-640q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h640q14 0 23 -9t9 -23zM1792 1504v-192q0 -14 -9 -23t-23 -9h-832 q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h832q14 0 23 -9t9 -23z" /> +<glyph unicode="" d="M1346 223q0 63 -44 116t-103 53q-52 0 -83 -37t-31 -94t36.5 -95t104.5 -38q50 0 85 27t35 68zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9t9 -23 zM1486 165q0 -62 -13 -121.5t-41 -114t-68 -95.5t-98.5 -65.5t-127.5 -24.5q-62 0 -108 16q-24 8 -42 15l39 113q15 -7 31 -11q37 -13 75 -13q84 0 134.5 58.5t66.5 145.5h-2q-21 -23 -61.5 -37t-84.5 -14q-106 0 -173 71.5t-67 172.5q0 105 72 178t181 73q123 0 205 -94.5 t82 -252.5zM1456 882v-114h-469v114h167v432q0 7 0.5 19t0.5 17v16h-2l-7 -12q-8 -13 -26 -31l-62 -58l-82 86l192 185h123v-654h165z" /> +<glyph unicode="" d="M1346 1247q0 63 -44 116t-103 53q-52 0 -83 -37t-31 -94t36.5 -95t104.5 -38q50 0 85 27t35 68zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9 t9 -23zM1456 -142v-114h-469v114h167v432q0 7 0.5 19t0.5 17v16h-2l-7 -12q-8 -13 -26 -31l-62 -58l-82 86l192 185h123v-654h165zM1486 1189q0 -62 -13 -121.5t-41 -114t-68 -95.5t-98.5 -65.5t-127.5 
-24.5q-62 0 -108 16q-24 8 -42 15l39 113q15 -7 31 -11q37 -13 75 -13 q84 0 134.5 58.5t66.5 145.5h-2q-21 -23 -61.5 -37t-84.5 -14q-106 0 -173 71.5t-67 172.5q0 105 72 178t181 73q123 0 205 -94.5t82 -252.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M256 192q0 26 -19 45t-45 19q-27 0 -45.5 -19t-18.5 -45q0 -27 18.5 -45.5t45.5 -18.5q26 0 45 18.5t19 45.5zM416 704v-640q0 -26 -19 -45t-45 -19h-288q-26 0 -45 19t-19 45v640q0 26 19 45t45 19h288q26 0 45 -19t19 -45zM1600 704q0 -86 -55 -149q15 -44 15 -76 q3 -76 -43 -137q17 -56 0 -117q-15 -57 -54 -94q9 -112 -49 -181q-64 -76 -197 -78h-36h-76h-17q-66 0 -144 15.5t-121.5 29t-120.5 39.5q-123 43 -158 44q-26 1 -45 19.5t-19 44.5v641q0 25 18 43.5t43 20.5q24 2 76 59t101 121q68 87 101 120q18 18 31 48t17.5 48.5 t13.5 60.5q7 39 12.5 61t19.5 52t34 50q19 19 45 19q46 0 82.5 -10.5t60 -26t40 -40.5t24 -45t12 -50t5 -45t0.5 -39q0 -38 -9.5 -76t-19 -60t-27.5 -56q-3 -6 -10 -18t-11 -22t-8 -24h277q78 0 135 -57t57 -135z" /> +<glyph unicode="" horiz-adv-x="1664" d="M256 960q0 -26 -19 -45t-45 -19q-27 0 -45.5 19t-18.5 45q0 27 18.5 45.5t45.5 18.5q26 0 45 -18.5t19 -45.5zM416 448v640q0 26 -19 45t-45 19h-288q-26 0 -45 -19t-19 -45v-640q0 -26 19 -45t45 -19h288q26 0 45 19t19 45zM1545 597q55 -61 55 -149q-1 -78 -57.5 -135 t-134.5 -57h-277q4 -14 8 -24t11 -22t10 -18q18 -37 27 -57t19 -58.5t10 -76.5q0 -24 -0.5 -39t-5 -45t-12 -50t-24 -45t-40 -40.5t-60 -26t-82.5 -10.5q-26 0 -45 19q-20 20 -34 50t-19.5 52t-12.5 61q-9 42 -13.5 60.5t-17.5 48.5t-31 48q-33 33 -101 120q-49 64 -101 121 t-76 59q-25 2 -43 20.5t-18 43.5v641q0 26 19 44.5t45 19.5q35 1 158 44q77 26 120.5 39.5t121.5 29t144 15.5h17h76h36q133 -2 197 -78q58 -69 49 -181q39 -37 54 -94q17 -61 0 -117q46 -61 43 -137q0 -32 -15 -76z" /> +<glyph unicode="" d="M919 233v157q0 50 -29 50q-17 0 -33 -16v-224q16 -16 33 -16q29 0 29 49zM1103 355h66v34q0 51 -33 51t-33 -51v-34zM532 621v-70h-80v-423h-74v423h-78v70h232zM733 495v-367h-67v40q-39 -45 -76 -45q-33 0 -42 28q-6 16 -6 54v290h66v-270q0 -24 1 -26q1 -15 15 -15 q20 0 42 31v280h67zM985 
384v-146q0 -52 -7 -73q-12 -42 -53 -42q-35 0 -68 41v-36h-67v493h67v-161q32 40 68 40q41 0 53 -42q7 -21 7 -74zM1236 255v-9q0 -29 -2 -43q-3 -22 -15 -40q-27 -40 -80 -40q-52 0 -81 38q-21 27 -21 86v129q0 59 20 86q29 38 80 38t78 -38 q21 -28 21 -86v-76h-133v-65q0 -51 34 -51q24 0 30 26q0 1 0.5 7t0.5 16.5v21.5h68zM785 1079v-156q0 -51 -32 -51t-32 51v156q0 52 32 52t32 -52zM1318 366q0 177 -19 260q-10 44 -43 73.5t-76 34.5q-136 15 -412 15q-275 0 -411 -15q-44 -5 -76.5 -34.5t-42.5 -73.5 q-20 -87 -20 -260q0 -176 20 -260q10 -43 42.5 -73t75.5 -35q137 -15 412 -15t412 15q43 5 75.5 35t42.5 73q20 84 20 260zM563 1017l90 296h-75l-51 -195l-53 195h-78l24 -69t23 -69q35 -103 46 -158v-201h74v201zM852 936v130q0 58 -21 87q-29 38 -78 38q-51 0 -78 -38 q-21 -29 -21 -87v-130q0 -58 21 -87q27 -38 78 -38q49 0 78 38q21 27 21 87zM1033 816h67v370h-67v-283q-22 -31 -42 -31q-15 0 -16 16q-1 2 -1 26v272h-67v-293q0 -37 6 -55q11 -27 43 -27q36 0 77 45v-40zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960 q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M971 292v-211q0 -67 -39 -67q-23 0 -45 22v301q22 22 45 22q39 0 39 -67zM1309 291v-46h-90v46q0 68 45 68t45 -68zM343 509h107v94h-312v-94h105v-569h100v569zM631 -60h89v494h-89v-378q-30 -42 -57 -42q-18 0 -21 21q-1 3 -1 35v364h-89v-391q0 -49 8 -73 q12 -37 58 -37q48 0 102 61v-54zM1060 88v197q0 73 -9 99q-17 56 -71 56q-50 0 -93 -54v217h-89v-663h89v48q45 -55 93 -55q54 0 71 55q9 27 9 100zM1398 98v13h-91q0 -51 -2 -61q-7 -36 -40 -36q-46 0 -46 69v87h179v103q0 79 -27 116q-39 51 -106 51q-68 0 -107 -51 q-28 -37 -28 -116v-173q0 -79 29 -116q39 -51 108 -51q72 0 108 53q18 27 21 54q2 9 2 58zM790 1011v210q0 69 -43 69t-43 -69v-210q0 -70 43 -70t43 70zM1509 260q0 -234 -26 -350q-14 -59 -58 -99t-102 -46q-184 -21 -555 -21t-555 21q-58 6 -102.5 46t-57.5 99 q-26 112 -26 350q0 234 26 350q14 59 58 99t103 47q183 20 554 20t555 -20q58 -7 102.5 -47t57.5 -99q26 -112 26 -350zM511 1536h102l-121 -399v-271h-100v271q-14 74 -61 
212q-37 103 -65 187h106l71 -263zM881 1203v-175q0 -81 -28 -118q-37 -51 -106 -51q-67 0 -105 51 q-28 38 -28 118v175q0 80 28 117q38 51 105 51q69 0 106 -51q28 -37 28 -117zM1216 1365v-499h-91v55q-53 -62 -103 -62q-46 0 -59 37q-8 24 -8 75v394h91v-367q0 -33 1 -35q3 -22 21 -22q27 0 57 43v381h91z" /> +<glyph unicode="" horiz-adv-x="1408" d="M597 869q-10 -18 -257 -456q-27 -46 -65 -46h-239q-21 0 -31 17t0 36l253 448q1 0 0 1l-161 279q-12 22 -1 37q9 15 32 15h239q40 0 66 -45zM1403 1511q11 -16 0 -37l-528 -934v-1l336 -615q11 -20 1 -37q-10 -15 -32 -15h-239q-42 0 -66 45l-339 622q18 32 531 942 q25 45 64 45h241q22 0 31 -15z" /> +<glyph unicode="" d="M685 771q0 1 -126 222q-21 34 -52 34h-184q-18 0 -26 -11q-7 -12 1 -29l125 -216v-1l-196 -346q-9 -14 0 -28q8 -13 24 -13h185q31 0 50 36zM1309 1268q-7 12 -24 12h-187q-30 0 -49 -35l-411 -729q1 -2 262 -481q20 -35 52 -35h184q18 0 25 12q8 13 -1 28l-260 476v1 l409 723q8 16 0 28zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1280 640q0 37 -30 54l-512 320q-31 20 -65 2q-33 -18 -33 -56v-640q0 -38 33 -56q16 -8 31 -8q20 0 34 10l512 320q30 17 30 54zM1792 640q0 -96 -1 -150t-8.5 -136.5t-22.5 -147.5q-16 -73 -69 -123t-124 -58q-222 -25 -671 -25t-671 25q-71 8 -124.5 58t-69.5 123 q-14 65 -21.5 147.5t-8.5 136.5t-1 150t1 150t8.5 136.5t22.5 147.5q16 73 69 123t124 58q222 25 671 25t671 -25q71 -8 124.5 -58t69.5 -123q14 -65 21.5 -147.5t8.5 -136.5t1 -150z" /> +<glyph unicode="" horiz-adv-x="1792" d="M402 829l494 -305l-342 -285l-490 319zM1388 274v-108l-490 -293v-1l-1 1l-1 -1v1l-489 293v108l147 -96l342 284v2l1 -1l1 1v-2l343 -284zM554 1418l342 -285l-494 -304l-338 270zM1390 829l338 -271l-489 -319l-343 285zM1239 1418l489 -319l-338 -270l-494 304z" /> +<glyph unicode="" horiz-adv-x="1408" d="M928 135v-151l-707 -1v151zM1169 481v-701l-1 -35v-1h-1132l-35 1h-1v736h121v-618h928v618h120zM241 393l704 -65l-13 -150l-705 65zM309 709l683 
-183l-39 -146l-683 183zM472 1058l609 -360l-77 -130l-609 360zM832 1389l398 -585l-124 -85l-399 584zM1285 1536 l121 -697l-149 -26l-121 697z" /> +<glyph unicode="" d="M1362 110v648h-135q20 -63 20 -131q0 -126 -64 -232.5t-174 -168.5t-240 -62q-197 0 -337 135.5t-140 327.5q0 68 20 131h-141v-648q0 -26 17.5 -43.5t43.5 -17.5h1069q25 0 43 17.5t18 43.5zM1078 643q0 124 -90.5 211.5t-218.5 87.5q-127 0 -217.5 -87.5t-90.5 -211.5 t90.5 -211.5t217.5 -87.5q128 0 218.5 87.5t90.5 211.5zM1362 1003v165q0 28 -20 48.5t-49 20.5h-174q-29 0 -49 -20.5t-20 -48.5v-165q0 -29 20 -49t49 -20h174q29 0 49 20t20 49zM1536 1211v-1142q0 -81 -58 -139t-139 -58h-1142q-81 0 -139 58t-58 139v1142q0 81 58 139 t139 58h1142q81 0 139 -58t58 -139z" /> +<glyph unicode="" d="M1248 1408q119 0 203.5 -84.5t84.5 -203.5v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960zM698 640q0 88 -62 150t-150 62t-150 -62t-62 -150t62 -150t150 -62t150 62t62 150zM1262 640q0 88 -62 150 t-150 62t-150 -62t-62 -150t62 -150t150 -62t150 62t62 150z" /> +<glyph unicode="" d="M768 914l201 -306h-402zM1133 384h94l-459 691l-459 -691h94l104 160h522zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M815 677q8 -63 -50.5 -101t-111.5 -6q-39 17 -53.5 58t-0.5 82t52 58q36 18 72.5 12t64 -35.5t27.5 -67.5zM926 698q-14 107 -113 164t-197 13q-63 -28 -100.5 -88.5t-34.5 -129.5q4 -91 77.5 -155t165.5 -56q91 8 152 84t50 168zM1165 1240q-20 27 -56 44.5t-58 22 t-71 12.5q-291 47 -566 -2q-43 -7 -66 -12t-55 -22t-50 -43q30 -28 76 -45.5t73.5 -22t87.5 -11.5q228 -29 448 -1q63 8 89.5 12t72.5 21.5t75 46.5zM1222 205q-8 -26 -15.5 -76.5t-14 -84t-28.5 -70t-58 -56.5q-86 -48 -189.5 -71.5t-202 -22t-201.5 18.5q-46 8 -81.5 18 t-76.5 27t-73 43.5t-52 61.5q-25 96 -57 292l6 16l18 9q223 -148 506.5 -148t507.5 148q21 -6 24 -23t-5 -45t-8 -37zM1403 1166q-26 -167 -111 -655q-5 -30 -27 
-56t-43.5 -40t-54.5 -31q-252 -126 -610 -88q-248 27 -394 139q-15 12 -25.5 26.5t-17 35t-9 34t-6 39.5 t-5.5 35q-9 50 -26.5 150t-28 161.5t-23.5 147.5t-22 158q3 26 17.5 48.5t31.5 37.5t45 30t46 22.5t48 18.5q125 46 313 64q379 37 676 -50q155 -46 215 -122q16 -20 16.5 -51t-5.5 -54z" /> +<glyph unicode="" d="M848 666q0 43 -41 66t-77 1q-43 -20 -42.5 -72.5t43.5 -70.5q39 -23 81 4t36 72zM928 682q8 -66 -36 -121t-110 -61t-119 40t-56 113q-2 49 25.5 93t72.5 64q70 31 141.5 -10t81.5 -118zM1100 1073q-20 -21 -53.5 -34t-53 -16t-63.5 -8q-155 -20 -324 0q-44 6 -63 9.5 t-52.5 16t-54.5 32.5q13 19 36 31t40 15.5t47 8.5q198 35 408 1q33 -5 51 -8.5t43 -16t39 -31.5zM1142 327q0 7 5.5 26.5t3 32t-17.5 16.5q-161 -106 -365 -106t-366 106l-12 -6l-5 -12q26 -154 41 -210q47 -81 204 -108q249 -46 428 53q34 19 49 51.5t22.5 85.5t12.5 71z M1272 1020q9 53 -8 75q-43 55 -155 88q-216 63 -487 36q-132 -12 -226 -46q-38 -15 -59.5 -25t-47 -34t-29.5 -54q8 -68 19 -138t29 -171t24 -137q1 -5 5 -31t7 -36t12 -27t22 -28q105 -80 284 -100q259 -28 440 63q24 13 39.5 23t31 29t19.5 40q48 267 80 473zM1536 1120 v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1024" d="M390 1408h219v-388h364v-241h-364v-394q0 -136 14 -172q13 -37 52 -60q50 -31 117 -31q117 0 232 76v-242q-102 -48 -178 -65q-77 -19 -173 -19q-105 0 -186 27q-78 25 -138 75q-58 51 -79 105q-22 54 -22 161v539h-170v217q91 30 155 84q64 55 103 132q39 78 54 196z " /> +<glyph unicode="" d="M1123 127v181q-88 -56 -174 -56q-51 0 -88 23q-29 17 -39 45q-11 30 -11 129v295h274v181h-274v291h-164q-11 -90 -40 -147t-78 -99q-48 -40 -116 -63v-163h127v-404q0 -78 17 -121q17 -42 59 -78q43 -37 104 -57q62 -20 140 -20q67 0 129 14q57 13 134 49zM1536 1120 v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="768" d="M765 237q8 -19 -5 -35l-350 -384q-10 -10 -23 
-10q-14 0 -24 10l-355 384q-13 16 -5 35q9 19 29 19h224v1248q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1248h224q21 0 29 -19z" /> +<glyph unicode="" horiz-adv-x="768" d="M765 1043q-9 -19 -29 -19h-224v-1248q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v1248h-224q-21 0 -29 19t5 35l350 384q10 10 23 10q14 0 24 -10l355 -384q13 -16 5 -35z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1792 736v-192q0 -14 -9 -23t-23 -9h-1248v-224q0 -21 -19 -29t-35 5l-384 350q-10 10 -10 23q0 14 10 24l384 354q16 14 35 6q19 -9 19 -29v-224h1248q14 0 23 -9t9 -23z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1728 643q0 -14 -10 -24l-384 -354q-16 -14 -35 -6q-19 9 -19 29v224h-1248q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h1248v224q0 21 19 29t35 -5l384 -350q10 -10 10 -23z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1393 321q-39 -125 -123 -250q-129 -196 -257 -196q-49 0 -140 32q-86 32 -151 32q-61 0 -142 -33q-81 -34 -132 -34q-152 0 -301 259q-147 261 -147 503q0 228 113 374q112 144 284 144q72 0 177 -30q104 -30 138 -30q45 0 143 34q102 34 173 34q119 0 213 -65 q52 -36 104 -100q-79 -67 -114 -118q-65 -94 -65 -207q0 -124 69 -223t158 -126zM1017 1494q0 -61 -29 -136q-30 -75 -93 -138q-54 -54 -108 -72q-37 -11 -104 -17q3 149 78 257q74 107 250 148q1 -3 2.5 -11t2.5 -11q0 -4 0.5 -10t0.5 -10z" /> +<glyph unicode="" horiz-adv-x="1664" d="M682 530v-651l-682 94v557h682zM682 1273v-659h-682v565zM1664 530v-786l-907 125v661h907zM1664 1408v-794h-907v669z" /> +<glyph unicode="" horiz-adv-x="1408" d="M493 1053q16 0 27.5 11.5t11.5 27.5t-11.5 27.5t-27.5 11.5t-27 -11.5t-11 -27.5t11 -27.5t27 -11.5zM915 1053q16 0 27 11.5t11 27.5t-11 27.5t-27 11.5t-27.5 -11.5t-11.5 -27.5t11.5 -27.5t27.5 -11.5zM103 869q42 0 72 -30t30 -72v-430q0 -43 -29.5 -73t-72.5 -30 t-73 30t-30 73v430q0 42 30 72t73 30zM1163 850v-666q0 -46 -32 -78t-77 -32h-75v-227q0 -43 -30 -73t-73 -30t-73 30t-30 73v227h-138v-227q0 -43 -30 -73t-73 -30q-42 0 -72 30t-30 73l-1 227h-74q-46 0 -78 32t-32 78v666h918zM931 1255q107 -55 171 -153.5t64 -215.5 h-925q0 117 64 215.5t172 153.5l-71 131q-7 13 
5 20q13 6 20 -6l72 -132q95 42 201 42t201 -42l72 132q7 12 20 6q12 -7 5 -20zM1408 767v-430q0 -43 -30 -73t-73 -30q-42 0 -72 30t-30 73v430q0 43 30 72.5t72 29.5q43 0 73 -29.5t30 -72.5z" /> +<glyph unicode="" d="M663 1125q-11 -1 -15.5 -10.5t-8.5 -9.5q-5 -1 -5 5q0 12 19 15h10zM750 1111q-4 -1 -11.5 6.5t-17.5 4.5q24 11 32 -2q3 -6 -3 -9zM399 684q-4 1 -6 -3t-4.5 -12.5t-5.5 -13.5t-10 -13q-7 -10 -1 -12q4 -1 12.5 7t12.5 18q1 3 2 7t2 6t1.5 4.5t0.5 4v3t-1 2.5t-3 2z M1254 325q0 18 -55 42q4 15 7.5 27.5t5 26t3 21.5t0.5 22.5t-1 19.5t-3.5 22t-4 20.5t-5 25t-5.5 26.5q-10 48 -47 103t-72 75q24 -20 57 -83q87 -162 54 -278q-11 -40 -50 -42q-31 -4 -38.5 18.5t-8 83.5t-11.5 107q-9 39 -19.5 69t-19.5 45.5t-15.5 24.5t-13 15t-7.5 7 q-14 62 -31 103t-29.5 56t-23.5 33t-15 40q-4 21 6 53.5t4.5 49.5t-44.5 25q-15 3 -44.5 18t-35.5 16q-8 1 -11 26t8 51t36 27q37 3 51 -30t4 -58q-11 -19 -2 -26.5t30 -0.5q13 4 13 36v37q-5 30 -13.5 50t-21 30.5t-23.5 15t-27 7.5q-107 -8 -89 -134q0 -15 -1 -15 q-9 9 -29.5 10.5t-33 -0.5t-15.5 5q1 57 -16 90t-45 34q-27 1 -41.5 -27.5t-16.5 -59.5q-1 -15 3.5 -37t13 -37.5t15.5 -13.5q10 3 16 14q4 9 -7 8q-7 0 -15.5 14.5t-9.5 33.5q-1 22 9 37t34 14q17 0 27 -21t9.5 -39t-1.5 -22q-22 -15 -31 -29q-8 -12 -27.5 -23.5 t-20.5 -12.5q-13 -14 -15.5 -27t7.5 -18q14 -8 25 -19.5t16 -19t18.5 -13t35.5 -6.5q47 -2 102 15q2 1 23 7t34.5 10.5t29.5 13t21 17.5q9 14 20 8q5 -3 6.5 -8.5t-3 -12t-16.5 -9.5q-20 -6 -56.5 -21.5t-45.5 -19.5q-44 -19 -70 -23q-25 -5 -79 2q-10 2 -9 -2t17 -19 q25 -23 67 -22q17 1 36 7t36 14t33.5 17.5t30 17t24.5 12t17.5 2.5t8.5 -11q0 -2 -1 -4.5t-4 -5t-6 -4.5t-8.5 -5t-9 -4.5t-10 -5t-9.5 -4.5q-28 -14 -67.5 -44t-66.5 -43t-49 -1q-21 11 -63 73q-22 31 -25 22q-1 -3 -1 -10q0 -25 -15 -56.5t-29.5 -55.5t-21 -58t11.5 -63 q-23 -6 -62.5 -90t-47.5 -141q-2 -18 -1.5 -69t-5.5 -59q-8 -24 -29 -3q-32 31 -36 94q-2 28 4 56q4 19 -1 18l-4 -5q-36 -65 10 -166q5 -12 25 -28t24 -20q20 -23 104 -90.5t93 -76.5q16 -15 17.5 -38t-14 -43t-45.5 -23q8 -15 29 -44.5t28 -54t7 -70.5q46 24 7 92 q-4 8 -10.5 16t-9.5 12t-2 6q3 5 13 9.5t20 -2.5q46 
-52 166 -36q133 15 177 87q23 38 34 30q12 -6 10 -52q-1 -25 -23 -92q-9 -23 -6 -37.5t24 -15.5q3 19 14.5 77t13.5 90q2 21 -6.5 73.5t-7.5 97t23 70.5q15 18 51 18q1 37 34.5 53t72.5 10.5t60 -22.5zM626 1152 q3 17 -2.5 30t-11.5 15q-9 2 -9 -7q2 -5 5 -6q10 0 7 -15q-3 -20 8 -20q3 0 3 3zM1045 955q-2 8 -6.5 11.5t-13 5t-14.5 5.5q-5 3 -9.5 8t-7 8t-5.5 6.5t-4 4t-4 -1.5q-14 -16 7 -43.5t39 -31.5q9 -1 14.5 8t3.5 20zM867 1168q0 11 -5 19.5t-11 12.5t-9 3q-14 -1 -7 -7l4 -2 q14 -4 18 -31q0 -3 8 2zM921 1401q0 2 -2.5 5t-9 7t-9.5 6q-15 15 -24 15q-9 -1 -11.5 -7.5t-1 -13t-0.5 -12.5q-1 -4 -6 -10.5t-6 -9t3 -8.5q4 -3 8 0t11 9t15 9q1 1 9 1t15 2t9 7zM1486 60q20 -12 31 -24.5t12 -24t-2.5 -22.5t-15.5 -22t-23.5 -19.5t-30 -18.5 t-31.5 -16.5t-32 -15.5t-27 -13q-38 -19 -85.5 -56t-75.5 -64q-17 -16 -68 -19.5t-89 14.5q-18 9 -29.5 23.5t-16.5 25.5t-22 19.5t-47 9.5q-44 1 -130 1q-19 0 -57 -1.5t-58 -2.5q-44 -1 -79.5 -15t-53.5 -30t-43.5 -28.5t-53.5 -11.5q-29 1 -111 31t-146 43q-19 4 -51 9.5 t-50 9t-39.5 9.5t-33.5 14.5t-17 19.5q-10 23 7 66.5t18 54.5q1 16 -4 40t-10 42.5t-4.5 36.5t10.5 27q14 12 57 14t60 12q30 18 42 35t12 51q21 -73 -32 -106q-32 -20 -83 -15q-34 3 -43 -10q-13 -15 5 -57q2 -6 8 -18t8.5 -18t4.5 -17t1 -22q0 -15 -17 -49t-14 -48 q3 -17 37 -26q20 -6 84.5 -18.5t99.5 -20.5q24 -6 74 -22t82.5 -23t55.5 -4q43 6 64.5 28t23 48t-7.5 58.5t-19 52t-20 36.5q-121 190 -169 242q-68 74 -113 40q-11 -9 -15 15q-3 16 -2 38q1 29 10 52t24 47t22 42q8 21 26.5 72t29.5 78t30 61t39 54q110 143 124 195 q-12 112 -16 310q-2 90 24 151.5t106 104.5q39 21 104 21q53 1 106 -13.5t89 -41.5q57 -42 91.5 -121.5t29.5 -147.5q-5 -95 30 -214q34 -113 133 -218q55 -59 99.5 -163t59.5 -191q8 -49 5 -84.5t-12 -55.5t-20 -22q-10 -2 -23.5 -19t-27 -35.5t-40.5 -33.5t-61 -14 q-18 1 -31.5 5t-22.5 13.5t-13.5 15.5t-11.5 20.5t-9 19.5q-22 37 -41 30t-28 -49t7 -97q20 -70 1 -195q-10 -65 18 -100.5t73 -33t85 35.5q59 49 89.5 66.5t103.5 42.5q53 18 77 36.5t18.5 34.5t-25 28.5t-51.5 23.5q-33 11 -49.5 48t-15 72.5t15.5 47.5q1 -31 8 -56.5 t14.5 -40.5t20.5 -28.5t21 -19t21.5 -13t16.5 -9.5z" /> 
+<glyph unicode="" d="M1024 36q-42 241 -140 498h-2l-2 -1q-16 -6 -43 -16.5t-101 -49t-137 -82t-131 -114.5t-103 -148l-15 11q184 -150 418 -150q132 0 256 52zM839 643q-21 49 -53 111q-311 -93 -673 -93q-1 -7 -1 -21q0 -124 44 -236.5t124 -201.5q50 89 123.5 166.5t142.5 124.5t130.5 81 t99.5 48l37 13q4 1 13 3.5t13 4.5zM732 855q-120 213 -244 378q-138 -65 -234 -186t-128 -272q302 0 606 80zM1416 536q-210 60 -409 29q87 -239 128 -469q111 75 185 189.5t96 250.5zM611 1277q-1 0 -2 -1q1 1 2 1zM1201 1132q-185 164 -433 164q-76 0 -155 -19 q131 -170 246 -382q69 26 130 60.5t96.5 61.5t65.5 57t37.5 40.5zM1424 647q-3 232 -149 410l-1 -1q-9 -12 -19 -24.5t-43.5 -44.5t-71 -60.5t-100 -65t-131.5 -64.5q25 -53 44 -95q2 -6 6.5 -17.5t7.5 -16.5q36 5 74.5 7t73.5 2t69 -1.5t64 -4t56.5 -5.5t48 -6.5t36.5 -6 t25 -4.5zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1173 473q0 50 -19.5 91.5t-48.5 68.5t-73 49t-82.5 34t-87.5 23l-104 24q-30 7 -44 10.5t-35 11.5t-30 16t-16.5 21t-7.5 30q0 77 144 77q43 0 77 -12t54 -28.5t38 -33.5t40 -29t48 -12q47 0 75.5 32t28.5 77q0 55 -56 99.5t-142 67.5t-182 23q-68 0 -132 -15.5 t-119.5 -47t-89 -87t-33.5 -128.5q0 -61 19 -106.5t56 -75.5t80 -48.5t103 -32.5l146 -36q90 -22 112 -36q32 -20 32 -60q0 -39 -40 -64.5t-105 -25.5q-51 0 -91.5 16t-65 38.5t-45.5 45t-46 38.5t-54 16q-50 0 -75.5 -30t-25.5 -75q0 -92 122 -157.5t291 -65.5 q73 0 140 18.5t122.5 53.5t88.5 93.5t33 131.5zM1536 256q0 -159 -112.5 -271.5t-271.5 -112.5q-130 0 -234 80q-77 -16 -150 -16q-143 0 -273.5 55.5t-225 150t-150 225t-55.5 273.5q0 73 16 150q-80 104 -80 234q0 159 112.5 271.5t271.5 112.5q130 0 234 -80 q77 16 150 16q143 0 273.5 -55.5t225 -150t150 -225t55.5 -273.5q0 -73 -16 -150q80 -104 80 -234z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1483 512l-587 -587q-52 -53 -127.5 -53t-128.5 53l-587 587q-53 53 -53 128t53 128l587 587q53 53 128 53t128 -53l265 -265l-398 -399l-188 188q-42 42 -99 42q-59 0 -100 
-41l-120 -121q-42 -40 -42 -99q0 -58 42 -100l406 -408q30 -28 67 -37l6 -4h28q60 0 99 41 l619 619l2 -3q53 -53 53 -128t-53 -128zM1406 1138l120 -120q14 -15 14 -36t-14 -36l-730 -730q-17 -15 -37 -15v0q-4 0 -6 1q-18 2 -30 14l-407 408q-14 15 -14 36t14 35l121 120q13 15 35 15t36 -15l252 -252l574 575q15 15 36 15t36 -15z" /> +<glyph unicode="" d="M704 192v1024q0 14 -9 23t-23 9h-480q-14 0 -23 -9t-9 -23v-1024q0 -14 9 -23t23 -9h480q14 0 23 9t9 23zM1376 576v640q0 14 -9 23t-23 9h-480q-14 0 -23 -9t-9 -23v-640q0 -14 9 -23t23 -9h480q14 0 23 9t9 23zM1536 1344v-1408q0 -26 -19 -45t-45 -19h-1408 q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h1408q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1280" d="M1280 480q0 -40 -28 -68t-68 -28q-51 0 -80 43l-227 341h-45v-132l247 -411q9 -15 9 -33q0 -26 -19 -45t-45 -19h-192v-272q0 -46 -33 -79t-79 -33h-160q-46 0 -79 33t-33 79v272h-192q-26 0 -45 19t-19 45q0 18 9 33l247 411v132h-45l-227 -341q-29 -43 -80 -43 q-40 0 -68 28t-28 68q0 29 16 53l256 384q73 107 176 107h384q103 0 176 -107l256 -384q16 -24 16 -53zM864 1280q0 -93 -65.5 -158.5t-158.5 -65.5t-158.5 65.5t-65.5 158.5t65.5 158.5t158.5 65.5t158.5 -65.5t65.5 -158.5z" /> +<glyph unicode="" horiz-adv-x="1024" d="M1024 832v-416q0 -40 -28 -68t-68 -28t-68 28t-28 68v352h-64v-912q0 -46 -33 -79t-79 -33t-79 33t-33 79v464h-64v-464q0 -46 -33 -79t-79 -33t-79 33t-33 79v912h-64v-352q0 -40 -28 -68t-68 -28t-68 28t-28 68v416q0 80 56 136t136 56h640q80 0 136 -56t56 -136z M736 1280q0 -93 -65.5 -158.5t-158.5 -65.5t-158.5 65.5t-65.5 158.5t65.5 158.5t158.5 65.5t158.5 -65.5t65.5 -158.5z" /> +<glyph unicode="" d="M773 234l350 473q16 22 24.5 59t-6 85t-61.5 79q-40 26 -83 25.5t-73.5 -17.5t-54.5 -45q-36 -40 -96 -40q-59 0 -95 40q-24 28 -54.5 45t-73.5 17.5t-84 -25.5q-46 -31 -60.5 -79t-6 -85t24.5 -59zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103 t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1472 640q0 117 -45.5 
223.5t-123 184t-184 123t-223.5 45.5t-223.5 -45.5t-184 -123t-123 -184t-45.5 -223.5t45.5 -223.5t123 -184t184 -123t223.5 -45.5t223.5 45.5t184 123t123 184t45.5 223.5zM1748 363q-4 -15 -20 -20l-292 -96v-306q0 -16 -13 -26q-15 -10 -29 -4 l-292 94l-180 -248q-10 -13 -26 -13t-26 13l-180 248l-292 -94q-14 -6 -29 4q-13 10 -13 26v306l-292 96q-16 5 -20 20q-5 17 4 29l180 248l-180 248q-9 13 -4 29q4 15 20 20l292 96v306q0 16 13 26q15 10 29 4l292 -94l180 248q9 12 26 12t26 -12l180 -248l292 94 q14 6 29 -4q13 -10 13 -26v-306l292 -96q16 -5 20 -20q5 -16 -4 -29l-180 -248l180 -248q9 -12 4 -29z" /> +<glyph unicode="" d="M1262 233q-54 -9 -110 -9q-182 0 -337 90t-245 245t-90 337q0 192 104 357q-201 -60 -328.5 -229t-127.5 -384q0 -130 51 -248.5t136.5 -204t204 -136.5t248.5 -51q144 0 273.5 61.5t220.5 171.5zM1465 318q-94 -203 -283.5 -324.5t-413.5 -121.5q-156 0 -298 61 t-245 164t-164 245t-61 298q0 153 57.5 292.5t156 241.5t235.5 164.5t290 68.5q44 2 61 -39q18 -41 -15 -72q-86 -78 -131.5 -181.5t-45.5 -218.5q0 -148 73 -273t198 -198t273 -73q118 0 228 51q41 18 72 -13q14 -14 17.5 -34t-4.5 -38z" /> +<glyph unicode="" horiz-adv-x="1792" d="M1088 704q0 26 -19 45t-45 19h-256q-26 0 -45 -19t-19 -45t19 -45t45 -19h256q26 0 45 19t19 45zM1664 896v-960q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v960q0 26 19 45t45 19h1408q26 0 45 -19t19 -45zM1728 1344v-256q0 -26 -19 -45t-45 -19h-1536 q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h1536q26 0 45 -19t19 -45z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1632 576q0 -26 -19 -45t-45 -19h-224q0 -171 -67 -290l208 -209q19 -19 19 -45t-19 -45q-18 -19 -45 -19t-45 19l-198 197q-5 -5 -15 -13t-42 -28.5t-65 -36.5t-82 -29t-97 -13v896h-128v-896q-51 0 -101.5 13.5t-87 33t-66 39t-43.5 32.5l-15 14l-183 -207 q-20 -21 -48 -21q-24 0 -43 16q-19 18 -20.5 44.5t15.5 46.5l202 227q-58 114 -58 274h-224q-26 0 -45 19t-19 45t19 45t45 19h224v294l-173 173q-19 19 -19 45t19 45t45 19t45 -19l173 -173h844l173 173q19 19 45 19t45 -19t19 -45t-19 -45l-173 -173v-294h224q26 0 45 -19 t19 -45zM1152 1152h-640q0 133 93.5 
226.5t226.5 93.5t226.5 -93.5t93.5 -226.5z" /> +<glyph unicode="" horiz-adv-x="1920" d="M1917 1016q23 -64 -150 -294q-24 -32 -65 -85q-78 -100 -90 -131q-17 -41 14 -81q17 -21 81 -82h1l1 -1l1 -1l2 -2q141 -131 191 -221q3 -5 6.5 -12.5t7 -26.5t-0.5 -34t-25 -27.5t-59 -12.5l-256 -4q-24 -5 -56 5t-52 22l-20 12q-30 21 -70 64t-68.5 77.5t-61 58 t-56.5 15.5q-3 -1 -8 -3.5t-17 -14.5t-21.5 -29.5t-17 -52t-6.5 -77.5q0 -15 -3.5 -27.5t-7.5 -18.5l-4 -5q-18 -19 -53 -22h-115q-71 -4 -146 16.5t-131.5 53t-103 66t-70.5 57.5l-25 24q-10 10 -27.5 30t-71.5 91t-106 151t-122.5 211t-130.5 272q-6 16 -6 27t3 16l4 6 q15 19 57 19l274 2q12 -2 23 -6.5t16 -8.5l5 -3q16 -11 24 -32q20 -50 46 -103.5t41 -81.5l16 -29q29 -60 56 -104t48.5 -68.5t41.5 -38.5t34 -14t27 5q2 1 5 5t12 22t13.5 47t9.5 81t0 125q-2 40 -9 73t-14 46l-6 12q-25 34 -85 43q-13 2 5 24q17 19 38 30q53 26 239 24 q82 -1 135 -13q20 -5 33.5 -13.5t20.5 -24t10.5 -32t3.5 -45.5t-1 -55t-2.5 -70.5t-1.5 -82.5q0 -11 -1 -42t-0.5 -48t3.5 -40.5t11.5 -39t22.5 -24.5q8 -2 17 -4t26 11t38 34.5t52 67t68 107.5q60 104 107 225q4 10 10 17.5t11 10.5l4 3l5 2.5t13 3t20 0.5l288 2 q39 5 64 -2.5t31 -16.5z" /> +<glyph unicode="" horiz-adv-x="1792" d="M675 252q21 34 11 69t-45 50q-34 14 -73 1t-60 -46q-22 -34 -13 -68.5t43 -50.5t74.5 -2.5t62.5 47.5zM769 373q8 13 3.5 26.5t-17.5 18.5q-14 5 -28.5 -0.5t-21.5 -18.5q-17 -31 13 -45q14 -5 29 0.5t22 18.5zM943 266q-45 -102 -158 -150t-224 -12 q-107 34 -147.5 126.5t6.5 187.5q47 93 151.5 139t210.5 19q111 -29 158.5 -119.5t2.5 -190.5zM1255 426q-9 96 -89 170t-208.5 109t-274.5 21q-223 -23 -369.5 -141.5t-132.5 -264.5q9 -96 89 -170t208.5 -109t274.5 -21q223 23 369.5 141.5t132.5 264.5zM1563 422 q0 -68 -37 -139.5t-109 -137t-168.5 -117.5t-226 -83t-270.5 -31t-275 33.5t-240.5 93t-171.5 151t-65 199.5q0 115 69.5 245t197.5 258q169 169 341.5 236t246.5 -7q65 -64 20 -209q-4 -14 -1 -20t10 -7t14.5 0.5t13.5 3.5l6 2q139 59 246 59t153 -61q45 -63 0 -178 q-2 -13 -4.5 -20t4.5 -12.5t12 -7.5t17 -6q57 -18 103 -47t80 -81.5t34 -116.5zM1489 1046q42 -47 54.5 -108.5t-6.5 -117.5q-8 
-23 -29.5 -34t-44.5 -4q-23 8 -34 29.5t-4 44.5q20 63 -24 111t-107 35q-24 -5 -45 8t-25 37q-5 24 8 44.5t37 25.5q60 13 119 -5.5t101 -65.5z M1670 1209q87 -96 112.5 -222.5t-13.5 -241.5q-9 -27 -34 -40t-52 -4t-40 34t-5 52q28 82 10 172t-80 158q-62 69 -148 95.5t-173 8.5q-28 -6 -52 9.5t-30 43.5t9.5 51.5t43.5 29.5q123 26 244 -11.5t208 -134.5z" /> +<glyph unicode="" d="M1133 -34q-171 -94 -368 -94q-196 0 -367 94q138 87 235.5 211t131.5 268q35 -144 132.5 -268t235.5 -211zM638 1394v-485q0 -252 -126.5 -459.5t-330.5 -306.5q-181 215 -181 495q0 187 83.5 349.5t229.5 269.5t325 137zM1536 638q0 -280 -181 -495 q-204 99 -330.5 306.5t-126.5 459.5v485q179 -30 325 -137t229.5 -269.5t83.5 -349.5z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1402 433q-32 -80 -76 -138t-91 -88.5t-99 -46.5t-101.5 -14.5t-96.5 8.5t-86.5 22t-69.5 27.5t-46 22.5l-17 10q-113 -228 -289.5 -359.5t-384.5 -132.5q-19 0 -32 13t-13 32t13 31.5t32 12.5q173 1 322.5 107.5t251.5 294.5q-36 -14 -72 -23t-83 -13t-91 2.5t-93 28.5 t-92 59t-84.5 100t-74.5 146q114 47 214 57t167.5 -7.5t124.5 -56.5t88.5 -77t56.5 -82q53 131 79 291q-7 -1 -18 -2.5t-46.5 -2.5t-69.5 0.5t-81.5 10t-88.5 23t-84 42.5t-75 65t-54.5 94.5t-28.5 127.5q70 28 133.5 36.5t112.5 -1t92 -30t73.5 -50t56 -61t42 -63t27.5 -56 t16 -39.5l4 -16q12 122 12 195q-8 6 -21.5 16t-49 44.5t-63.5 71.5t-54 93t-33 112.5t12 127t70 138.5q73 -25 127.5 -61.5t84.5 -76.5t48 -85t20.5 -89t-0.5 -85.5t-13 -76.5t-19 -62t-17 -42l-7 -15q1 -5 1 -50.5t-1 -71.5q3 7 10 18.5t30.5 43t50.5 58t71 55.5t91.5 44.5 t112 14.5t132.5 -24q-2 -78 -21.5 -141.5t-50 -104.5t-69.5 -71.5t-81.5 -45.5t-84.5 -24t-80 -9.5t-67.5 1t-46.5 4.5l-17 3q-23 -147 -73 -283q6 7 18 18.5t49.5 41t77.5 52.5t99.5 42t117.5 20t129 -23.5t137 -77.5z" /> +<glyph unicode="" horiz-adv-x="1280" d="M1259 283v-66q0 -85 -57.5 -144.5t-138.5 -59.5h-57l-260 -269v269h-529q-81 0 -138.5 59.5t-57.5 144.5v66h1238zM1259 609v-255h-1238v255h1238zM1259 937v-255h-1238v255h1238zM1259 1077v-67h-1238v67q0 84 57.5 143.5t138.5 59.5h846q81 0 138.5 -59.5t57.5 -143.5z " /> +<glyph 
unicode="" d="M1152 640q0 -14 -9 -23l-320 -320q-9 -9 -23 -9q-13 0 -22.5 9.5t-9.5 22.5v192h-352q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h352v192q0 14 9 23t23 9q12 0 24 -10l319 -319q9 -9 9 -23zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198 t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1152 736v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-352v-192q0 -14 -9 -23t-23 -9q-12 0 -24 10l-319 319q-9 9 -9 23t9 23l320 320q9 9 23 9q13 0 22.5 -9.5t9.5 -22.5v-192h352q13 0 22.5 -9.5t9.5 -22.5zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198 t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" d="M1024 960v-640q0 -26 -19 -45t-45 -19q-20 0 -37 12l-448 320q-27 19 -27 52t27 52l448 320q17 12 37 12q26 0 45 -19t19 -45zM1280 160v960q0 13 -9.5 22.5t-22.5 9.5h-960q-13 0 -22.5 -9.5t-9.5 -22.5v-960q0 -13 9.5 -22.5t22.5 -9.5h960q13 0 22.5 9.5t9.5 22.5z M1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" d="M1024 640q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75t75 -181zM768 1184q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273t-73 273t-198 198t-273 73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5 t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" /> +<glyph unicode="" horiz-adv-x="1664" d="M1023 349l102 -204q-58 -179 -210 -290t-339 -111q-156 0 -288.5 77.5t-210 210t-77.5 288.5q0 181 104.5 330t274.5 211l17 -131q-122 -54 -195 -165.5t-73 -244.5q0 -185 131.5 
-316.5t316.5 -131.5q126 0 232.5 65t165 175.5t49.5 236.5zM1571 249l58 -114l-256 -128 q-13 -7 -29 -7q-40 0 -57 35l-239 477h-472q-24 0 -42.5 16.5t-21.5 40.5l-96 779q-2 16 6 42q14 51 57 82.5t97 31.5q66 0 113 -47t47 -113q0 -69 -52 -117.5t-120 -41.5l37 -289h423v-128h-407l16 -128h455q40 0 57 -35l228 -455z" /> +<glyph unicode="" d="M1254 899q16 85 -21 132q-52 65 -187 45q-17 -3 -41 -12.5t-57.5 -30.5t-64.5 -48.5t-59.5 -70t-44.5 -91.5q80 7 113.5 -16t26.5 -99q-5 -52 -52 -143q-43 -78 -71 -99q-44 -32 -87 14q-23 24 -37.5 64.5t-19 73t-10 84t-8.5 71.5q-23 129 -34 164q-12 37 -35.5 69 t-50.5 40q-57 16 -127 -25q-54 -32 -136.5 -106t-122.5 -102v-7q16 -8 25.5 -26t21.5 -20q21 -3 54.5 8.5t58 10.5t41.5 -30q11 -18 18.5 -38.5t15 -48t12.5 -40.5q17 -46 53 -187q36 -146 57 -197q42 -99 103 -125q43 -12 85 -1.5t76 31.5q131 77 250 237 q104 139 172.5 292.5t82.5 226.5zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" /> +<glyph unicode="" horiz-adv-x="1152" d="M1152 704q0 -191 -94.5 -353t-256.5 -256.5t-353 -94.5h-160q-14 0 -23 9t-9 23v611l-215 -66q-3 -1 -9 -1q-10 0 -19 6q-13 10 -13 26v128q0 23 23 31l233 71v93l-215 -66q-3 -1 -9 -1q-10 0 -19 6q-13 10 -13 26v128q0 23 23 31l233 71v250q0 14 9 23t23 9h160 q14 0 23 -9t9 -23v-181l375 116q15 5 28 -5t13 -26v-128q0 -23 -23 -31l-393 -121v-93l375 116q15 5 28 -5t13 -26v-128q0 -23 -23 -31l-393 -121v-487q188 13 318 151t130 328q0 14 9 23t23 9h160q14 0 23 -9t9 -23z" /> +<glyph unicode="" horiz-adv-x="1408" d="M1152 736v-64q0 -14 -9 -23t-23 -9h-352v-352q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v352h-352q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h352v352q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-352h352q14 0 23 -9t9 -23zM1280 288v832q0 66 -47 113t-113 47h-832 q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113zM1408 1120v-832q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h832q119 0 203.5 -84.5t84.5 
-203.5z" /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="1792" /> +<glyph unicode="" horiz-adv-x="1792" /> +</font> +</defs></svg>
\ No newline at end of file diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.ttf b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.ttf Binary files differnew file mode 100644 index 00000000..e89738de --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.ttf diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.woff b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.woff Binary files differnew file mode 100644 index 00000000..8c1748aa --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/fonts/fontawesome-webfont.woff diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/js/theme.js b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/js/theme.js new file mode 100644 index 00000000..60520cc3 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/static/js/theme.js @@ -0,0 +1,47 @@ +$( document ).ready(function() { + // Shift nav in mobile when clicking the menu. + $(document).on('click', "[data-toggle='wy-nav-top']", function() { + $("[data-toggle='wy-nav-shift']").toggleClass("shift"); + $("[data-toggle='rst-versions']").toggleClass("shift"); + }); + // Close menu when you click a link. 
+ $(document).on('click', ".wy-menu-vertical .current ul li a", function() { + $("[data-toggle='wy-nav-shift']").removeClass("shift"); + $("[data-toggle='rst-versions']").toggleClass("shift"); + }); + $(document).on('click', "[data-toggle='rst-current-version']", function() { + $("[data-toggle='rst-versions']").toggleClass("shift-up"); + }); + // Make tables responsive + $("table.docutils:not(.field-list)").wrap("<div class='wy-table-responsive'></div>"); +}); + +window.SphinxRtdTheme = (function (jquery) { + var stickyNav = (function () { + var navBar, + win, + stickyNavCssClass = 'stickynav', + applyStickNav = function () { + if (navBar.height() <= win.height()) { + navBar.addClass(stickyNavCssClass); + } else { + navBar.removeClass(stickyNavCssClass); + } + }, + enable = function () { + applyStickNav(); + win.on('resize', applyStickNav); + }, + init = function () { + navBar = jquery('nav.wy-nav-side:first'); + win = jquery(window); + }; + jquery(init); + return { + enable : enable + }; + }()); + return { + StickyNav : stickyNav + }; +}($)); diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/theme.conf b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/theme.conf new file mode 100644 index 00000000..dcfbf8c2 --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/theme.conf @@ -0,0 +1,8 @@ +[theme] +inherit = basic +stylesheet = css/theme.css + +[options] +typekit_id = hiw1hhg +analytics_id = +sticky_navigation = False diff --git a/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/versions.html b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/versions.html new file mode 100644 index 00000000..8b3eb79d --- /dev/null +++ b/storage/maria/libmarias3/docs/_themes/sphinx_rtd_theme/versions.html @@ -0,0 +1,37 @@ +{% if READTHEDOCS %} +{# Add rst-badge after rst-versions for small badge style. 
#} + <div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions"> + <span class="rst-current-version" data-toggle="rst-current-version"> + <span class="fa fa-book"> Read the Docs</span> + v: {{ current_version }} + <span class="fa fa-caret-down"></span> + </span> + <div class="rst-other-versions"> + <dl> + <dt>Versions</dt> + {% for slug, url in versions %} + <dd><a href="{{ url }}">{{ slug }}</a></dd> + {% endfor %} + </dl> + <dl> + <dt>Downloads</dt> + {% for type, url in downloads %} + <dd><a href="{{ url }}">{{ type }}</a></dd> + {% endfor %} + </dl> + <dl> + <dt>On Read the Docs</dt> + <dd> + <a href="//{{ PRODUCTION_DOMAIN }}/projects/{{ slug }}/?fromdocs={{ slug }}">Project Home</a> + </dd> + <dd> + <a href="//{{ PRODUCTION_DOMAIN }}/builds/{{ slug }}/?fromdocs={{ slug }}">Builds</a> + </dd> + </dl> + <hr/> + Free document hosting provided by <a href="http://www.readthedocs.org">Read the Docs</a>. + + </div> + </div> +{% endif %} + diff --git a/storage/maria/libmarias3/docs/api/compiling.rst b/storage/maria/libmarias3/docs/api/compiling.rst new file mode 100644 index 00000000..99930a4c --- /dev/null +++ b/storage/maria/libmarias3/docs/api/compiling.rst @@ -0,0 +1,37 @@ +Compiling Your Application +========================== + +Include Files +------------- + +Make sure that your application includes the main libMariaS3 include as follows: + +.. code-block:: c + + #include <libmarias3/marias3.h> + +This will pull in all the libMariaS3 functions and constants you may require for your application. + +Package Config +-------------- + +libMariaS3 includes a utility called ``libmarias3-config``. This can give you all the options used to compile the library as well as the compiler options to link the library. For a full list of what it provides, run: + +.. 
code-block:: bash + + libmarias3-config --help + +Compiling +--------- + +If the library is installed correctly in your Linux distribution, compiling your application with libMariaS3 should be a simple matter of adding the library to link to as follows: + +.. code-block:: bash + + gcc -o basic basic.c -lmarias3 + +And likewise for Clang: + +.. code-block:: bash + + clang -o basic basic.c -lmarias3 diff --git a/storage/maria/libmarias3/docs/api/errors.rst b/storage/maria/libmarias3/docs/api/errors.rst new file mode 100644 index 00000000..80c84372 --- /dev/null +++ b/storage/maria/libmarias3/docs/api/errors.rst @@ -0,0 +1,30 @@ +Error Codes +=========== + ++------------------------+------------------------------------------------------+ +| Code | Details | ++========================+======================================================+ +| MS3_ERR_NONE | Success (always equal to ``0``) | ++------------------------+------------------------------------------------------+ +| MS3_ERR_PARAMETER | A required function parameter is missing | ++------------------------+------------------------------------------------------+ +| MS3_ERR_NO_DATA | No data is supplied to a function that requires data | ++------------------------+------------------------------------------------------+ +| MS3_ERR_URI_TOO_LONG | The generated URI for the request is too long | ++------------------------+------------------------------------------------------+ +| MS3_ERR_RESPONSE_PARSE | The API could not parse the response from S3 | ++------------------------+------------------------------------------------------+ +| MS3_ERR_REQUEST_ERROR | The API could not send the request to S3 | ++------------------------+------------------------------------------------------+ +| MS3_ERR_OOM | Could not allocate required memory | ++------------------------+------------------------------------------------------+ +| MS3_ERR_IMPOSSIBLE | A theoretically impossible condition occurred | 
++------------------------+------------------------------------------------------+ +| MS3_ERR_AUTH | Authentication failed | ++------------------------+------------------------------------------------------+ +| MS3_ERR_NOT_FOUND | Object not found | ++------------------------+------------------------------------------------------+ +| MS3_ERR_SERVER | Unknown error code in S3 response | ++------------------------+------------------------------------------------------+ +| MS3_ERR_TOO_BIG | PUT data is too large, 4GB maximum | ++------------------------+------------------------------------------------------+ diff --git a/storage/maria/libmarias3/docs/api/functions.rst b/storage/maria/libmarias3/docs/api/functions.rst new file mode 100644 index 00000000..b30fac92 --- /dev/null +++ b/storage/maria/libmarias3/docs/api/functions.rst @@ -0,0 +1,393 @@ +Functions +========= + +ms3_library_init() +------------------ + +.. c:function:: void ms3_library_init(void) + + Initializes the library for use. + Should be called before any threads are spawned. + +ms3_library_deinit() +-------------------- + +.. c:function:: void ms3_library_deinit(void) + + Cleans up the library, typically for the end of the application's execution. + +ms3_library_init_malloc() +------------------------- + +.. c:function:: uint8_t ms3_library_init_malloc(ms3_malloc_callback m, ms3_free_callback f, ms3_realloc_callback r, ms3_strdup_callback s, ms3_calloc_callback c) + + Initialize the library for use with custom allocator replacement functions. These functions are also fed into libcurl. The function prototypes should be as follows: + + .. c:function:: void *ms3_malloc_callback(size_t size) + + To replace ``malloc()``. + + .. c:function:: void ms3_free_callback(void *ptr) + + To replace ``free()``. + + .. c:function:: void *ms3_realloc_callback(void *ptr, size_t size) + + To replace ``realloc()``. + + .. c:function:: char *ms3_strdup_callback(const char *str) + + To replace ``strdup()``. + + .. 
c:function:: void *ms3_calloc_callback(size_t nmemb, size_t size) + + To replace ``calloc()``. + + Should be called before any threads are spawned. All parameters are required or the function *will* fail. + + Remember: With great power comes great responsibility. + + :param m: The malloc callback + :param f: The free callback + :param r: The realloc callback + :param s: The strdup callback + :param c: The calloc callback + :returns: ``0`` on success, ``MS3_ERR_PARAMETER`` if a parameter is ``NULL`` + +ms3_init() +---------- + +.. c:function:: ms3_st *ms3_init(const char *s3key, const char *s3secret, const char *region, const char *base_domain) + + Initializes a :c:type:`ms3_st` object. This object should only be used in + the thread that created it because it reuses connections. But it is safe to + have other :c:type:`ms3_st` objects running at the same time in other threads. + + .. note:: + You *MUST* call :c:func:`ms3_library_init` before + spawning threads when using this access method. + + :param s3key: The AWS access key + :param s3secret: The AWS secret key + :param region: The AWS region to use (such as ``us-east-1``) + :param base_domain: A domain name to use if AWS S3 is not the desired server (set to ``NULL`` for S3) + :returns: A newly allocated marias3 object + +ms3_deinit() +------------ + +.. c:function:: void ms3_deinit(ms3_st *ms3) + + Cleans up and frees a :c:type:`ms3_st` object. + + :param ms3: The marias3 object + +ms3_server_error() +------------------ + +.. c:function:: const char *ms3_server_error(ms3_st *ms3) + + Returns the last error message from the S3 server or underlying Curl library. + + :param ms3: The marias3 object + :returns: The error message string or ``NULL`` if there is no message. + +ms3_error() +----------- + +.. c:function:: const char *ms3_error(uint8_t errcode) + + Returns an error message for a given error code + + :param errcode: The error code to translate + :returns: The error message + +ms3_debug() +----------- + +.. 
c:function:: void ms3_debug() + + Enables and disables debugging output on stderr. Each call toggles enable / disable. + + .. note:: + This enables/disables globally for the library + +ms3_list() +---------- + +.. c:function:: uint8_t ms3_list(ms3_st *ms3, const char *bucket, const char *prefix, ms3_list_st **list) + + Retrieves a list of files from a given S3 bucket and fills it into a :c:type:`ms3_list_st`. + + The list generated is the equivalent of a recursive directory listing but only has files in it, no entries for directories. + + The list will automatically be freed on the next list/list_dir call or :c:func:`ms3_deinit` + + :param ms3: The marias3 object + :param bucket: The bucket name to use + :param prefix: An optional path/file prefix to use (``NULL`` for all files) + :param list: A pointer to a pointer that will contain the returned list + :returns: ``0`` on success, a positive integer on failure + +Example +^^^^^^^ + +.. code-block:: c + + char *s3key= getenv("S3KEY"); + char *s3secret= getenv("S3SECRET"); + char *s3region= getenv("S3REGION"); + char *s3bucket= getenv("S3BUCKET"); + ms3_list_st *list= NULL, *list_it= NULL; + uint8_t res; + + ms3_library_init(); + ms3_st *ms3= ms3_init(s3key, s3secret, s3region, NULL); + + res= ms3_list(ms3, s3bucket, NULL, &list); + if (res) + { + printf("Error occurred: %d\n", res); + return; + } + list_it= list; + while(list_it) + { + printf("File: %s, size: %ld, tstamp: %ld\n", list_it->key, list_it->length, list_it->created); + list_it= list_it->next; + } + ms3_deinit(ms3); + +ms3_list_dir() +-------------- + +.. c:function:: uint8_t ms3_list_dir(ms3_st *ms3, const char *bucket, const char *prefix, ms3_list_st **list) + + Retrieves a list of files from a given S3 bucket and fills it into a :c:type:`ms3_list_st`. + + The list generated will automatically add the delimiter ``/`` and therefore filter up to the first ``/`` after the prefix. Unlike :c:func:`ms3_list` it includes directory entries. 
This is the equivalent of doing a regular directory listing in the current directory (as designated by ``prefix``). + + The list will automatically be freed on the next list/list_dir call or :c:func:`ms3_deinit` + + :param ms3: The marias3 object + :param bucket: The bucket name to use + :param prefix: An optional path/file prefix to use (``NULL`` for all files) + :param list: A pointer to a pointer that will contain the returned list + :returns: ``0`` on success, a positive integer on failure + + +ms3_list_free() +--------------- + +.. c:function:: void ms3_list_free(ms3_list_st *list) + + .. deprecated:: 3.1.1 + Now a NULL operation which will be removed in 4.0 + + A NULL operation. Previously this freed the list returned by :c:func:`ms3_list`, but this is now done internally on :c:func:`ms3_deinit` or when a new list is requested. + + :param list: The list to free + +ms3_put() +--------- + +.. c:function:: uint8_t ms3_put(ms3_st *ms3, const char *bucket, const char *key, const uint8_t *data, size_t length) + + Puts binary data from a given pointer into S3 at a given key/filename. If an existing key/file exists with the same name this will be overwritten. + + :param ms3: The marias3 object + :param bucket: The bucket name to use + :param key: The key/filename to create/overwrite + :param data: A pointer to the data to write + :param length: The length of the data to write + :returns: ``0`` on success, a positive integer on failure + +Example +^^^^^^^ + +.. code-block:: c + + char *s3key= getenv("S3KEY"); + char *s3secret= getenv("S3SECRET"); + char *s3region= getenv("S3REGION"); + char *s3bucket= getenv("S3BUCKET"); + uint8_t res; + const char *test_string= "Another one bites the dust"; + + ms3_library_init(); + ms3_st *ms3= ms3_init(s3key, s3secret, s3region, NULL); + + res= ms3_put(ms3, s3bucket, "test/ms3.txt", (const uint8_t*)test_string, strlen(test_string)); + if (res) + { + printf("Error occurred: %d\n", res); + return; + } + ms3_deinit(ms3); + + +ms3_copy() +---------- + +.. 
c:function:: uint8_t ms3_copy(ms3_st *ms3, const char *source_bucket, const char *source_key, const char *dest_bucket, const char *dest_key) + + S3 internally copies an object from a source bucket and key to a destination bucket and key. + + :param ms3: The marias3 object + :param source_bucket: The bucket where the source object is + :param source_key: The key/filename of the source object + :param dest_bucket: The destination bucket (can be the same as source) + :param dest_key: The destination key/filename + :returns: ``0`` on success, a positive integer on failure + +ms3_move() +---------- + +.. c:function:: uint8_t ms3_move(ms3_st *ms3, const char *source_bucket, const char *source_key, const char *dest_bucket, const char *dest_key) + + Moves an object from source to destination. Internally the library performs a copy and if successful performs a delete on the source object. + + :param ms3: The marias3 object + :param source_bucket: The bucket where the source object is + :param source_key: The key/filename of the source object + :param dest_bucket: The destination bucket (can be the same as source) + :param dest_key: The destination key/filename + :returns: ``0`` on success, a positive integer on failure + +ms3_get() +--------- + +.. c:function:: uint8_t ms3_get(ms3_st *ms3, const char *bucket, const char *key, uint8_t **data, size_t *length) + + Retrieves a given object from S3. + + .. Note:: + The application is expected to free the resulting data pointer after use, using :c:func:`ms3_free` + + :param ms3: The marias3 object + :param bucket: The bucket name to use + :param key: The key/filename to retrieve + :param data: A pointer to a pointer that the data will be retrieved into + :param length: A pointer to the data length + :returns: ``0`` on success, a positive integer on failure + +Example +^^^^^^^ + +.. 
code-block:: c + + char *s3key= getenv("S3KEY"); + char *s3secret= getenv("S3SECRET"); + char *s3region= getenv("S3REGION"); + char *s3bucket= getenv("S3BUCKET"); + uint8_t res; + uint8_t *data= NULL; + size_t length; + + ms3_library_init(); + ms3_st *ms3= ms3_init(s3key, s3secret, s3region, NULL); + + res= ms3_get(ms3, s3bucket, "test/ms3.txt", &data, &length); + if (res) + { + printf("Error occurred: %d\n", res); + return; + } + printf("File contents: %s\n", data); + printf("File length: %ld\n", length); + ms3_free(data); + ms3_deinit(ms3); + +ms3_free() +---------- + +.. c:function:: void ms3_free(uint8_t *data) + + Used to free the data allocated by :c:func:`ms3_get`. + + :param data: The data to free + +ms3_set_option() +---------------- + +.. c:function:: uint8_t ms3_set_option(ms3_st *ms3, ms3_set_option_t option, void *value) + + Sets a given connection option. See :c:type:`ms3_set_option_t` for a list of options. + + :param ms3: The marias3 object + :param option: The option to set + :param value: A pointer to the value for the option (if required, ``NULL`` if not) + :returns: ``0`` on success, a positive integer on failure + +ms3_delete() +------------ + +.. c:function:: uint8_t ms3_delete(ms3_st *ms3, const char *bucket, const char *key) + + Deletes an object from an S3 bucket + + :param ms3: The marias3 object + :param bucket: The bucket name to use + :param key: The key/filename to delete + :returns: ``0`` on success, a positive integer on failure + +Example +^^^^^^^ + +.. code-block:: c + + char *s3key= getenv("S3KEY"); + char *s3secret= getenv("S3SECRET"); + char *s3region= getenv("S3REGION"); + char *s3bucket= getenv("S3BUCKET"); + uint8_t res; + + ms3_library_init(); + ms3_st *ms3= ms3_init(s3key, s3secret, s3region, NULL); + + res = ms3_delete(ms3, s3bucket, "test/ms3.txt"); + if (res) + { + printf("Error occurred: %d\n", res); + return; + } + ms3_deinit(ms3); + +ms3_status() +------------ + +.. 
c:function:: uint8_t ms3_status(ms3_st *ms3, const char *bucket, const char *key, ms3_status_st *status) + + Retrieves the status of a given filename/key into a :c:type:`ms3_status_st` object. Will return an error if not found. + + :param ms3: The marias3 object + :param bucket: The bucket name to use + :param key: The key/filename to status check + :param status: A status object to fill + :returns: ``0`` on success, a positive integer on failure + +Example +^^^^^^^ + +.. code-block:: c + + char *s3key= getenv("S3KEY"); + char *s3secret= getenv("S3SECRET"); + char *s3region= getenv("S3REGION"); + char *s3bucket= getenv("S3BUCKET"); + uint8_t res; + ms3_status_st status; + + ms3_library_init(); + ms3_st *ms3= ms3_init(s3key, s3secret, s3region, NULL); + + res= ms3_status(ms3, s3bucket, "test/ms3.txt", &status); + if (res) + { + printf("Error occurred: %d\n", res); + return; + } + printf("File length: %ld\n", status.length); + printf("File timestamp: %ld\n", status.created); + ms3_deinit(ms3); + diff --git a/storage/maria/libmarias3/docs/api/types.rst b/storage/maria/libmarias3/docs/api/types.rst new file mode 100644 index 00000000..eba57466 --- /dev/null +++ b/storage/maria/libmarias3/docs/api/types.rst @@ -0,0 +1,67 @@ +Structs +======= + +.. c:type:: ms3_st + + An internal struct which contains authentication information + +.. c:type:: ms3_list_st + + A linked-list struct which contains a list of files/keys and information about them + + .. c:member:: char *key + + The key/filename for the object + + .. c:member:: size_t length + + The data size for the object + + .. c:member:: time_t created + + The created / updated timestamp for the object + + .. c:member:: struct ms3_list_st *next + + A pointer to the next struct in the list + +.. c:type:: ms3_status_st + + A struct which contains the status of an object + + .. c:member:: size_t length + + The data size for the object + + .. 
c:member:: time_t created + + The created / updated timestamp for the object + +Constants +========= + +.. c:type:: ms3_set_option_t + + Options to use for :c:func:`ms3_set_option`. Possible values: + + * ``MS3_OPT_USE_HTTP`` - Use ``http://`` instead of ``https://``. The ``value`` parameter of :c:func:`ms3_set_option` is unused and each call to this toggles the flag (HTTPS is used by default) + * ``MS3_OPT_DISABLE_SSL_VERIFY`` - Disable SSL verification. The ``value`` parameter of :c:func:`ms3_set_option` is unused and each call to this toggles the flag (SSL verification is on by default) + * ``MS3_OPT_BUFFER_CHUNK_SIZE`` - Set the chunk size in bytes for the receive buffer. Default is 1MB. If you are receiving a large file a realloc will have to happen every time the buffer is full. For performance reasons you may want to increase the size of this buffer to reduce the reallocs and associated memory copies. The ``value`` parameter of :c:func:`ms3_set_option` should be a pointer to a :c:type:`size_t` greater than 1. + * ``MS3_OPT_FORCE_LIST_VERSION`` - An internal option for the regression suite only. The ``value`` parameter of :c:func:`ms3_set_option` should be a pointer to a :c:type:`uint8_t` of value ``1`` or ``2`` + * ``MS3_OPT_FORCE_PROTOCOL_VERSION`` - Set to 1 to force talking to the S3 server using version 1 of the List Bucket API, this is for S3 compatible servers. Set to 2 to force talking to the S3 server using version 2 of the List Bucket API. This is for use when the autodetect based on providing a base_domain does the wrong thing. The ``value`` parameter of :c:func:`ms3_set_option` should be a pointer to a :c:type:`uint8_t` of value ``1`` or ``2`` + +Built-In Types +============== + +.. c:type:: NULL + + A null pointer as defined in the standard header ``string.h``. + +.. c:type:: uint8_t + + An unsigned single byte character as defined in the standard header ``stdint.h`` + +.. 
c:type:: bool + + A boolean type as defined in the standard header ``stdbool.h`` + diff --git a/storage/maria/libmarias3/docs/appendix/credits.rst b/storage/maria/libmarias3/docs/appendix/credits.rst new file mode 100644 index 00000000..c72a7f38 --- /dev/null +++ b/storage/maria/libmarias3/docs/appendix/credits.rst @@ -0,0 +1,15 @@ +Credits +======= + +The libMariaS3 authors are: + +* `Andrew (LinuxJedi) Hutchings <mailto:linuxjedi@mariadb.com>`_ +* `Sergei Golubchik <mailto:sergei@mariadb.com>`_ +* `Markus Mäkelä <markus.makela@mariadb.com>`_ + +libMariaS3 uses the following Open Source projects: + +* `libcurl <https://curl.haxx.se/>`_ +* `xml.c <https://github.com/ooxi/xml.c/>`_ +* `DDM4 <https://github.com/TangentOrg/ddm4>`_ +* `Jouni Malinen's SHA256 hash code <j@w1.fi>`_ diff --git a/storage/maria/libmarias3/docs/appendix/version_history.rst b/storage/maria/libmarias3/docs/appendix/version_history.rst new file mode 100644 index 00000000..64ec2567 --- /dev/null +++ b/storage/maria/libmarias3/docs/appendix/version_history.rst @@ -0,0 +1,166 @@ +Version History +=============== + +Version 3.1 +----------- + +Version 3.1.3 GA +^^^^^^^^^^^^^^^^ + +* Fix :c:func:`ms3_copy` not working correctly with non-alphanumeric characters (also affected :c:func:`ms3_move`) + +Version 3.1.2 GA +^^^^^^^^^^^^^^^^ + +* Make library work with quirks in Google Cloud's S3 implementation +* Detect when libcurl was built with OpenSSL < 1.1.0 and add workaround to thread safety issues in the older OpenSSL versions (affects Ubuntu 16.04 in particular) +* Remove libxml and replace it with a modified version of `xml.c <https://github.com/ooxi/xml.c>`_ which handles <? ?> tags and other minor changes +* Fix issue where an empty key for :c:func:`ms3_get` turns it into a list call +* Partially fix issue with ``AC_MSG_ERROR``. Will still fail if you don't have ``libtool`` and ``pkg-config`` installed. 
+ +Version 3.1.1 GA (2019-06-28) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Fix bad host header when path based buckets are used +* Make autodetection of access type and list version *much* smarter: + + * Checks for S3 domain in provided domain and uses list version 2 + * Checks for IP provided domain and turns on list version 1 and path based buckets + * Any other domain uses list version one and domain based buckets + +* Reduced linked list mallocs for :c:func:`ms3_list` and :c:func:`ms3_list_dir`. This also deprecates :c:func:`ms3_list_free`. + +Version 3.1.0 GA (2019-06-24) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Fix compiling issues when ``-Wdeclaration-after-statement`` is enabled +* Add ``MS3_OPT_FORCE_PROTOCOL_VERSION`` for use with :c:func:`ms3_set_option` which will force use of AWS S3 methods and paths (version 2) or compatible methods and paths (version 1) +* Fix double-free upon certain errors +* Add snowman UTF-8 test and minor cleanups +* Cleanup build system + +Version 3.0 +----------- + +Version 3.0.2 GA (2019-05-24) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Fix libm linkage +* Remove mhash dependency and use a modified cut-down version of wpa_supplicant's BSD licensed crypto code (required for Windows compiling) +* Several minor performance optimizations + + * Removed 2x1kb mallocs on every request (now on :c:func:`ms3_init` instead) + * Compiling with ``-O3`` by default + * Stop executing string compares in list loop when something is found + * Remove unneeded ``strdup()`` usage + +Version 3.0.1 GA (2019-05-16) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Improve performance of PUT +* Fix a few potential pointer arithmetic issues +* Fix race condition on time generation +* Added TSAN to ci-scripts +* Fix minor issues found in cppcheck +* Stop buffer overrun if the buffer chunk size is set smaller than packet +* Fix :c:func:`ms3_get` returning random data if a CURL request completely fails +* Fix potential crash if the server error message is junk +* Fix double-free if a 
server error message is ``NULL`` + +Version 3.0.0 GA (2019-05-13) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Allow compiling to work with gnu89 compiler mode +* Fix building in CLang +* Removed previous deprecated ``ms3_thread_init`` and ``ms3_buffer_chunk_size`` +* Remove ``bool`` from frontend API by: + + * Making :c:func:`ms3_debug` a toggle + * Making the boolean options of :c:func:`ms3_set_option` toggles + +Version 2.3 +----------- + +Version 2.3.0 GA (2019-05-07) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Allow compiling with a C++ compiler +* Fix logic error in :c:func:`ms3_move` +* Stop :c:func:`ms3_get` returning the error message as the object data on error +* Add :c:func:`ms3_list_dir` to get a non-recursive directory listing +* Setting the buffer chunk size using ``ms3_buffer_chunk_size`` or :c:func:`ms3_set_option` no longer has a lower limit of 1MB + +Version 2.2 +----------- + +Version 2.2.0 GA (2019-04-23) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Add :c:func:`ms3_init` to replace ``ms3_thread_init`` and deprecate the latter. 
+* Add :c:func:`ms3_library_init_malloc` to add custom allocators +* Add :c:func:`ms3_library_deinit` to cleanup +* Add :c:func:`ms3_copy` and :c:func:`ms3_move` to use S3's internal file copy + +Version 2.1 +----------- + +Version 2.1.1 GA (2019-04-02) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Remove iso646.h support in codebase +* Autoswitch to bucket path instead of bucket domain access method (for IP urls) +* Fixed issue with SSL disabled verification +* Fixed minor leak when base_domain is set +* Add ``S3NOVERIFY`` env var to tests which will disable SSL verification when set to ``1`` + +Version 2.1.0 GA (2019-03-29) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Add :c:func:`ms3_set_option` to set various connection options +* Deprecated ``ms3_buffer_chunk_size``, use :c:func:`ms3_set_option` instead +* Added options to use ``http`` instead of ``https`` and to disable SSL verification +* Added debugging output for server/curl error messages +* Added compatibility for V1 bucket list API. Will turn on automatically for non-Amazon S3 compatible servers. 
Additionally an option has been created to force V1 or V2 + +Version 2.0 +----------- + +Version 2.0.0 GA (2019-03-28) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Fix double-free when using ``ms3_thread_init`` and an error occurs +* Fix error when a PUT >= 65535 is attempted +* Improve performance of GET for large files +* Make ``ms3_thread_init`` treat empty string base_domain as ``NULL`` +* Add :c:func:`ms3_free` +* Add ``ms3_buffer_chunk_size`` +* Cleanup linking +* Removed ``ms3_init`` +* Added :c:func:`ms3_server_error` to get the last server or Curl error + +Version 1.1 +----------- + +Version 1.1.0 GA (2019-03-27) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Fix memory leak in libxml2 function usage +* Fix memory leaks in libcurl usage +* Fix test collisions causing failures +* Added :c:func:`ms3_library_init` and ``ms3_thread_init`` for higher-performance accesses + +Version 1.0 +----------- + +Version 1.0.1 RC (2019-03-26) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Fixed issues found with valgrind, cppcheck and scanbuild +* Added RPM & DEB build systems +* Fixed pagination calls for :c:func:`ms3_list` so it supports > 1000 objects +* Made ``ms3_init()`` thread safe + +Version 1.0.0 Beta (2019-03-25) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Initial Beta version diff --git a/storage/maria/libmarias3/docs/conf.py b/storage/maria/libmarias3/docs/conf.py new file mode 100644 index 00000000..f1ce490f --- /dev/null +++ b/storage/maria/libmarias3/docs/conf.py @@ -0,0 +1,235 @@ +# -*- coding: utf-8 -*- +# +# DDm4 documentation build configuration file, created by +# sphinx-quickstart on Sun Mar 6 12:05:53 2011. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. 
+ +import sys, os +from subprocess import Popen, PIPE + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +needs_sphinx = '1.1' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [] + +#extensions = ['sphinxcontrib.googleanalytics'] + +# Google +#googleanalytics_id = '' +#googleanalytics_enabled = 'False' + +# Add any paths that contain templates here, relative to this directory. +#templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'libmarias3' +copyright = u'2019 MariaDB Corporation Ab' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version_file = open("../VERSION.txt", "r") +version = version_file.read().strip() +# The full version, including alpha/beta/rc tags. +get_rev= Popen("git rev-parse --short HEAD", shell=True, stdout=PIPE).stdout.read().strip() +release = version + '-' + get_rev.decode("utf-8") + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. 
+# language = en + +primary_domain='c' + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'colorful' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +on_rtd = os.environ.get('READTHEDOCS', None) == 'True' + +if not on_rtd: + html_theme = 'sphinx_rtd_theme' + html_theme_path = ["_themes"] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. 
+#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +html_favicon = '' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +html_show_sourcelink = False + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). 
+#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'libmarias3' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'libmarias3.tex', u'libmarias3 Documentation', + u'Andrew Hutchings', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples + +# (source start file, name, description, authors, manual section). 
+#man_show_urls=True +#man_pages = [ +# ('bin/cxxflags', 'cxxflags', u'DDM4, http://tangent.org/', [u'Data Differential http://www.datadifferential.com/'], 1), +# ('bin/cflags', 'cflags', u'DDM4, http://tangent.org/', [u'Data Differential http://www.datadifferential.com/'], 1), +# ] diff --git a/storage/maria/libmarias3/docs/contributors/coding_standards.rst b/storage/maria/libmarias3/docs/contributors/coding_standards.rst new file mode 100644 index 00000000..a3242ffd --- /dev/null +++ b/storage/maria/libmarias3/docs/contributors/coding_standards.rst @@ -0,0 +1,112 @@ +Coding Standard +=============== + +General +------- + +We are aiming for a minimum of C99 support. A script in ``extra`` can be found called ``astyle.sh``. This uses the Linux tool `Artistic Style <http://astyle.sourceforge.net/>`_ to enforce coding standards. + +Coding Style +------------ + +Everyone has a preferred coding style, there is no real correct style. What is important is that we stick to one style throughout the code. + +We should use a variant of the `Allman coding style <http://en.wikipedia.org/wiki/Indent_style#Allman_style>`_. The variation is to use 2 spaces instead of tabs. The exception to the rule is Makefiles where space indentation can break them. + +Allman style specifies that braces associated with a statement should be on the following line with the same indentation and the statements inside the braces are next level indented. The closing braces are also on a new line at the same indentation as the original statement. + +For example: + +.. code-block:: cpp + + while (x == y) + { + something(); + somethingelse(); + } + finalthing(); + + +Types +----- + +Use C99 types (where possible), this will very much help us to find conversion bugs. So: + +* Use bool, not my_bool. +* Use true and false, not TRUE and FALSE (those macros need to die). 
+* ulong → uint32_t +* ulonglong → uint64_t +* long int → int32_t + +The keyword :c:type:`NULL` should always be used when referring to the pointer NULL + +Allocation +---------- + +For performance reasons we should try to limit the number of times we allocate and deallocate memory. Do not do thousands of allocates and deallocates to save 32k of RAM. + +Naming style +------------ + +Variable names +^^^^^^^^^^^^^^ + +Variables should be verbosely named, with no caps and with underscores in place of spaces. Do not just use ``i`` in for loops, again we have developers with bad eyes. + +Types +^^^^^ + +New types should use the ``_t`` postfix. Private structs should be typedef'ed and also use this. + +Public Structs +^^^^^^^^^^^^^^ + +Public structs should be typedef'ed and use the ``_st`` postfix + +Conventions +^^^^^^^^^^^ + +* use *column* instead of *field* +* use *schema* instead of *database* + +Include Files +------------- + +Includes that will be installed need to be written like: + +.. code-block:: cpp + + #include <drizzled/field/blob.h> + + +The following should only be used in cases where we are to never install these libraries in the filesystem: + +.. code-block:: cpp + + #include "item.h" + +Comments +-------- + +Add comments where it is not obvious what is going on. Hopefully most of the code will be self-commenting. + +All code should have license headers. + +Comment blocks should use the format: + +.. code-block:: cpp + + /* Comment Block + * This is a multi-line comment block + */ + +C99 style in-line and single line comments are allowed for small comments + +.. code-block:: cpp + + // small comment + +Line lengths +------------ + +Whilst there is no hard limit on line lengths it is recommended that lines stay under 80 characters unless going above this increases readability of the code. 
diff --git a/storage/maria/libmarias3/docs/contributors/docs.rst b/storage/maria/libmarias3/docs/contributors/docs.rst new file mode 100644 index 00000000..47900a21 --- /dev/null +++ b/storage/maria/libmarias3/docs/contributors/docs.rst @@ -0,0 +1,45 @@ +Updating Documentation +====================== + +Overview +-------- + +This documentation is stored along with the source in the ``docs`` directory of the git tree and uses the `reStructuredText format <http://en.wikipedia.org/wiki/ReStructuredText>`_. + +We recommend reading this `reStructuredText Primer <http://sphinx-doc.org/rest.html>`_ before editing the docs for the first time. + +Compiling Docs +-------------- + +The docs are compiled using `Sphinx Python Documentation Generator <http://sphinx-doc.org/>`_. The libMariaS3 build system already knows how to use this. To compile the docs please follow these steps: + +#. Install the ``python-sphinx`` package using your distribution's package manager + +#. Re-run bootstrap as follows so that it picks up that Sphinx is installed:: + + ./bootstrap.sh -m + +#. To compile in HTML format:: + + make html + +There will now be an HTML version of the docs in the ``/html`` directory of the source. + +Compiling PDF Docs +------------------ + +Sphinx requires LaTeX to build PDF docs. The following steps show you how to build PDF docs: + +#. Install ``python-sphinx`` as above + +#. Install the full *TeXLive* package. In Fedora this is ``texlive-scheme-full`` and ``texlive-full`` in Ubuntu + +#. Re-run bootstrap as follows so that it picks up that Sphinx and LaTeX are installed:: + + ./bootstrap.sh -m + +#. To compile in PDF format:: + + make latexpdf + +The generated PDF will be in the ``/docs/latex/`` directory. 
diff --git a/storage/maria/libmarias3/docs/contributors/github.rst b/storage/maria/libmarias3/docs/contributors/github.rst new file mode 100644 index 00000000..11c91e64 --- /dev/null +++ b/storage/maria/libmarias3/docs/contributors/github.rst @@ -0,0 +1,113 @@ +Using GitHub +============ + +GitHub contributions typically work by creating a fork of the project on your user account, making a branch on that fork to work on and then filing a pull request to upstream your code. This is how you would go about it. + +Forking +------- + +Go to the `libMariaS3 GitHub page <https://github.com/mariadb-corporation/libmarias3>`_ and click the *Fork* button near the top. Once you have forked you can get a local copy of this fork to work on (where *user* is your username): + +.. code-block:: bash + + git clone https://github.com/user/libmarias3.git + +You then need to make your local clone aware of the upstream repository: + +.. code-block:: bash + + cd libmarias3 + git remote add upstream https://github.com/mariadb-corporation/libmarias3.git + +Branch +------ + +Before creating a branch to work on you should first make sure your local copy is up to date: + +.. code-block:: bash + + git checkout master + git pull --ff-only upstream master + git push + +You can then create a branch from master to work on: + +.. code-block:: bash + + git checkout -b a_new_feature + +Hack on code! +------------- + +Hack away at your feature or bug. + +Test +---- + +Once your code is ready the test suite should be run locally: + +.. code-block:: bash + + make + make check + +If there are documentation changes you should install ``python-sphinx`` and try to build the HTML version to run a syntax check: + +.. code-block:: bash + + make html + +Commit and push +--------------- + +If you have never contributed to GitHub before then you need to setup git so that it knows you for the commit message: + +.. 
code-block:: bash + + git config --global user.name "Real Name" + git config --global user.email "me@me.com" + +Make sure you use `git add` to add any new files to the repository and then commit: + +.. code-block:: bash + + git commit -a + +Your editor will pop up to enter a commit message above the comments. The first line should be no more than 50 characters and be a subject of the commit. The second line should be blank. The third line onwards can contain details and these should be no more than 72 characters each. + +If your commit fixes an issue you can add the following (for issue #93):: + + Fixes mariadb-corporation/libmarias3#93 + +Once all your commits are done a quick rebase may be needed to make sure your changes will merge OK with what is in master: + +.. code-block:: bash + + git fetch upstream + git rebase -i upstream/master + +This should bring up a commit-style message in the editor with *pick* as the first word. Save this and the rebase will complete. If the rebase tells you there is a conflict you will need to locate the problem using ``git diff``, fix it and do: + +.. code-block:: bash + + git add <filename> + git rebase --continue + +If things look like they are going wrong you can undo the rebase using the following and can get in touch with us: + +.. code-block:: bash + + git rebase --abort + +You should now be ready to push up to GitHub: + +.. code-block:: bash + + git push --set-upstream origin a_new_feature + +If you go to your repository on GitHub's website you will see an option to file a *Pull Request*. Use this to submit a pull request upstream for your branch. + +Help +---- + +If you get stuck at any point feel free to reach out to us by filing an issue on GitHub.
diff --git a/storage/maria/libmarias3/docs/contributors/introduction.rst b/storage/maria/libmarias3/docs/contributors/introduction.rst new file mode 100644 index 00000000..6e3d0164 --- /dev/null +++ b/storage/maria/libmarias3/docs/contributors/introduction.rst @@ -0,0 +1,58 @@ +Introduction to Contributing +============================ + +There are many ways to contribute to libMariaS3. Simply using it and creating an issue report when you found a bug or have a suggestion is a great contribution. Documentation and code contributions are also greatly appreciated. + +Layout +------ + +The code for libMariaS3 is in several parts: + ++--------------------+-------------------------------+ +| Directory | Contents | ++====================+===============================+ +| ``/src`` | The API source code | ++--------------------+-------------------------------+ +| ``/libmarias3`` | The public API headers | ++--------------------+-------------------------------+ +| ``/tests`` | Unit tests for the public API | ++--------------------+-------------------------------+ + +In each case if any files are added or removed the ``include.am`` file in that directory will require updating to reflect the change. + +Submitting to GitHub +-------------------- + +The main hub for the code is `GitHub <https://github.com/>`_. The main tree is the `libMariaS3 GitHub tree <https://github.com/mariadb-corporation/libmarias3>`_. Anyone is welcome to submit pull requests or issues. All requests will be considered and appropriate feedback given. + +Modifying the Build System +-------------------------- + +The build system is an m4 template system called `DDM4 <https://github.com/BrianAker/DDM4>`_. If any changes are made to the scripts in ``m4`` directory the *serial* line will need incrementing in that file. You should look for a line near the top that looks like: + +..
code-block:: makefile + + #serial 7 + +Shared Library Version +^^^^^^^^^^^^^^^^^^^^^^ + +If any of the source code has changed please see ``LIBMARIAS3_LIBRARY_VERSION`` in ``configure.ac``. This gives rules on bumping the shared library versioning, not to be confused with the API public version which follows similar rules as described in the next section. + +API Version +----------- + +API versioning is stored in the ``VERSION.txt`` file which is used by the build system to version the API and docs. The versioning scheme follows the `Semantic Versioning Rules <http://semver.org/>`_. + +Function Visibility +------------------- + +The code and build system only exposes public API functions as usable symbols in the finished binary. This cuts down on binary size quite significantly and also discourages use of undocumented functionality that was not designed for public use. + +When adding a new API function to ``/libmarias3`` always add ``MS3_API`` on its own on the line above the function definition in the header. This tells the build system this is an API function to be included. + +License Headers +--------------- + +Please make sure before committing that all new files have appropriate license headers in. Only add to the copyright of older headers if you have made a significant contribution to that file (25 - 50 lines is typically classed as significant for Open Source projects). + diff --git a/storage/maria/libmarias3/docs/contributors/test_cases.rst b/storage/maria/libmarias3/docs/contributors/test_cases.rst new file mode 100644 index 00000000..f6796f04 --- /dev/null +++ b/storage/maria/libmarias3/docs/contributors/test_cases.rst @@ -0,0 +1,162 @@ +Writing Test Cases +================== + +libMariaS3 uses DDM4's YATL library to create unit tests, this provides macros to test if the outcomes are as expected. + +Adding a Test Case +------------------ + +Test cases are basic C applications in the ``tests/`` directory.
To add a test, edit ``include.am`` and add the following (replacing *mytest* with whatever the test is called): + +.. code-block:: makefile + + t_mytest_SOURCES= tests/mytest.c + t_mytest_LDADD= src/libmarias3.la + check_PROGRAMS+= t/mytest + noinst_PROGRAMS+= t/mytest + + +Using YATL +---------- + +YATL is needed to make sure conditions within the test program are met. To include it in your test application, add the following: + +.. code-block:: c + + #include <yatl/lite.h> + +A test skip can be added if certain conditions aren't met: + +.. code-block:: c + + SKIP_IF_(!is_connected, "Cannot connect to a database server") + +There are many types of assert provided as can be seen in the next section, they can be used as follows: + +.. code-block:: c + + ASSERT_EQ_(3, column, "Column count unexpected") + ASSERT_FALSE_(false_condition, "False condition is not false") + ASSERT_STREQ_("test", some_data, "Unexpected data") + +YATL Library +------------ + +Parameter Definitions +^^^^^^^^^^^^^^^^^^^^^ + +.. c:type:: __expression + + An expression typically used in an ``if`` statement. + +.. c:type:: __expected + + An expected variable or expression + +.. c:type:: __actual + + The actual variable or expression + +.. c:type:: __expected_str + + The expected string + +.. c:type:: __actual_str + + The actual string to compare with + +.. c:type:: __length + + The length of a string for comparison + +Function Definitions +^^^^^^^^^^^^^^^^^^^^ + +.. c:macro:: SKIP_IF(__expression) + + Skips the test if the expression is true + +.. c:macro:: SKIP_IF_(__expression, ...) + + Skips the test if the expression is true and uses a printf style format message + +.. c:macro:: ASSERT_TRUE(__expression) + + Make sure the expression is true, test will fail if it is false + +.. c:macro:: ASSERT_FALSE(__expression) + + Make sure the expression is false, test will fail if it is true + +.. c:macro:: ASSERT_FALSE_(__expression, ...)
+ + Make sure the expression is false and use a printf style format message to fail if it is true. + +.. c:macro:: ASSERT_NULL_(__expression, ...) + + Make sure the expression is :c:type:`NULL` and use a printf style format message to fail if it isn't. + +.. c:macro:: ASSERT_NOT_NULL(__expression) + + Make sure the expression is not :c:type:`NULL`, test will fail if it is :c:type:`NULL`. + +.. c:macro:: ASSERT_NOT_NULL_(__expression, ...) + + Make sure the expression is not :c:type:`NULL` and use a printf style format message to fail if it is. + +.. c:macro:: ASSERT_TRUE_(__expression, ...) + + Make sure the expression is ``true`` and use a printf style format message to fail if it is not. + +.. c:macro:: ASSERT_EQ(__expected, __actual) + + Make sure that one condition or variable matches another one. + + .. note:: + Not suitable for string matching + +.. c:macro:: ASSERT_EQ_(__expected, __actual, ...) + + Make sure that one condition or variable matches another one and use a printf style format message to fail if they do not match. + + .. note:: + Not suitable for string matching + +.. c:macro:: ASSERT_NEQ(__expected, __actual) + + Make sure that one condition or variable does not match another one. + + .. note:: + Not suitable for string matching + +.. c:macro:: ASSERT_NEQ_(__expected, __actual, ...) + + Make sure that one condition or variable does not match another one and use a printf style format message to fail if they do match. + + .. note:: + Not suitable for string matching + +.. c:macro:: ASSERT_STREQ(__expected_str, __actual_str) + + Compare one ``NUL`` terminated string with another one and fail if they do not match. + +.. c:macro:: ASSERT_STREQ_(__expected_str, __actual_str, ...) + + Compare one ``NUL`` terminated string with another one and use a printf style format message to fail if they do not match. + +.. c:macro:: ASSERT_STREQL_(__expected_str, __actual_str, __length, ...)
+ + Compare a string of :c:type:`__length` to another one and use a printf style format message to fail if they do not match. + + .. note:: + This is designed for use with non-NUL-terminated strings. + +.. c:macro:: ASSERT_STRNE(__expected_str, __actual_str) + + Compare one ``NUL`` terminated string with another one and fail if they match. + +.. c:macro:: ASSERT_STRNE_(__expected_str, __actual_str, ...) + + Compare one ``NUL`` terminated string with another one and use a printf style format message to fail if they match. + + diff --git a/storage/maria/libmarias3/docs/include.am b/storage/maria/libmarias3/docs/include.am new file mode 100644 index 00000000..1a9ef3f5 --- /dev/null +++ b/storage/maria/libmarias3/docs/include.am @@ -0,0 +1,82 @@ +# vim:ft=automake +# included from Top Level Makefile.am +# All paths should be given relative to the root + +# Makefile for Sphinx documentation +# + +SPHINXOPTS = ${SPHINX_WARNINGS} -q +PAPER = +SPHINX_BUILDDIR = ${abs_srcdir}/docs + +# Internal variables. 
+PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -c $(top_builddir)/docs $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SPHINX_BUILDDIR) + +.PHONY: clean-docs-check +clean-docs-check: + -rm -rf docs/_build docs/doctrees man/.doctrees + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest + +man: docs/conf.py + @PYTHONPATH=$(SPHINX_BUILDDIR)/docs $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) ${top_builddir}/man + +install-html-local: html-local + @$(MKDIR_P) $(htmldir)/html + @cp -r ${top_builddir}/html $(htmldir)/ + +html-local: docs/conf.py + @PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -nW -b html $(ALLSPHINXOPTS) ${top_builddir}/html + +singlehtml: html-local + @PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/singlehtml + +pickle: docs/conf.py + PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: docs/conf.py + PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: docs/conf.py + PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(SPHINX_BUILDDIR)/htmlhelp." + +epub: docs/conf.py + PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(SPHINX_BUILDDIR)/epub." + +latex: docs/conf.py + PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(SPHINX_BUILDDIR)/latex." 
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: latex + PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + make -C $(SPHINX_BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(SPHINX_BUILDDIR)/latex." + +text: docs/conf.py + @PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/text + +changes: docs/conf.py + @PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/changes + +linkcheck: docs/conf.py + PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/linkcheck + +doctest: docs/conf.py + PYTHONPATH=${top_srcdir}/docs $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(SPHINX_BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(SPHINX_BUILDDIR)/doctest/output.txt." diff --git a/storage/maria/libmarias3/docs/index.rst b/storage/maria/libmarias3/docs/index.rst new file mode 100644 index 00000000..0d454980 --- /dev/null +++ b/storage/maria/libmarias3/docs/index.rst @@ -0,0 +1,47 @@ +libMariaS3 +========== + +:Release: |release| +:Date: |today| + +Introduction +------------ +.. toctree:: + :maxdepth: 2 + + introduction/whatis + introduction/license + introduction/compiling + +API Documentation +----------------- +.. toctree:: + :maxdepth: 2 + + api/functions + api/types + api/errors + api/compiling + +Contributing +------------ +.. toctree:: + :maxdepth: 2 + + contributors/introduction + contributors/coding_standards + contributors/docs + contributors/test_cases + contributors/github + +Appendix +-------- + +.. 
toctree:: + :maxdepth: 2 + + appendix/version_history + appendix/credits + +* :ref:`genindex` +* :ref:`search` diff --git a/storage/maria/libmarias3/docs/introduction/compiling.rst b/storage/maria/libmarias3/docs/introduction/compiling.rst new file mode 100644 index 00000000..b0ad3b4b --- /dev/null +++ b/storage/maria/libmarias3/docs/introduction/compiling.rst @@ -0,0 +1,79 @@ +Compiling libMariaS3 +==================== + +libMariaS3 is designed to be compiled with GCC or Clang on a modern Linux distribution or Mac OSX. + +Prerequisites +------------- + +libMariaS3 requires *libcurl 7.x* and *libxml2* to be installed. For RPM based distributions this can be installed using: + +.. code-block:: bash + + sudo dnf install libcurl-devel libxml2-devel + +Building +-------- + +On most systems you can use the following commands, this is especially useful for customising your install:: + + autoreconf -fi + ./configure + make + make install + +The build system will automatically detect how many processor cores you have (physical and virtual) and set the ``--jobs`` option of make accordingly. + +Testing +------- + +libMariaS3 comes with a basic test suite which we recommend executing, especially if you are building for a new platform.
+ +You will need the following OS environment variables set to run the tests: + ++------------+----------------------------------------------------------+ +| Variable | Description | ++============+==========================================================+ +| S3KEY | Your AWS access key | ++------------+----------------------------------------------------------+ +| S3SECRET | Your AWS secret key | ++------------+----------------------------------------------------------+ +| S3REGION | The AWS region (for example us-east-1) | ++------------+----------------------------------------------------------+ +| S3BUCKET | The S3 bucket name | ++------------+----------------------------------------------------------+ +| S3HOST | OPTIONAL hostname for non-AWS S3 service | ++------------+----------------------------------------------------------+ +| S3NOVERIFY | Set to ``1`` if the host should not use SSL verification | ++------------+----------------------------------------------------------+ + +The test suite is automatically built along with the library and can be executed with ``make check`` or ``make distcheck``. If you wish to test with valgrind you can use:: + + TESTS_ENVIRONMENT="./libtool --mode=execute valgrind --error-exitcode=1 --leak-check=yes --track-fds=yes --malloc-fill=A5 --free-fill=DE" make check + +Building RPMs +------------- + +The build system for libMariaS3 has the capability to build RPMs. To build RPMs simply do the following: + +.. code-block:: bash + + autoreconf -fi + ./configure + make dist-rpm + +.. note:: + The package ``redhat-rpm-config`` is required for building the RPM because this generates the debuginfo RPM. + +Building DEBs +------------- + +Debian packages for libMariaS3 can be built using the standard ``dpkg-buildpackage`` tool as follows: + +.. code-block:: bash + + autoreconf -fi + dpkg-buildpackage + +.. note:: + You may need to add ``--no-sign`` to dpkg-buildpackage to build unsigned packages.
diff --git a/storage/maria/libmarias3/docs/introduction/license.rst b/storage/maria/libmarias3/docs/introduction/license.rst new file mode 100644 index 00000000..fbb62162 --- /dev/null +++ b/storage/maria/libmarias3/docs/introduction/license.rst @@ -0,0 +1,521 @@ +Licensing +========= + +Documentation Content +--------------------- + +.. image:: /_static/cc-symbol.png + :alt: Creative Commons License + :target: http://creativecommons.org/licenses/by-sa/4.0/ + +The libMariaS3 Documentation is licensed under a `Creative Commons Attribution-ShareAlike 4.0 International License <http://creativecommons.org/licenses/by-sa/4.0>`_. + +libMariaS3 License +------------------ + +libMariaS3 is licensed under the `Lesser GNU General Public License, Version 2.1 <https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html>`_. + +:: + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + [This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your + freedom to share and change it. By contrast, the GNU General Public + Licenses are intended to guarantee your freedom to share and change + free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some + specially designated software packages--typically libraries--of the + Free Software Foundation and other authors who decide to use it. 
You + can use it too, but we suggest you first think carefully about whether + this license or the ordinary General Public License is the better + strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, + not price. Our General Public Licenses are designed to make sure that + you have the freedom to distribute copies of free software (and charge + for this service if you wish); that you receive source code or can get + it if you want it; that you can change the software and use pieces of + it in new free programs; and that you are informed that you can do + these things. + + To protect your rights, we need to make restrictions that forbid + distributors to deny you these rights or to ask you to surrender these + rights. These restrictions translate to certain responsibilities for + you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis + or for a fee, you must give the recipients all the rights that we gave + you. You must make sure that they, too, receive or can get the source + code. If you link other code with the library, you must provide + complete object files to the recipients, so that they can relink them + with the library after making changes to the library and recompiling + it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the + library, and (2) we offer you this license, which gives you legal + permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that + there is no warranty for the free library. 
Also, if the library is + modified by someone else and passed on, the recipients should know + that what they have is not the original version, so that the original + author's reputation will not be affected by problems that might be + introduced by others. + + Finally, software patents pose a constant threat to the existence of + any free program. We wish to make sure that a company cannot + effectively restrict the users of a free program by obtaining a + restrictive license from a patent holder. Therefore, we insist that + any patent license obtained for a version of the library must be + consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the + ordinary GNU General Public License. This license, the GNU Lesser + General Public License, applies to certain designated libraries, and + is quite different from the ordinary General Public License. We use + this license for certain libraries in order to permit linking those + libraries into non-free programs. + + When a program is linked with a library, whether statically or using + a shared library, the combination of the two is legally speaking a + combined work, a derivative of the original library. The ordinary + General Public License therefore permits such linking only if the + entire combination fits its criteria of freedom. The Lesser General + Public License permits more lax criteria for linking other code with + the library. + + We call this license the "Lesser" General Public License because it + does Less to protect the user's freedom than the ordinary General + Public License. It also provides other free software developers Less + of an advantage over competing non-free programs. These disadvantages + are the reason we use the ordinary General Public License for many + libraries. However, the Lesser license provides advantages in certain + special circumstances. 
+ + For example, on rare occasions, there may be a special need to + encourage the widest possible use of a certain library, so that it becomes + a de-facto standard. To achieve this, non-free programs must be + allowed to use the library. A more frequent case is that a free + library does the same job as widely used non-free libraries. In this + case, there is little to gain by limiting the free library to free + software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free + programs enables a greater number of people to use a large body of + free software. For example, permission to use the GNU C Library in + non-free programs enables many more people to use the whole GNU + operating system, as well as its variant, the GNU/Linux operating + system. + + Although the Lesser General Public License is Less protective of the + users' freedom, it does ensure that the user of a program that is + linked with the Library has the freedom and the wherewithal to run + that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and + modification follow. Pay close attention to the difference between a + "work based on the library" and a "work that uses the library". The + former contains code derived from the library, whereas the latter must + be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + A. This License Agreement applies to any software library or other + program which contains a notice placed by the copyright holder or + other authorized party saying it may be distributed under the terms of + this Lesser General Public License (also called "this License"). + Each licensee is addressed as "you". 
+ + A "library" means a collection of software functions and/or data + prepared so as to be conveniently linked with application programs + (which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work + which has been distributed under these terms. A "work based on the + Library" means either the Library or any derivative work under + copyright law: that is to say, a work containing the Library or a + portion of it, either verbatim or with modifications and/or translated + straightforwardly into another language. (Hereinafter, translation is + included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for + making modifications to it. For a library, complete source code means + all the source code for all modules it contains, plus any associated + interface definition files, plus the scripts used to control compilation + and installation of the library. + + Activities other than copying, distribution and modification are not + covered by this License; they are outside its scope. The act of + running a program using the Library is not restricted, and output from + such a program is covered only if its contents constitute a work based + on the Library (independent of the use of the Library in a tool for + writing it). Whether that is true depends on what the Library does + and what the program that uses the Library does. + + A. You may copy and distribute verbatim copies of the Library's + complete source code as you receive it, in any medium, provided that + you conspicuously and appropriately publish on each copy an + appropriate copyright notice and disclaimer of warranty; keep intact + all the notices that refer to this License and to the absence of any + warranty; and distribute a copy of this License along with the + Library. 
+ + You may charge a fee for the physical act of transferring a copy, + and you may at your option offer warranty protection in exchange for a + fee. + + A. You may modify your copy or copies of the Library or any portion + of it, thus forming a work based on the Library, and copy and + distribute such modifications or work under the terms of Section 1 + above, provided that you also meet all of these conditions: + + I) The modified work must itself be a software library. + + II) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + I) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + I) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + + These requirements apply to the modified work as a whole. If + identifiable sections of that work are not derived from the Library, + and can be reasonably considered independent and separate works in + themselves, then this License, and its terms, do not apply to those + sections when you distribute them as separate works. 
But when you + distribute the same sections as part of a whole which is a work based + on the Library, the distribution of the whole must be on the terms of + this License, whose permissions for other licensees extend to the + entire whole, and thus to each and every part regardless of who wrote + it. + + Thus, it is not the intent of this section to claim rights or contest + your rights to work written entirely by you; rather, the intent is to + exercise the right to control the distribution of derivative or + collective works based on the Library. + + In addition, mere aggregation of another work not based on the Library + with the Library (or with a work based on the Library) on a volume of + a storage or distribution medium does not bring the other work under + the scope of this License. + + A. You may opt to apply the terms of the ordinary GNU General Public + License instead of this License to a given copy of the Library. To do + this, you must alter all the notices that refer to this License, so + that they refer to the ordinary GNU General Public License, version 2, + instead of to this License. (If a newer version than version 2 of the + ordinary GNU General Public License has appeared, then you can specify + that version instead if you wish.) Do not make any other change in + these notices. + + Once this change is made in a given copy, it is irreversible for + that copy, so the ordinary GNU General Public License applies to all + subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of + the Library into a program that is not a library. + + A. 
You may copy and distribute the Library (or a portion or + derivative of it, under Section 2) in object code or executable form + under the terms of Sections 1 and 2 above provided that you accompany + it with the complete corresponding machine-readable source code, which + must be distributed under the terms of Sections 1 and 2 above on a + medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy + from a designated place, then offering equivalent access to copy the + source code from the same place satisfies the requirement to + distribute the source code, even though third parties are not + compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the + Library, but is designed to work with the Library by being compiled or + linked with it, is called a "work that uses the Library". Such a + work, in isolation, is not a derivative work of the Library, and + therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library + creates an executable that is a derivative of the Library (because it + contains portions of the Library), rather than a "work that uses the + library". The executable is therefore covered by this License. + Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file + that is part of the Library, the object code for the work may be a + derivative work of the Library even though the source code is not. + Whether this is true is especially significant if the work can be + linked without the Library, or if the work is itself a library. The + threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data + structure layouts and accessors, and small macros and small inline + functions (ten lines or less in length), then the use of the object + file is unrestricted, regardless of whether it is legally a derivative + work. (Executables containing this object code plus portions of the + Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may + distribute the object code for the work under the terms of Section 6. + Any executables containing that work also fall under Section 6, + whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or + link a "work that uses the Library" with the Library to produce a + work containing portions of the Library, and distribute that work + under terms of your choice, provided that the terms permit + modification of the work for the customer's own use and reverse + engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the + Library is used in it and that the Library and its use are covered by + this License. You must supply a copy of this License. If the work + during execution displays copyright notices, you must include the + copyright notice for the Library among them, as well as a reference + directing the user to the copy of this License. Also, you must do one + of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the + Library" must include any data and utility programs needed for + reproducing the executable from it. However, as a special exception, + the materials to be distributed need not include anything that is + normally distributed (in either source or binary form) with the major + components (compiler, kernel, and so on) of the operating system on + which the executable runs, unless that component itself accompanies + the executable. + + It may happen that this requirement contradicts the license + restrictions of other proprietary libraries that do not normally + accompany the operating system. Such a contradiction means you cannot + use both them and the Library together in an executable that you + distribute. + + 7. 
You may place library facilities that are a work based on the + Library side-by-side in a single library together with other library + facilities not covered by this License, and distribute such a combined + library, provided that the separate distribution of the work based on + the Library and of the other library facilities is otherwise + permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute + the Library except as expressly provided under this License. Any + attempt otherwise to copy, modify, sublicense, link with, or + distribute the Library is void, and will automatically terminate your + rights under this License. However, parties who have received copies, + or rights, from you under this License will not have their licenses + terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not + signed it. However, nothing else grants you permission to modify or + distribute the Library or its derivative works. These actions are + prohibited by law if you do not accept this License. Therefore, by + modifying or distributing the Library (or any work based on the + Library), you indicate your acceptance of this License to do so, and + all its terms and conditions for copying, distributing or modifying + the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the + Library), the recipient automatically receives a license from the + original licensor to copy, distribute, link with or modify the Library + subject to these terms and conditions. You may not impose any further + restrictions on the recipients' exercise of the rights granted herein. + You are not responsible for enforcing compliance by third parties with + this License. + + 11. If, as a consequence of a court judgment or allegation of patent + infringement or for any other reason (not limited to patent issues), + conditions are imposed on you (whether by court order, agreement or + otherwise) that contradict the conditions of this License, they do not + excuse you from the conditions of this License. If you cannot + distribute so as to satisfy simultaneously your obligations under this + License and any other pertinent obligations, then as a consequence you + may not distribute the Library at all. For example, if a patent + license would not permit royalty-free redistribution of the Library by + all those who receive copies directly or indirectly through you, then + the only way you could satisfy both it and this License would be to + refrain entirely from distribution of the Library. + + If any portion of this section is held invalid or unenforceable under any + particular circumstance, the balance of the section is intended to apply, + and the section as a whole is intended to apply in other circumstances. + + It is not the purpose of this section to induce you to infringe any + patents or other property right claims or to contest validity of any + such claims; this section has the sole purpose of protecting the + integrity of the free software distribution system which is + implemented by public license practices. 
Many people have made + generous contributions to the wide range of software distributed + through that system in reliance on consistent application of that + system; it is up to the author/donor to decide if he or she is willing + to distribute software through any other system and a licensee cannot + impose that choice. + + This section is intended to make thoroughly clear what is believed to + be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in + certain countries either by patents or by copyrighted interfaces, the + original copyright holder who places the Library under this License may add + an explicit geographical distribution limitation excluding those countries, + so that distribution is permitted only in or among countries not thus + excluded. In such case, this License incorporates the limitation as if + written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new + versions of the Lesser General Public License from time to time. + Such new versions will be similar in spirit to the present version, + but may differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the Library + specifies a version number of this License which applies to it and + "any later version", you have the option of following the terms and + conditions either of that version or of any later version published by + the Free Software Foundation. If the Library does not specify a + license version number, you may choose any version ever published by + the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free + programs whose distribution conditions are incompatible with these, + write to the author to ask for permission. For software which is + copyrighted by the Free Software Foundation, write to the Free + Software Foundation; we sometimes make exceptions for this. 
Our + decision will be guided by the two goals of preserving the free status + of all derivatives of our free software and of promoting the sharing + and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO + WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. + EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR + OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY + KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE + LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME + THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN + WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY + AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU + FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR + CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE + LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING + RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A + FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF + SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest + possible use to the public, we recommend making it free software that + everyone can redistribute and change. You can do so by permitting + redistribution under these terms (or, alternatively, under the terms of the + ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is + safest to attach them to the start of each source file to most effectively + convey the exclusion of warranty; and each file should have at least the + "copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + + Also add information on how to contact you by electronic and paper mail. + + You should also get your employer (if you work as a programmer) or your + school, if any, to sign a "copyright disclaimer" for the library, if + necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + + That's all there is to it! diff --git a/storage/maria/libmarias3/docs/introduction/whatis.rst b/storage/maria/libmarias3/docs/introduction/whatis.rst new file mode 100644 index 00000000..9e457783 --- /dev/null +++ b/storage/maria/libmarias3/docs/introduction/whatis.rst @@ -0,0 +1,6 @@ +What is libMariaS3? +=================== + +libMariaS3 is a lightweight library to connect to Amazon's S3 storage. 
+ +It is LGPL 2.1 licensed so that it is possible to use both with Open Source and Commercial applications. It is also designed to provide a relatively easy to use API. diff --git a/storage/maria/libmarias3/extra/astyle.sh b/storage/maria/libmarias3/extra/astyle.sh new file mode 100755 index 00000000..d3cf2620 --- /dev/null +++ b/storage/maria/libmarias3/extra/astyle.sh @@ -0,0 +1,2 @@ +#!/bin/bash +astyle --style=allman --indent=spaces=2 --indent-switches --break-blocks --pad-comma --pad-oper --pad-header --lineend=linux --align-pointer=name --align-reference=name --max-code-length=80 --recursive "*.c" "*.h" diff --git a/storage/maria/libmarias3/libmarias3.pc.in b/storage/maria/libmarias3/libmarias3.pc.in new file mode 100644 index 00000000..3a97fb21 --- /dev/null +++ b/storage/maria/libmarias3/libmarias3.pc.in @@ -0,0 +1,11 @@ +prefix=@prefix@ +exec_prefix=@prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: @PACKAGE_NAME@ +Version: @PACKAGE_VERSION@ +Description: a C connector for Amazon's S3. + +Libs: -L${libdir} @LIBS@ +Cflags: -I${includedir} diff --git a/storage/maria/libmarias3/libmarias3/include.am b/storage/maria/libmarias3/libmarias3/include.am new file mode 100644 index 00000000..4fcb7495 --- /dev/null +++ b/storage/maria/libmarias3/libmarias3/include.am @@ -0,0 +1,6 @@ +# vim:ft=automake +# included from Top Level Makefile.am +# All paths should be given relative to the root + +nobase_include_HEADERS+= libmarias3/marias3.h +nobase_include_HEADERS+= libmarias3/visibility.h diff --git a/storage/maria/libmarias3/libmarias3/marias3.h b/storage/maria/libmarias3/libmarias3/marias3.h new file mode 100644 index 00000000..80b5e77c --- /dev/null +++ b/storage/maria/libmarias3/libmarias3/marias3.h @@ -0,0 +1,171 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include <curl/curl.h> +#include <stdint.h> + +#include <libmarias3/visibility.h> + +struct ms3_st; +typedef struct ms3_st ms3_st; + +struct ms3_list_st +{ + char *key; + size_t length; + time_t created; + struct ms3_list_st *next; +}; + +typedef struct ms3_list_st ms3_list_st; + +struct ms3_status_st +{ + size_t length; + time_t created; +}; + +typedef struct ms3_status_st ms3_status_st; + +typedef void *(*ms3_malloc_callback)(size_t size); +typedef void (*ms3_free_callback)(void *ptr); +typedef void *(*ms3_realloc_callback)(void *ptr, size_t size); +typedef char *(*ms3_strdup_callback)(const char *str); +typedef void *(*ms3_calloc_callback)(size_t nmemb, size_t size); + +enum ms3_error_code_t +{ + MS3_ERR_NONE, + MS3_ERR_PARAMETER, + MS3_ERR_NO_DATA, + MS3_ERR_URI_TOO_LONG, + MS3_ERR_RESPONSE_PARSE, + MS3_ERR_REQUEST_ERROR, + MS3_ERR_OOM, + MS3_ERR_IMPOSSIBLE, + MS3_ERR_AUTH, + MS3_ERR_NOT_FOUND, + MS3_ERR_SERVER, + MS3_ERR_TOO_BIG, + MS3_ERR_AUTH_ROLE, + MS3_ERR_MAX // Always the last error +}; + +typedef enum ms3_error_code_t ms3_error_code_t; + +enum ms3_set_option_t +{ + MS3_OPT_USE_HTTP, + MS3_OPT_DISABLE_SSL_VERIFY, + MS3_OPT_BUFFER_CHUNK_SIZE, + 
MS3_OPT_FORCE_LIST_VERSION, + MS3_OPT_FORCE_PROTOCOL_VERSION, + MS3_OPT_PORT_NUMBER +}; + +typedef enum ms3_set_option_t ms3_set_option_t; + +MS3_API +void ms3_library_init(void); + +MS3_API +void ms3_library_deinit(void); + +MS3_API +uint8_t ms3_library_init_malloc(ms3_malloc_callback m, + ms3_free_callback f, ms3_realloc_callback r, + ms3_strdup_callback s, ms3_calloc_callback c); + +MS3_API +ms3_st *ms3_init(const char *s3key, const char *s3secret, + const char *region, + const char *base_domain); + +MS3_API +uint8_t ms3_init_assume_role(ms3_st *ms3, const char *iam_role, const char *sts_endpoint, const char *sts_region); + +MS3_API +uint8_t ms3_ec2_set_cred(ms3_st *ms3, const char *iam_role, + const char *s3key, const char *s3secret, + const char *token); + +MS3_API +uint8_t ms3_set_option(ms3_st *ms3, ms3_set_option_t option, void *value); + +MS3_API +void ms3_deinit(ms3_st *ms3); + +MS3_API +const char *ms3_server_error(ms3_st *ms3); + +MS3_API +const char *ms3_error(uint8_t errcode); + +MS3_API +void ms3_debug(void); + +MS3_API +uint8_t ms3_list(ms3_st *ms3, const char *bucket, const char *prefix, + ms3_list_st **list); + +MS3_API +uint8_t ms3_list_dir(ms3_st *ms3, const char *bucket, const char *prefix, + ms3_list_st **list); + +MS3_API +void ms3_list_free(ms3_list_st *list); + +MS3_API +uint8_t ms3_put(ms3_st *ms3, const char *bucket, const char *key, + const uint8_t *data, size_t length); + +MS3_API +uint8_t ms3_get(ms3_st *ms3, const char *bucket, const char *key, + uint8_t **data, size_t *length); + +MS3_API +uint8_t ms3_copy(ms3_st *ms3, const char *source_bucket, const char *source_key, + const char *dest_bucket, const char *dest_key); + +MS3_API +uint8_t ms3_move(ms3_st *ms3, const char *source_bucket, const char *source_key, + const char *dest_bucket, const char *dest_key); + +MS3_API +void ms3_free(uint8_t *data); + +MS3_API +uint8_t ms3_delete(ms3_st *ms3, const char *bucket, const char *key); + +MS3_API +uint8_t ms3_status(ms3_st *ms3, const char 
*bucket, const char *key, + ms3_status_st *status); + +MS3_API +uint8_t ms3_assume_role(ms3_st *ms3); + +#ifdef __cplusplus +} +#endif diff --git a/storage/maria/libmarias3/libmarias3/visibility.h b/storage/maria/libmarias3/libmarias3/visibility.h new file mode 100644 index 00000000..191c2129 --- /dev/null +++ b/storage/maria/libmarias3/libmarias3/visibility.h @@ -0,0 +1,36 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#if defined(BUILDING_MS3) +# if defined(HAVE_VISIBILITY) && HAVE_VISIBILITY +# define MS3_API __attribute__ ((visibility("default"))) +# elif defined (__SUNPRO_C) && (__SUNPRO_C >= 0x550) +# define MS3_API __global +# elif defined(_MSC_VER) +# define MS3_API extern __declspec(dllexport) +# else +# define MS3_API +# endif /* defined(HAVE_VISIBILITY) */ +#else /* defined(BUILDING_MS3) */ +# if defined(_MSC_VER) +# define MS3_API extern __declspec(dllimport) +# else +# define MS3_API +# endif /* defined(_MSC_VER) */ +#endif /* defined(BUILDING_MS3) */ diff --git a/storage/maria/libmarias3/m4/ax_add_am_macro.m4 b/storage/maria/libmarias3/m4/ax_add_am_macro.m4 new file mode 100644 index 00000000..51ce0d0c --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_add_am_macro.m4 @@ -0,0 +1,29 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_add_am_macro.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_ADD_AM_MACRO([RULE]) +# +# DESCRIPTION +# +# Adds the specified rule to $AMINCLUDE. This macro will only work +# properly with implementations of Make which allow include statements. +# See also AX_ADD_AM_MACRO_STATIC. +# +# LICENSE +# +# Copyright (c) 2009 Tom Howard <tomhoward@users.sf.net> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +#serial 9 + +AC_DEFUN([AX_ADD_AM_MACRO],[ + AC_REQUIRE([AX_AM_MACROS]) + AX_APPEND_TO_FILE([$AMINCLUDE],[$1]) +]) diff --git a/storage/maria/libmarias3/m4/ax_am_jobserver.m4 b/storage/maria/libmarias3/m4/ax_am_jobserver.m4 new file mode 100644 index 00000000..0bee7ab6 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_am_jobserver.m4 @@ -0,0 +1,55 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_am_jobserver.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_AM_JOBSERVER([default_value]) +# +# DESCRIPTION +# +# Enables the use of make's jobserver for the purpose of parallel building +# by passing the -j option to make. +# +# The option --enable-jobserver is added to configure which can accept a +# yes, no, or an integer. The integer is the number of separate jobs to +# allow. If 'yes' is given, then the number of jobs is assumed to be one more than the +# number of CPUs (determined through AX_COUNT_CPUS). If the value of no is +# given, then the jobserver is disabled. The default value is given by the +# first argument of the macro, or 'yes' if the argument is omitted. +# +# This macro makes use of AX_AM_MACROS, so you must add the following line +# +# @INC_AMINCLUDE@ +# +# to your Makefile.am files. +# +# LICENSE +# +# Copyright (c) 2008 Michael Paul Bailey <jinxidoru@byu.net> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +#serial 7 + +AC_DEFUN([AX_AM_JOBSERVER], [ + AC_REQUIRE([AX_COUNT_CPUS]) + AC_REQUIRE([AX_AM_MACROS]) + AC_ARG_ENABLE( jobserver, + [ --enable-jobserver@<:@=no/yes/@%:@@:>@ default=m4_ifval([$1],[$1],[yes]) + Enable up to @%:@ make jobs + yes: enable one more than CPU count + ],, [enable_jobserver=m4_ifval([$1],[$1],[yes])]) + if test "x$enable_jobserver" = "xyes"; then + enable_jobserver=$CPU_COUNT + ((enable_jobserver++)) + fi + m4_pattern_allow(AM_MAKEFLAGS) + if test "x$enable_jobserver" != "xno"; then + AC_MSG_NOTICE([added jobserver support to make for $enable_jobserver jobs]) + AX_ADD_AM_MACRO( AM_MAKEFLAGS += -j$enable_jobserver ) + fi +]) diff --git a/storage/maria/libmarias3/m4/ax_am_macros.m4 b/storage/maria/libmarias3/m4/ax_am_macros.m4 new file mode 100644 index 00000000..6b4bd223 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_am_macros.m4 @@ -0,0 +1,44 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_am_macros.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_AM_MACROS +# +# DESCRIPTION +# +# Adds support for macros that create Make rules. You must manually add +# the following line +# +# @INC_AMINCLUDE@ +# +# to your Makefile.in (or Makefile.am if you use Automake) files. +# +# LICENSE +# +# Copyright (c) 2009 Tom Howard <tomhoward@users.sf.net> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +#serial 9 + +AC_DEFUN([AX_AM_MACROS], +[ +AC_MSG_NOTICE([adding automake macro support]) +AMINCLUDE="aminclude.am" +AC_SUBST(AMINCLUDE) +AC_MSG_NOTICE([creating $AMINCLUDE]) +AMINCLUDE_TIME=`date` +AX_PRINT_TO_FILE([$AMINCLUDE],[[ +# generated automatically by configure from AX_AUTOMAKE_MACROS +# on $AMINCLUDE_TIME + +]]) + +INC_AMINCLUDE="include \$(top_builddir)/$AMINCLUDE" +AC_SUBST(INC_AMINCLUDE) +]) diff --git a/storage/maria/libmarias3/m4/ax_append_compile_flags.m4 b/storage/maria/libmarias3/m4/ax_append_compile_flags.m4 new file mode 100644 index 00000000..1f8e7084 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_append_compile_flags.m4 @@ -0,0 +1,65 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_append_compile_flags.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_APPEND_COMPILE_FLAGS([FLAG1 FLAG2 ...], [FLAGS-VARIABLE], [EXTRA-FLAGS]) +# +# DESCRIPTION +# +# For every FLAG1, FLAG2 it is checked whether the compiler works with the +# flag. If it does, the flag is added to FLAGS-VARIABLE +# +# If FLAGS-VARIABLE is not specified, the current language's flags (e.g. +# CFLAGS) is used. During the check the flag is always added to the +# current language's flags. +# +# If EXTRA-FLAGS is defined, it is added to the current language's default +# flags (e.g. CFLAGS) when the check is done. The check is thus made with +# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to +# force the compiler to issue an error when a bad flag is given. +# +# NOTE: This macro depends on the AX_APPEND_FLAG and +# AX_CHECK_COMPILE_FLAG. Please keep this macro in sync with +# AX_APPEND_LINK_FLAGS. 
+# +# LICENSE +# +# Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com> +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. 
+ +#serial 3 + +AC_DEFUN([AX_APPEND_COMPILE_FLAGS], +[AC_REQUIRE([AX_CHECK_COMPILE_FLAG]) +AC_REQUIRE([AX_APPEND_FLAG]) +for flag in $1; do + AX_CHECK_COMPILE_FLAG([$flag], [AX_APPEND_FLAG([$flag], [$2])], [], [$3]) +done +])dnl AX_APPEND_COMPILE_FLAGS diff --git a/storage/maria/libmarias3/m4/ax_append_flag.m4 b/storage/maria/libmarias3/m4/ax_append_flag.m4 new file mode 100644 index 00000000..1d38b76f --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_append_flag.m4 @@ -0,0 +1,69 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_append_flag.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_APPEND_FLAG(FLAG, [FLAGS-VARIABLE]) +# +# DESCRIPTION +# +# FLAG is appended to the FLAGS-VARIABLE shell variable, with a space +# added in between. +# +# If FLAGS-VARIABLE is not specified, the current language's flags (e.g. +# CFLAGS) is used. FLAGS-VARIABLE is not changed if it already contains +# FLAG. If FLAGS-VARIABLE is unset in the shell, it is set to exactly +# FLAG. +# +# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de> +# Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com> +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. 
+# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 2 + +# FLAGS is a temporary m4 name for the target variable: $2 when given, +# otherwise _AC_LANG_PREFIX followed by FLAGS. If that variable is set and +# already holds $1 as a space-delimited word it is left alone; if it is set +# without $1, the value becomes the old value, a space and $1; if it is +# unset it becomes exactly $1. In the two already-set cases a no-op colon +# command describing the outcome is passed to AC_RUN_LOG. +AC_DEFUN([AX_APPEND_FLAG], +[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX +AS_VAR_PUSHDEF([FLAGS], [m4_default($2,_AC_LANG_PREFIX[FLAGS])])dnl +AS_VAR_SET_IF(FLAGS, + [case " AS_VAR_GET(FLAGS) " in + *" $1 "*) + AC_RUN_LOG([: FLAGS already contains $1]) + ;; + *) + AC_RUN_LOG([: FLAGS="$FLAGS $1"]) + AS_VAR_SET(FLAGS, ["AS_VAR_GET(FLAGS) $1"]) + ;; + esac], + [AS_VAR_SET(FLAGS,["$1"])]) +AS_VAR_POPDEF([FLAGS])dnl +])dnl AX_APPEND_FLAG diff --git a/storage/maria/libmarias3/m4/ax_append_link_flags.m4 b/storage/maria/libmarias3/m4/ax_append_link_flags.m4 new file mode 100644 index 00000000..48cbd4bb --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_append_link_flags.m4 @@ -0,0 +1,63 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_append_link_flags.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_APPEND_LINK_FLAGS([FLAG1 FLAG2 ...], [FLAGS-VARIABLE], [EXTRA-FLAGS]) +# +# DESCRIPTION +# +# For every FLAG1, FLAG2 it is checked whether the linker works with the +# flag.
If it does, the flag is added FLAGS-VARIABLE +# +# If FLAGS-VARIABLE is not specified, the linker's flags (LDFLAGS) is +# used. During the check the flag is always added to the linker's flags. +# +# If EXTRA-FLAGS is defined, it is added to the linker's default flags +# when the check is done. The check is thus made with the flags: "LDFLAGS +# EXTRA-FLAGS FLAG". This can for example be used to force the linker to +# issue an error when a bad flag is given. +# +# NOTE: This macro depends on the AX_APPEND_FLAG and AX_CHECK_LINK_FLAG. +# Please keep this macro in sync with AX_APPEND_COMPILE_FLAGS. +# +# LICENSE +# +# Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com> +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. 
When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 3 + +# For each whitespace-separated flag in $1, AX_CHECK_LINK_FLAG is run with +# EXTRA-FLAGS $3; on success the flag is handed to AX_APPEND_FLAG for the +# variable named in $2, or LDFLAGS when $2 is empty. Nothing is done for a +# rejected flag. +AC_DEFUN([AX_APPEND_LINK_FLAGS], +[AC_REQUIRE([AX_CHECK_LINK_FLAG]) +AC_REQUIRE([AX_APPEND_FLAG]) +for flag in $1; do + AX_CHECK_LINK_FLAG([$flag], [AX_APPEND_FLAG([$flag], [m4_default([$2], [LDFLAGS])])], [], [$3]) +done +])dnl AX_APPEND_LINK_FLAGS diff --git a/storage/maria/libmarias3/m4/ax_append_to_file.m4 b/storage/maria/libmarias3/m4/ax_append_to_file.m4 new file mode 100644 index 00000000..f9f54e08 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_append_to_file.m4 @@ -0,0 +1,27 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_append_to_file.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_APPEND_TO_FILE([FILE],[DATA]) +# +# DESCRIPTION +# +# Appends the specified data to the specified file. +# +# LICENSE +# +# Copyright (c) 2008 Tom Howard <tomhoward@users.sf.net> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty.
+ +#serial 8 + +# Requires AX_FILE_ESCAPES, then appends DATA ($2) followed by a newline to +# FILE ($1) using the shell printf. DATA is used as the printf format +# string, so percent signs and backslashes in it are interpreted by printf. +# NOTE(review): presumably AX_FILE_ESCAPES supplies escapes for such +# characters; it is not visible here, confirm before relying on it. +AC_DEFUN([AX_APPEND_TO_FILE],[ +AC_REQUIRE([AX_FILE_ESCAPES]) +printf "$2\n" >> "$1" +]) diff --git a/storage/maria/libmarias3/m4/ax_assert.m4 b/storage/maria/libmarias3/m4/ax_assert.m4 new file mode 100644 index 00000000..88741293 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_assert.m4 @@ -0,0 +1,66 @@ +# =========================================================================== +# https://github.com/BrianAker/ddm4/ +# =========================================================================== +# +# SYNOPSIS +# +# AX_ASSERT() +# +# DESCRIPTION +# +# --enable-assert +# +# LICENSE +# +# Copyright (C) 2012 Brian Aker +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# +# * The names of its contributors may not be used to endorse or +# promote products derived from this software without specific prior +# written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#serial 6 + +# Asserts stay enabled when --enable-assert is given with the value yes, when +# ax_enable_debug is yes, or when ac_cv_vcs_checkout is yes; in every other +# case ax_enable_assert becomes no and NDEBUG is defined. The option value +# is taken from enableval so that --disable-assert and --enable-assert=no +# are honoured instead of being treated as yes. The three conditions are +# chained with the shell OR operator rather than the -o operand of test. +AC_DEFUN([AX_ASSERT], + [AC_PREREQ([2.63])dnl + AC_REQUIRE([AX_DEBUG]) + AC_REQUIRE([AX_VCS_CHECKOUT]) + AC_ARG_ENABLE([assert], + [AS_HELP_STRING([--enable-assert], + [Enable assert, this will be overridden by --enable-debug (yes|no) @<:@default=no@:>@])], + [ax_enable_assert=$enableval], + [ax_enable_assert=no]) + + AS_IF([ test "$ax_enable_assert" = "yes" || test "$ax_enable_debug" = "yes" || test "$ac_cv_vcs_checkout" = "yes" ], + [ax_enable_assert="yes"], + [ax_enable_assert="no" + AC_DEFINE(NDEBUG,[1],[Define to 1 to disable assert'ing code.])]) + + AC_MSG_CHECKING([for assert]) + AC_MSG_RESULT([$ax_enable_assert]) + ]) + diff --git a/storage/maria/libmarias3/m4/ax_check_compile_flag.m4 b/storage/maria/libmarias3/m4/ax_check_compile_flag.m4 new file mode 100644 index 00000000..c3a8d695 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_check_compile_flag.m4 @@ -0,0 +1,72 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS]) +# +# DESCRIPTION +# +# Check whether the given FLAG works with the current language's compiler +# or gives an error. (Warnings, however, are ignored) +# +# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on +# success/failure.
+# +# If EXTRA-FLAGS is defined, it is added to the current language's default +# flags (e.g. CFLAGS) when the check is done. The check is thus made with +# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to +# force the compiler to issue an error when a bad flag is given. +# +# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this +# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de> +# Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com> +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. 
When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 2 + +# The result is cached in a variable whose name is built from the language +# abbreviation, EXTRA-FLAGS ($4) and FLAG ($1). The language FLAGS variable +# is saved, extended with $4 and $1 while an empty AC_LANG_PROGRAM is +# compiled, and then restored. ACTION-SUCCESS ($2) or ACTION-FAILURE ($3) +# runs depending on the cached value; an omitted action defaults to the +# no-op colon command. +AC_DEFUN([AX_CHECK_COMPILE_FLAG], +[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX +AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl +AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ + ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS + _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], + [AS_VAR_SET(CACHEVAR,[yes])], + [AS_VAR_SET(CACHEVAR,[no])]) + _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) +AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes], + [m4_default([$2], :)], + [m4_default([$3], :)]) +AS_VAR_POPDEF([CACHEVAR])dnl +])dnl AX_CHECK_COMPILE_FLAG diff --git a/storage/maria/libmarias3/m4/ax_check_link_flag.m4 b/storage/maria/libmarias3/m4/ax_check_link_flag.m4 new file mode 100644 index 00000000..e2d0d363 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_check_link_flag.m4 @@ -0,0 +1,71 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_check_link_flag.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CHECK_LINK_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS]) +# +# DESCRIPTION +# +# Check whether the given FLAG works with the linker or gives an error. +# (Warnings, however, are ignored) +# +# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on +# success/failure. +# +# If EXTRA-FLAGS is defined, it is added to the linker's default flags +# when the check is done. The check is thus made with the flags: "LDFLAGS +# EXTRA-FLAGS FLAG". This can for example be used to force the linker to +# issue an error when a bad flag is given. +# +# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION.
Please keep this +# macro in sync with AX_CHECK_{PREPROC,COMPILE}_FLAG. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de> +# Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com> +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. 
+ +#serial 2 + +# The result is cached in a variable whose name is built from EXTRA-FLAGS +# ($4) and FLAG ($1). LDFLAGS is saved, extended with $4 and $1 while an +# empty AC_LANG_PROGRAM is linked, and then restored. ACTION-SUCCESS ($2) +# or ACTION-FAILURE ($3) runs depending on the cached value; an omitted +# action defaults to the no-op colon command. +AC_DEFUN([AX_CHECK_LINK_FLAG], +[AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_ldflags_$4_$1])dnl +AC_CACHE_CHECK([whether the linker accepts $1], CACHEVAR, [ + ax_check_save_flags=$LDFLAGS + LDFLAGS="$LDFLAGS $4 $1" + AC_LINK_IFELSE([AC_LANG_PROGRAM()], + [AS_VAR_SET(CACHEVAR,[yes])], + [AS_VAR_SET(CACHEVAR,[no])]) + LDFLAGS=$ax_check_save_flags]) +AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes], + [m4_default([$2], :)], + [m4_default([$3], :)]) +AS_VAR_POPDEF([CACHEVAR])dnl +])dnl AX_CHECK_LINK_FLAG diff --git a/storage/maria/libmarias3/m4/ax_compiler_vendor.m4 b/storage/maria/libmarias3/m4/ax_compiler_vendor.m4 new file mode 100644 index 00000000..c2f421bc --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_compiler_vendor.m4 @@ -0,0 +1,85 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_compiler_vendor.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_COMPILER_VENDOR +# +# DESCRIPTION +# +# Determine the vendor of the C/C++ compiler, e.g., gnu, intel, ibm, sun, +# hp, borland, comeau, dec, cray, kai, lcc, metrowerks, sgi, microsoft, +# watcom, etc. The vendor is returned in the cache variable +# $ax_cv_c_compiler_vendor for C and $ax_cv_cxx_compiler_vendor for C++. +# +# LICENSE +# +# Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu> +# Copyright (c) 2008 Matteo Frigo +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details.
+# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 12 + +# The vendors list pairs a name ending in a colon with a comma-separated set +# of predefined macros. The loop remembers the most recent name and, for +# each macro set, compiles a program that fails unless at least one of the +# macros is defined. It stops at the first set that compiles; if none +# does, the loop ends with the last name, unknown. The text before the +# colon of the remembered name is stored in the cache variable. +AC_DEFUN([AX_COMPILER_VENDOR], +[AC_CACHE_CHECK([for _AC_LANG compiler vendor], ax_cv_[]_AC_LANG_ABBREV[]_compiler_vendor, + [# note: don't check for gcc first since some other compilers define __GNUC__ + vendors="intel: __ICC,__ECC,__INTEL_COMPILER + ibm: __xlc__,__xlC__,__IBMC__,__IBMCPP__ + pathscale: __PATHCC__,__PATHSCALE__ + clang: __clang__ + fujitsu: __FUJITSU + gnu: __GNUC__ + sun: __SUNPRO_C,__SUNPRO_CC + hp: __HP_cc,__HP_aCC + dec: __DECC,__DECCXX,__DECC_VER,__DECCXX_VER + borland: __BORLANDC__,__TURBOC__ + comeau: __COMO__ + cray: _CRAYC + kai: __KCC + lcc: __LCC__ + sgi: __sgi,sgi + microsoft: _MSC_VER + metrowerks: __MWERKS__ + watcom: __WATCOMC__ + portland: __PGI + unknown: UNKNOWN" + for ventest in $vendors; do + case $ventest in + *:) vendor=$ventest; continue ;; + *) vencpp="defined("`echo $ventest | sed 's/,/) || defined(/g'`")" ;; + esac + AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[ + #if !($vencpp) + thisisanerror; + #endif + ])], [break]) + done + ax_cv_[]_AC_LANG_ABBREV[]_compiler_vendor=`echo
$vendor | cut -d: -f1` + ]) +]) diff --git a/storage/maria/libmarias3/m4/ax_compiler_version.m4 b/storage/maria/libmarias3/m4/ax_compiler_version.m4 new file mode 100644 index 00000000..8304d3db --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_compiler_version.m4 @@ -0,0 +1,100 @@ +# =========================================================================== +# https://github.com/BrianAker/ddm4/ +# =========================================================================== +# +# SYNOPSIS +# +# AX_COMPILER_VERSION() +# +# DESCRIPTION +# +# Capture version of C/C++ compiler +# +# LICENSE +# +# Copyright (C) 2012 Brian Aker +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# +# * The names of its contributors may not be used to endorse or +# promote products derived from this software without specific prior +# written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#serial 6 +# Stores the first line of the C compiler version output, with the command +# line chosen by ax_cv_c_compiler_vendor, and substitutes it as CC_VERSION +# together with the vendor as CC_VERSION_VENDOR. +AC_DEFUN([_C_COMPILER_VERSION], + [AC_MSG_CHECKING([C Compiler version]) + + AS_CASE(["$ax_cv_c_compiler_vendor"], + [sun],[ax_c_compiler_version=`$CC -V 2>&1 | sed 1q`], + [intel],[ax_c_compiler_version=`$CC --version 2>&1 | sed 1q`], + [clang],[ax_c_compiler_version=`$CC --version 2>&1 | sed 1q`], + [gnu],[ax_c_compiler_version=`$CC --version | sed 1q`], + [mingw],[ax_c_compiler_version=`$CC --version | sed 1q`], + [ax_c_compiler_version="unknown: $ax_cv_c_compiler_vendor"]) + + AC_MSG_RESULT(["$ax_c_compiler_version"]) + AC_SUBST([CC_VERSION_VENDOR],["$ax_cv_c_compiler_vendor"]) + AC_SUBST([CC_VERSION],["$ax_c_compiler_version"]) + ]) + +# Same as above for CXX, CXX_VERSION and CXX_VERSION_VENDOR. +# NOTE(review): the vendor switch reads ax_cv_c_compiler_vendor, not a C++ +# specific variable; confirm this is intended before changing it. +AC_DEFUN([_CXX_COMPILER_VERSION], + [AC_MSG_CHECKING([C++ Compiler version]) + + AS_CASE(["$ax_cv_c_compiler_vendor"], + [sun],[ax_cxx_compiler_version=`$CXX -V 2>&1 | sed 1q`], + [intel],[ax_cxx_compiler_version=`$CXX --version 2>&1 | sed 1q`], + [clang],[ax_cxx_compiler_version=`$CXX --version 2>&1 | sed 1q`], + [gnu],[ax_cxx_compiler_version=`$CXX --version | sed 1q`], + [mingw],[ax_cxx_compiler_version=`$CXX --version | sed 1q`], + [ax_cxx_compiler_version="unknown: $ax_cv_c_compiler_vendor"]) + + AC_MSG_RESULT(["$ax_cxx_compiler_version"]) + AC_SUBST([CXX_VERSION_VENDOR],["$ax_cv_c_compiler_vendor"]) + AC_SUBST([CXX_VERSION],["$ax_cxx_compiler_version"]) + ]) + +# Sets MINGW from a declaration check of __MINGW64__, runs both helpers +# above and, when GCC is yes, sets ac_c_gcc_recent to yes only for GCC 4.7 +# or newer. The minor version is compared only when the major version is +# exactly 4, so that releases such as 5.1 or 9.3 are not rejected for +# having a minor number below 7. +AC_DEFUN([AX_COMPILER_VERSION], + [AC_REQUIRE([AX_COMPILER_VENDOR]) + + AC_MSG_CHECKING([MINGW]) +
AC_CHECK_DECL([__MINGW64__], + [MINGW=yes + ax_c_compiler_version_vendor=mingw], + [MINGW=no]) + AC_MSG_RESULT([$MINGW]) + + AC_REQUIRE([_C_COMPILER_VERSION]) + AC_REQUIRE([_CXX_COMPILER_VERSION]) + AS_IF([test "x$GCC" = xyes], + [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ +#if !defined(__GNUC__) || (__GNUC__ < 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 7)) +# error GCC is Too Old! +#endif + ]])], + [ac_c_gcc_recent=yes], + [ac_c_gcc_recent=no]) + ]) + ]) diff --git a/storage/maria/libmarias3/m4/ax_count_cpus.m4 b/storage/maria/libmarias3/m4/ax_count_cpus.m4 new file mode 100644 index 00000000..d4f3d290 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_count_cpus.m4 @@ -0,0 +1,57 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_count_cpus.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_COUNT_CPUS +# +# DESCRIPTION +# +# Attempt to count the number of processors present on the machine. If the +# detection fails, then a value of 1 is assumed. +# +# The value is placed in the CPU_COUNT variable. +# +# LICENSE +# +# Copyright (c) 2012 Brian Aker <brian@tangent.org> +# Copyright (c) 2008 Michael Paul Bailey <jinxidoru@byu.net> +# Copyright (c) 2008 Christophe Tournayre <turn3r@users.sourceforge.net> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty.
+ +#serial 10 + +# CPU_COUNT starts at 0. On darwin hosts with an executable +# /usr/sbin/sysctl, hw.ncpu is read only when the sysctl listing contains at +# least one hw.cpu entry; the count of matching lines is compared against 0 +# because testing the bare word sysctl_a is always true. On linux hosts the +# processor lines of /proc/cpuinfo are counted when that file exists. A +# CPU_COUNT still equal to 0 afterwards is reported as undetectable and +# replaced by 1. + AC_DEFUN([AX_COUNT_CPUS],[ + AC_REQUIRE([AC_CANONICAL_HOST]) + AC_REQUIRE([AC_PROG_EGREP]) + AC_MSG_CHECKING([the number of available CPUs]) + CPU_COUNT="0" + + AS_CASE([$host_os],[ + *darwin*],[ + AS_IF([test -x /usr/sbin/sysctl],[ + sysctl_a=`/usr/sbin/sysctl -a 2>/dev/null| grep -c hw.cpu` + AS_IF([test "x$sysctl_a" != "x0"],[ + CPU_COUNT=`/usr/sbin/sysctl -n hw.ncpu` + ]) + ])],[ + *linux*],[ + AS_IF([test "x$CPU_COUNT" = "x0" && test -e /proc/cpuinfo],[ + CPU_COUNT=`$EGREP -c '^processor' /proc/cpuinfo` + ]) + ]) + + AS_IF([test "x$CPU_COUNT" = "x0"],[ + CPU_COUNT="1" + AC_MSG_RESULT( [unable to detect (assuming 1)] ) + ],[ + AC_MSG_RESULT( $CPU_COUNT ) + ]) + ]) diff --git a/storage/maria/libmarias3/m4/ax_create_generic_config.m4 b/storage/maria/libmarias3/m4/ax_create_generic_config.m4 new file mode 100644 index 00000000..535838f4 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_create_generic_config.m4 @@ -0,0 +1,195 @@ +# ============================================================================ +# http://www.gnu.org/software/autoconf-archive/ax_create_generic_config.html +# ============================================================================ +# +# SYNOPSIS +# +# AX_CREATE_GENERIC_CONFIG [(PACKAGEnlibs [, VERSION])] +# +# DESCRIPTION +# +# Creates a generic PACKAGE-config file that has all the things that you +# want, hmm, well, at least it has --cflags, --version, --libs. Ahhm, did +# you see ax_path_generic in the autoconf-archive? ;-) +# +# this macros saves you all the typing for a pkg-config.in script, you +# don't even need to distribute one along. Place this macro in your +# configure.ac, et voila, you got one that you want to install. +# +# oh, btw, if the first arg looks like "mylib -lwhat' then it will go to +# be added to the --libs, and mylib is extracted.
+# +# the defaults: $1 = $PACKAGE $LIBS $2 = $VERSION there is also an +# AC_SUBST(GENERIC_CONFIG) that will be set to the name of the file that +# we did output in this macro. Use as: +# +# install-exec-local: install-generic-config +# +# install-generic-config: +# $(mkinstalldirs) $(DESTDIR)$(bindir) +# $(INSTALL_SCRIPT) @GENERIC_CONFIG@ $(DESTDIR)$(bindir) +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de> +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. 
+ +#serial 6 + +# Writes the script named PACKAGE-config one line at a time with echo. The +# generated script first records the package name, version, libs and the +# configure directory variables, then maps each command-line option to text +# appended to the variable o, which is evaluated and printed at the end. +# The --pkglibinc and --pkglibadd options expand libdir, like --pkglibdir, +# because the generated script defines no libinc or libadd variable. +AU_ALIAS([AC_CREATE_GENERIC_CONFIG], [AX_CREATE_GENERIC_CONFIG]) +AC_DEFUN([AX_CREATE_GENERIC_CONFIG],[# create a generic PACKAGE-config file +L=`echo ifelse($1, , $PACKAGE $LIBS, $1)` +P=`echo $L | sed -e 's/ -.*//'` +P=`echo $P` +V=`echo ifelse($2, , $VERSION, $2)` +F=`echo $P-config` +L=`echo -l$L | sed -e 's/^-llib/-l/'` +AC_MSG_RESULT(creating $F - generic $V for $L) +test "x$prefix" = xNONE && prefix="$ac_default_prefix" +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' +echo '#! /bin/sh' >$F +echo ' ' >>$F +echo 'package="'$P'"' >>$F +echo 'version="'$V'"' >>$F +echo 'libs="'$L'"' >>$F +echo ' ' >>$F +# in the order of occurrence a standard automake Makefile +echo 'prefix="'$prefix'"' >>$F +echo 'exec_prefix="'$exec_prefix'"' >>$F +echo 'bindir="'$bindir'"' >>$F +echo 'sbindir="'$sbindir'"' >>$F +echo 'libexecdir="'$libexecdir'"' >>$F +echo 'datadir="'$datadir'"' >>$F +echo 'sysconfdir="'$sysconfdir'"' >>$F +echo 'sharedstatedir="'$sharedstatedir'"' >>$F +echo 'localstatedir="'$localstatedir'"' >>$F +echo 'libdir="'$libdir'"' >>$F +echo 'infodir="'$infodir'"' >>$F +echo 'mandir="'$mandir'"' >>$F +echo 'includedir="'$includedir'"' >>$F +echo 'target="'$target'"' >>$F +echo 'host="'$host'"' >>$F +echo 'build="'$build'"' >>$F +echo ' ' >>$F +echo 'if test "'"\$""#"'" -eq 0; then' >>$F +echo ' cat <<EOF' >>$F +echo 'Usage: $package-config [OPTIONS]' >>$F +echo 'Options:' >>$F +echo ' --prefix[=DIR]) : \$prefix' >>$F +echo ' --package) : \$package' >>$F +echo ' --version) : \$version' >>$F +echo ' --cflags) : -I\$includedir' >>$F +echo ' --libs) : -L\$libdir -l\$package' >>$F +echo ' --help) print all the options (not just these)' >>$F +echo 'EOF' >>$F +echo 'fi' >>$F +echo ' ' >>$F +echo 'o=""' >>$F +echo 'h=""' >>$F +echo 'for i in "[$]@"; do' >>$F +echo ' case $i in' >>$F +echo ' --prefix=*) prefix=`echo $i | sed -e "s/--prefix=//"` ;;' >>$F +echo ' --prefix) o="$o $prefix" ;;' >>$F +echo ' --package) o="$o $package" ;;' >>$F +echo '
--version) o="$o $version" ;;' >>$F +echo ' --cflags) if test "_$includedir" != "_/usr/include"' >>$F +echo ' then o="$o -I$includedir" ; fi' >>$F +echo ' ;;' >>$F +echo ' --libs) o="$o -L$libdir $libs" ;;' >>$F +echo ' --exec_prefix|--eprefix) o="$o $exec_prefix" ;;' >>$F +echo ' --bindir) o="$o $bindir" ;;' >>$F +echo ' --sbindir) o="$o $sbindir" ;;' >>$F +echo ' --libexecdir) o="$o $libexecdir" ;;' >>$F +echo ' --datadir) o="$o $datadir" ;;' >>$F +echo ' --datainc) o="$o -I$datadir" ;;' >>$F +echo ' --datalib) o="$o -L$datadir" ;;' >>$F +echo ' --sysconfdir) o="$o $sysconfdir" ;;' >>$F +echo ' --sharedstatedir) o="$o $sharedstatedir" ;;' >>$F +echo ' --localstatedir) o="$o $localstatedir" ;;' >>$F +echo ' --libdir) o="$o $libdir" ;;' >>$F +echo ' --libadd) o="$o -L$libdir" ;;' >>$F +echo ' --infodir) o="$o $infodir" ;;' >>$F +echo ' --mandir) o="$o $mandir" ;;' >>$F +echo ' --target) o="$o $target" ;;' >>$F +echo ' --host) o="$o $host" ;;' >>$F +echo ' --build) o="$o $build" ;;' >>$F +echo ' --data) o="$o -I$datadir/$package" ;;' >>$F +echo ' --pkgdatadir) o="$o $datadir/$package" ;;' >>$F +echo ' --pkgdatainc) o="$o -I$datadir/$package" ;;' >>$F +echo ' --pkgdatalib) o="$o -L$datadir/$package" ;;' >>$F +echo ' --pkglibdir) o="$o $libdir/$package" ;;' >>$F +echo ' --pkglibinc) o="$o -I$libdir/$package" ;;' >>$F +echo ' --pkglibadd) o="$o -L$libdir/$package" ;;' >>$F +echo ' --pkgincludedir) o="$o $includedir/$package" ;;' >>$F +echo ' --help) h="1" ;;' >>$F +echo ' -?//*|-?/*//*|-?./*//*|//*|/*//*|./*//*) ' >>$F +echo ' v=`echo $i | sed -e s://:\$:g`' >>$F +echo ' v=`eval "echo $v"` ' >>$F +echo ' o="$o $v" ;; ' >>$F +echo ' esac' >>$F +echo 'done' >>$F +echo ' ' >>$F +echo 'o=`eval "echo $o"`' >>$F +echo 'o=`eval "echo $o"`' >>$F +echo 'eval "echo $o"' >>$F +echo ' ' >>$F +echo 'if test !
-z "$h" ; then ' >>$F +echo 'cat <<EOF' >>$F +echo ' --prefix=xxx) (what is that for anyway?)' >>$F +echo ' --prefix) \$prefix $prefix' >>$F +echo ' --package) \$package $package' >>$F +echo ' --version) \$version $version' >>$F +echo ' --cflags) -I\$includedir unless it is /usr/include' >>$F +echo ' --libs) -L\$libdir -l\$PACKAGE \$LIBS' >>$F +echo ' --exec_prefix) or... ' >>$F +echo ' --eprefix) \$exec_prefix $exec_prefix' >>$F +echo ' --bindir) \$bindir $bindir' >>$F +echo ' --sbindir) \$sbindir $sbindir' >>$F +echo ' --libexecdir) \$libexecdir $libexecdir' >>$F +echo ' --datadir) \$datadir $datadir' >>$F +echo ' --sysconfdir) \$sysconfdir $sysconfdir' >>$F +echo ' --sharedstatedir) \$sharedstatedir$sharedstatedir' >>$F +echo ' --localstatedir) \$localstatedir $localstatedir' >>$F +echo ' --libdir) \$libdir $libdir' >>$F +echo ' --infodir) \$infodir $infodir' >>$F +echo ' --mandir) \$mandir $mandir' >>$F +echo ' --target) \$target $target' >>$F +echo ' --host) \$host $host' >>$F +echo ' --build) \$build $build' >>$F +echo ' --data) -I\$datadir/\$package' >>$F +echo ' --pkgdatadir) \$datadir/\$package' >>$F +echo ' --pkglibdir) \$libdir/\$package' >>$F +echo ' --pkgincludedir) \$includedir/\$package' >>$F +echo ' --help) generated by ax_create_generic_config.m4' >>$F +echo ' -I//varname and other inc-targets like --pkgdatainc supported' >>$F +echo ' -L//varname and other lib-targets, e.g.
--pkgdatalib or --libadd' >>$F +echo 'EOF' >>$F +echo 'fi' >>$F +GENERIC_CONFIG="$F" +AC_SUBST(GENERIC_CONFIG) +]) diff --git a/storage/maria/libmarias3/m4/ax_debug.m4 b/storage/maria/libmarias3/m4/ax_debug.m4 new file mode 100644 index 00000000..17152e79 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_debug.m4 @@ -0,0 +1,65 @@ +# =========================================================================== +# https://github.com/BrianAker/ddm4/ +# =========================================================================== +# +# SYNOPSIS +# +# AX_DEBUG() +# +# DESCRIPTION +# +# --enable-debug +# +# LICENSE +# +# Copyright (C) 2012-2014 Brian Aker +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# +# * The names of its contributors may not be used to endorse or +# promote products derived from this software without specific prior +# written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#serial 8
+
+# AX_DEBUG registers --enable-debug and stores the answer in the shell
+# variable ax_enable_debug ("no" when the option is not given).
+# When the answer is "yes" it defines DEBUG to 1 and hands the AM_YFLAGS and
+# AM_CPPFLAGS additions to AX_ADD_AM_MACRO; otherwise it substitutes MCHECK
+# and defines DEBUG to 0.  DEBUG is therefore defined in both cases, so
+# consumers have to test its value rather than whether it is defined.
+# The Automake conditional DEBUG is true only when the answer is "yes".
+AC_DEFUN([AX_DEBUG],
+  [AC_PREREQ([2.63])dnl
+  AC_ARG_ENABLE([debug],
+    [AS_HELP_STRING([--enable-debug],
+      [Add debug code/turns off optimizations (yes|no) @<:@default=no@:>@])],
+    [ax_enable_debug=$enableval],
+    [ax_enable_debug=no])
+
+  AS_IF([test "x$ax_enable_debug" = xyes],
+    [AC_DEFINE([DEBUG],[1],[Define to 1 to enable debugging code.])
+    AX_ADD_AM_MACRO([AM_YFLAGS += --debug])
+    AX_ADD_AM_MACRO([AM_CPPFLAGS += -D_GLIBCXX_DEBUG])],
+    [AC_SUBST([MCHECK])
+    AC_DEFINE([DEBUG],[0],[Define to 1 to enable debugging code.])])
+
+  AC_MSG_CHECKING([for debug])
+  AC_MSG_RESULT([$ax_enable_debug])
+  AM_CONDITIONAL([DEBUG],[test "x${ax_enable_debug}" = "xyes"])])
diff --git a/storage/maria/libmarias3/m4/ax_endian.m4 b/storage/maria/libmarias3/m4/ax_endian.m4
new file mode 100644
index 00000000..0dfbcd52
--- /dev/null
+++ b/storage/maria/libmarias3/m4/ax_endian.m4
@@ -0,0 +1,35 @@
+# https://github.com/BrianAker/ddm4
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_ENDIAN()
+#
+# DESCRIPTION
+#
+# Generate defines describing the endianness of the machine.
+#
+# LICENSE
+#
+# Copyright (c) 2012 Brian Aker <brian@tangent.org>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 1
+
+# AX_ENDIAN maps the result of the byte-order probe onto config.h defines.
+# The probe takes four actions in this order: big-endian, little-endian,
+# unknown, universal build.
+#   big-endian    : WORDS_BIGENDIAN=1, ENDIAN_BIG=1, ENDIAN_LITTLE=0
+#   little-endian : ENDIAN_BIG=0, ENDIAN_LITTLE=1
+#   unknown       : configure stops with an error
+#   universal     : configure stops with an error
+# The little-endian defines used to sit in the "unknown" slot, so an ordinary
+# little-endian host received no defines at all and an undetectable host was
+# silently treated as little-endian.
+# WORDS_BIGENDIAN is left undefined on little-endian hosts, which is both the
+# usual Autoconf convention and what this macro effectively produced there
+# before; defining it to 0 would make "ifdef WORDS_BIGENDIAN" tests true.
+AC_DEFUN([AX_ENDIAN],[
+    AC_C_BIGENDIAN([
+      AC_DEFINE([WORDS_BIGENDIAN],[1],[machine is big-endian])
+      AC_DEFINE([ENDIAN_BIG],[1],[machine is big-endian])
+      AC_DEFINE([ENDIAN_LITTLE],[0],[machine is little-endian])
+      ],[
+      AC_DEFINE([ENDIAN_BIG],[0],[machine is big-endian])
+      AC_DEFINE([ENDIAN_LITTLE],[1],[machine is little-endian])
+      ],[
+      AC_MSG_ERROR([unable to determine endian])
+      ],[
+      AC_MSG_ERROR([unable to determine endian])
+      ])
+    ])
diff --git a/storage/maria/libmarias3/m4/ax_file_escapes.m4 b/storage/maria/libmarias3/m4/ax_file_escapes.m4
new file mode 100644
index 00000000..f4c6a06a
--- /dev/null
+++ b/storage/maria/libmarias3/m4/ax_file_escapes.m4
@@ -0,0 +1,30 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_file_escapes.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_FILE_ESCAPES
+#
+# DESCRIPTION
+#
+# Sets the shell variables AX_DOLLAR, AX_SRB, AX_SLB, AX_BS and AX_DQ to
+# escaped forms of characters that are awkward to write to a file directly.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Tom Howard <tomhoward@users.sf.net>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 7
+
+# AX_FILE_ESCAPES assigns five shell variables, each holding an escaped
+# character: AX_DOLLAR (dollar sign), AX_SRB and AX_SLB (backslash followed
+# by octal 135 and 133, the ASCII codes of the right and left square
+# bracket), AX_BS (backslashes) and AX_DQ (double quote).
+# NOTE(review): presumably these are meant to be expanded inside a printf
+# format string, which is where the octal escapes would be interpreted;
+# confirm against the callers.
+AC_DEFUN([AX_FILE_ESCAPES],[
+AX_DOLLAR="\$"
+AX_SRB="\\135"
+AX_SLB="\\133"
+AX_BS="\\\\"
+AX_DQ="\""
+])
diff --git a/storage/maria/libmarias3/m4/ax_harden_compiler_flags.m4 b/storage/maria/libmarias3/m4/ax_harden_compiler_flags.m4
new file mode 100644
index 00000000..63783b4a
--- /dev/null
+++ b/storage/maria/libmarias3/m4/ax_harden_compiler_flags.m4
@@ -0,0 +1,309 @@
+# vim:expandtab:shiftwidth=2:tabstop=2:smarttab:
+# ===========================================================================
+# https://github.com/BrianAker/ddm4/
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_HARDEN_COMPILER_FLAGS() AX_HARDEN_LINKER_FLAGS()
+#
+# DESCRIPTION
+#
+# Any compiler flag that "hardens" or tests code. C99 is assumed.
+#
+# LICENSE
+#
+# Copyright (C) 2012-2014 Brian Aker
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#
+# * The names of its contributors may not be used to endorse or
+# promote products derived from this software without specific prior
+# written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Notes:
+# We do not test for c99 or c++11, that is out of scope.
+
+# The Following flags are not checked for
+# -Wdeclaration-after-statement is counter to C99
+# _APPEND_COMPILE_FLAGS_ERROR([-pedantic])
+
+#serial 19
+
+# _WARNINGS_AS_ERRORS caches ac_cv_warnings_as_errors: "yes" when
+# ac_cv_vcs_checkout is "yes", "no" otherwise.
+AC_DEFUN([_WARNINGS_AS_ERRORS],
+  [AC_CACHE_CHECK([if all warnings into errors],[ac_cv_warnings_as_errors],
+    [AS_IF([test "x$ac_cv_vcs_checkout" = xyes],[ac_cv_warnings_as_errors=yes],
+      [ac_cv_warnings_as_errors=no])
+    ])
+  ])
+
+# Note: Should this be LIBS or LDFLAGS?
+# _APPEND_LINK_FLAGS_ERROR(FLAGS) forwards FLAGS to AX_APPEND_LINK_FLAGS with
+# LDFLAGS as the target variable and -Werror as the extra probe flag.
+AC_DEFUN([_APPEND_LINK_FLAGS_ERROR],
+  [AX_APPEND_LINK_FLAGS([$1],[LDFLAGS],[-Werror])
+  ])
+
+# _APPEND_COMPILE_FLAGS_ERROR(FLAGS) forwards FLAGS to AX_APPEND_COMPILE_FLAGS
+# with an empty second argument and -Werror as the extra probe flag.
+AC_DEFUN([_APPEND_COMPILE_FLAGS_ERROR],
+  [AX_APPEND_COMPILE_FLAGS([$1],,[-Werror])
+  ])
+
+# Everything above this does the heavy lifting, while what follows does the specifics.
+
+# _HARDEN_LINKER_FLAGS
+# Does nothing when ax_cv_c_compiler_vendor is "clang".  Otherwise it probes
+# "-z relro -z now", additionally -rdynamic on a VCS checkout, and appends
+# -Werror when ac_cv_warnings_as_errors is "yes".
+# NOTE: the commented-out --coverage line also carries the closing
+# bracket/parenthesis pair of the inner AS_IF; keep that pair if the line is
+# ever removed.
+AC_DEFUN([_HARDEN_LINKER_FLAGS],
+  [AS_IF([test "$ax_cv_c_compiler_vendor" != "clang"],
+    [_APPEND_LINK_FLAGS_ERROR([-z relro -z now])
+    AS_IF([test "x$ac_cv_vcs_checkout" = xyes],
+      [_APPEND_LINK_FLAGS_ERROR([-rdynamic])
+#      AX_APPEND_LINK_FLAGS([--coverage])])
+    AS_IF([test "x$ac_cv_warnings_as_errors" = xyes],[AX_APPEND_LINK_FLAGS([-Werror])])
+    ])
+  ])
+
+# _HARDEN_CC_COMPILER_FLAGS
+# With the C language pushed, probes every flag below through
+# _APPEND_COMPILE_FLAGS_ERROR: debug or optimisation flags selected by
+# ax_enable_debug, pragma warnings selected by ac_cv_vcs_checkout, then the
+# warning and hardening list, and finally -Werror when
+# ac_cv_warnings_as_errors is "yes".
+# NOTE(review): the "ftrapv" entry has no leading dash, so it is not probed
+# as a compiler option.  It is left as found because a real -ftrapv would
+# follow the -fwrapv probed a few lines earlier; confirm which of the two is
+# wanted.
+AC_DEFUN([_HARDEN_CC_COMPILER_FLAGS],
+  [AC_LANG_PUSH([C])dnl
+
+  AS_IF([test "x$ax_enable_debug" = xyes],
+    [
+    #_APPEND_COMPILE_FLAGS_ERROR([-H])
+    _APPEND_COMPILE_FLAGS_ERROR([-g])
+    _APPEND_COMPILE_FLAGS_ERROR([-g3])
+    _APPEND_COMPILE_FLAGS_ERROR([-fno-eliminate-unused-debug-types])
+    _APPEND_COMPILE_FLAGS_ERROR([-fno-omit-frame-pointer])
+    _APPEND_COMPILE_FLAGS_ERROR([-O0])
+    ],[
+    _APPEND_COMPILE_FLAGS_ERROR([-g])
+    _APPEND_COMPILE_FLAGS_ERROR([-O3])
+    ])
+
+  AS_IF([test "x$ac_cv_vcs_checkout" = xyes],
+    [_APPEND_COMPILE_FLAGS_ERROR([-fstack-check])
+#    _APPEND_COMPILE_FLAGS_ERROR([--coverage])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wpragmas])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wunknown-pragmas])],
+    [_APPEND_COMPILE_FLAGS_ERROR([-Wno-unknown-pragmas])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-pragmas])])
+
+  AS_IF([test "$ax_cv_c_compiler_vendor" = "clang"],[_APPEND_COMPILE_FLAGS_ERROR([-Qunused-arguments])])
+
+  _APPEND_COMPILE_FLAGS_ERROR([-Wall])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wextra])
+  _APPEND_COMPILE_FLAGS_ERROR([-Weverything])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wthis-test-should-fail])
+# Anything below this comment please keep sorted.
+# _APPEND_COMPILE_FLAGS_ERROR([-Wmissing-format-attribute])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wno-attributes])
+  _APPEND_COMPILE_FLAGS_ERROR([-Waddress])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wvarargs])
+  _APPEND_COMPILE_FLAGS_ERROR([-Warray-bounds])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wbad-function-cast])
+# Not in use -Wc++-compat
+  _APPEND_COMPILE_FLAGS_ERROR([-Wchar-subscripts])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wcomment])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wfloat-equal])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wformat-security])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wformat=2])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wformat-y2k])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wmissing-field-initializers])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wdeclaration-after-statement])
+  AS_IF([test "x$MINGW" = xyes],
+    [_APPEND_COMPILE_FLAGS_ERROR([-Wno-missing-noreturn])],
+    [_APPEND_COMPILE_FLAGS_ERROR([-Wmissing-noreturn])])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wmissing-prototypes])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wnested-externs])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wpointer-arith])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wpointer-sign])
+  AS_IF([test "x$MINGW" = xyes],
+    [_APPEND_COMPILE_FLAGS_ERROR([-Wno-suggest-attribute=const])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-suggest-attribute=noreturn])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-suggest-attribute=pure])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-redundant-decls])],
+    [_APPEND_COMPILE_FLAGS_ERROR([-Wredundant-decls])])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wshadow])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wshorten-64-to-32])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wsign-compare])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wstrict-overflow=1])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wstrict-prototypes])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wswitch-enum])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wundef])
+
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused-result])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused-variable])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused-parameter])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused-local-typedefs])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wwrite-strings])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wno-deprecated-declarations])
+  _APPEND_COMPILE_FLAGS_ERROR([-fwrapv])
+  _APPEND_COMPILE_FLAGS_ERROR([-pipe])
+  AS_IF([test "x$MINGW" = xyes],
+    [],
+    [_APPEND_COMPILE_FLAGS_ERROR([-fPIE -pie])])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wsizeof-pointer-memaccess])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wpacked])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wlong-long])
+  _APPEND_COMPILE_FLAGS_ERROR([ftrapv])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wno-unused-command-line-argument])
+# GCC 4.5 removed this.
+# _APPEND_COMPILE_FLAGS_ERROR([-Wunreachable-code])
+
+  AS_IF([test "x$ax_enable_debug" = xno],
+    [AS_IF([test "x$ac_cv_vcs_checkout" = xyes],
+      [AS_IF([test "x${host_os}" != "xmingw"],
+        [AS_IF([test "x$ac_c_gcc_recent" = xyes],
+          [_APPEND_COMPILE_FLAGS_ERROR([-D_FORTIFY_SOURCE=2])
+          #_APPEND_COMPILE_FLAGS_ERROR([-Wstack-protector])
+          #_APPEND_COMPILE_FLAGS_ERROR([-fstack-protector --param=ssp-buffer-size=4])
+          _APPEND_COMPILE_FLAGS_ERROR([-fstack-protector-all])
+          ])])])])
+
+  AS_IF([test "x$ac_cv_warnings_as_errors" = xyes],
+    [AX_APPEND_COMPILE_FLAGS([-Werror])])
+
+  AC_LANG_POP([C])
+  ])
+
+# _HARDEN_CXX_COMPILER_FLAGS
+# C++ counterpart of _HARDEN_CC_COMPILER_FLAGS.  In the debug branch CXXFLAGS
+# is emptied first and every flag, including -O0, is probed through
+# _APPEND_COMPILE_FLAGS_ERROR.
+# NOTE(review): the MINGW branch probes both -Wno-missing-noreturn and
+# -Wmissing-noreturn; confirm whether the second one is intended.
+AC_DEFUN([_HARDEN_CXX_COMPILER_FLAGS],
+  [AC_LANG_PUSH([C++])
+  AS_IF([test "x$ax_enable_debug" = xyes],
+    [CXXFLAGS=''
+    #_APPEND_COMPILE_FLAGS_ERROR([-H])
+    _APPEND_COMPILE_FLAGS_ERROR([-g])
+    _APPEND_COMPILE_FLAGS_ERROR([-g3])
+    _APPEND_COMPILE_FLAGS_ERROR([-fno-inline])
+    _APPEND_COMPILE_FLAGS_ERROR([-fno-eliminate-unused-debug-types])
+    _APPEND_COMPILE_FLAGS_ERROR([-fno-omit-frame-pointer])
+    _APPEND_COMPILE_FLAGS_ERROR([-O0])
+    ],[
+    _APPEND_COMPILE_FLAGS_ERROR([-g])
+    _APPEND_COMPILE_FLAGS_ERROR([-O3])
+    ])
+
+  AS_IF([test "x$ac_cv_vcs_checkout" = xyes],
+    [_APPEND_COMPILE_FLAGS_ERROR([-fstack-check])
+#    _APPEND_COMPILE_FLAGS_ERROR([--coverage])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wpragmas])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wunknown-pragmas])],
+    [_APPEND_COMPILE_FLAGS_ERROR([-Wno-unknown-pragmas])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-pragmas])])
+
+  AS_IF([test "$ax_cv_c_compiler_vendor" = "clang"],[_APPEND_COMPILE_FLAGS_ERROR([-Qunused-arguments])])
+
+  _APPEND_COMPILE_FLAGS_ERROR([-Wall])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wextra])
+  _APPEND_COMPILE_FLAGS_ERROR([-Weverything])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wthis-test-should-fail])
+# Anything below this comment please keep sorted.
+# _APPEND_COMPILE_FLAGS_ERROR([-Wmissing-format-attribute])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wno-attributes])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wvarargs])
+  _APPEND_COMPILE_FLAGS_ERROR([-Waddress])
+  _APPEND_COMPILE_FLAGS_ERROR([-Warray-bounds])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wchar-subscripts])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wcomment])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wctor-dtor-privacy])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wfloat-equal])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wformat=2])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wformat-y2k])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wmaybe-uninitialized])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wmissing-field-initializers])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wlogical-op])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wnon-virtual-dtor])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wnormalized=id])
+  _APPEND_COMPILE_FLAGS_ERROR([-Woverloaded-virtual])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wpointer-arith])
+  AS_IF([test "x$MINGW" = xyes],
+    [_APPEND_COMPILE_FLAGS_ERROR([-Wno-suggest-attribute=const])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-missing-noreturn])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wmissing-noreturn])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-suggest-attribute=noreturn])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-error=suggest-attribute=noreturn])
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-redundant-decls])],
+    [_APPEND_COMPILE_FLAGS_ERROR([-Wredundant-decls])])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wshadow])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wshorten-64-to-32])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wsign-compare])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wstrict-overflow=1])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wswitch-enum])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wtrampolines])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wundef])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunsafe-loop-optimizations])
+  _APPEND_COMPILE_FLAGS_ERROR([-funsafe-loop-optimizations])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wc++11-compat])
+# _APPEND_COMPILE_FLAGS_ERROR([-Weffc++])
+# _APPEND_COMPILE_FLAGS_ERROR([-Wold-style-cast])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wclobbered])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused-result])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused-variable])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused-parameter])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wunused-local-typedefs])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wwrite-strings])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wformat-security])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wno-deprecated-declarations])
+  _APPEND_COMPILE_FLAGS_ERROR([-fwrapv])
+  _APPEND_COMPILE_FLAGS_ERROR([-pipe])
+  AS_IF([test "x$MINGW" = xyes],
+    [],
+    [_APPEND_COMPILE_FLAGS_ERROR([-fPIE -pie])])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wsizeof-pointer-memaccess])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wpacked])
+  _APPEND_COMPILE_FLAGS_ERROR([-Wlong-long])
+# GCC 4.5 removed this.
+# _APPEND_COMPILE_FLAGS_ERROR([-Wunreachable-code])
+# Disable c++11 long long warning for brew openssl on Mac
+  AS_IF([test "$ax_cv_c_compiler_vendor" = "clang"],[
+    _APPEND_COMPILE_FLAGS_ERROR([-Wno-c++11-long-long])])
+  AS_IF([test "x$ax_enable_debug" = xno],
+    [AS_IF([test "x$ac_cv_vcs_checkout" = xyes],
+      [AS_IF([test "x${host_os}" != "xmingw"],
+        [AS_IF([test "x$ac_c_gcc_recent" = xyes],
+          [_APPEND_COMPILE_FLAGS_ERROR([-D_FORTIFY_SOURCE=2])
+          #_APPEND_COMPILE_FLAGS_ERROR([-Wstack-protector])
+          #_APPEND_COMPILE_FLAGS_ERROR([-fstack-protector --param=ssp-buffer-size=4])
+          _APPEND_COMPILE_FLAGS_ERROR([-fstack-protector-all])
+          ])])])])
+
+  AS_IF([test "x$ac_cv_warnings_as_errors" = xyes],
+    [AX_APPEND_COMPILE_FLAGS([-Werror])])
+  AC_LANG_POP([C++])
+  ])
+
+# All of the heavy lifting happens in _HARDEN_LINKER_FLAGS,
+# _HARDEN_CC_COMPILER_FLAGS, _HARDEN_CXX_COMPILER_FLAGS
+# AX_HARDEN_COMPILER_FLAGS is the public entry point: it requires the host,
+# compiler-version, debug, assert and visibility macros, appends
+# CFLAG_VISIBILITY to CPPFLAGS when it is non-empty, then runs the linker and
+# C flag macros.
+# NOTE(review): _HARDEN_CXX_COMPILER_FLAGS is defined above but not invoked
+# here; confirm that C-only hardening is intended.
+  AC_DEFUN([AX_HARDEN_COMPILER_FLAGS],
+    [AC_PREREQ([2.63])dnl
+    AC_REQUIRE([AC_CANONICAL_HOST])
+    AC_REQUIRE([AX_COMPILER_VERSION])
+    AC_REQUIRE([AX_DEBUG])
+    AC_REQUIRE([AX_ASSERT])
+    _WARNINGS_AS_ERRORS
+
+    AC_REQUIRE([gl_VISIBILITY])
+    AS_IF([test -n "$CFLAG_VISIBILITY"],[CPPFLAGS="$CPPFLAGS $CFLAG_VISIBILITY"])
+
+    _HARDEN_LINKER_FLAGS
+    _HARDEN_CC_COMPILER_FLAGS
+    ])
+
diff --git a/storage/maria/libmarias3/m4/ax_hex_version.m4 b/storage/maria/libmarias3/m4/ax_hex_version.m4
new file mode 100644
index 00000000..48d3a737
--- /dev/null
+++ b/storage/maria/libmarias3/m4/ax_hex_version.m4
@@ -0,0 +1,33 @@
+# ===========================================================================
+# https://github.com/BrianAker/ddm4
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_HEX_VERSION(VARIABLE_NAME, VERSION)
+#
+# DESCRIPTION
+#
+# Generate version information in HEX and STRING format.
+#
+# LICENSE
+#
+# Copyright (c) 2012 Brian Aker <brian@tangent.org>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 3
+
+# AX_HEX_VERSION(NAME, VERSION) pipes VERSION through sed and then awk (split
+# on ".") to build two shell variables, string_version_NAME ("a.b.c") and
+# hex_version_NAME ("0x" followed by the three fields zero-padded to 2, 3 and
+# 3 decimal digits), and substitutes them as NAME_VERSION_STRING and
+# NAME_VERSION_HEX.  The empty quote pair after each dollar sign in the awk
+# program keeps m4 from reading the awk field references as macro arguments.
+# NOTE(review): the sed bracket expression is written with a single level of
+# m4 quotes, so the brackets are presumably stripped when the macro expands
+# and the expression that reaches sed differs from what is written here;
+# verify against a generated configure before relying on suffix stripping.
+AC_DEFUN([AX_HEX_VERSION],
+  [AC_PREREQ([2.63])dnl
+
+  string_version_$1=`echo $2 | sed 's|[\-a-z0-9]*$||' | awk -F. '{printf "%d.%d.%d", $[]1, $[]2, $[]3}'`
+  hex_version_$1=`echo $2 | sed 's|[\-a-z0-9]*$||' | awk -F. '{printf "0x%0.2d%0.3d%0.3d", $[]1, $[]2, $[]3}'`
+
+  AC_SUBST([$1_VERSION_STRING],["$string_version_$1"])
+  AC_SUBST([$1_VERSION_HEX],["$hex_version_$1"])
+  ])
+
diff --git a/storage/maria/libmarias3/m4/ax_platform.m4 b/storage/maria/libmarias3/m4/ax_platform.m4
new file mode 100644
index 00000000..ba46a540
--- /dev/null
+++ b/storage/maria/libmarias3/m4/ax_platform.m4
@@ -0,0 +1,56 @@
+# ===========================================================================
+# http://
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PLATFORM
+#
+# DESCRIPTION
+#
+# Provide target and host defines.
+#
+# LICENSE
+#
+# Copyright (c) 2012-2013 Brian Aker <brian@tangent.org>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 4
+#
+# AX_PLATFORM publishes the canonical host triplet parts as the string
+# defines HOST_VENDOR, HOST_OS and HOST_CPU, then matches host_os against
+# mingw, freebsd, solaris, darwin and linux patterns to add per-OS defines
+# and to set the shell flags HOST_WINDOWS, HOST_FREEBSD, HOST_OSX and
+# HOST_LINUX.  Those flags drive the Automake conditionals BUILD_WIN32,
+# HOST_FREEBSD, HOST_OSX and HOST_LINUX.
+# The FreeBSD branch sets HOST_FREEBSD explicitly: the conditional used to
+# test a shell variable that was never assigned, so it could not be true.
+  AC_DEFUN([AX_PLATFORM],
+    [AC_REQUIRE([AC_CANONICAL_HOST])
+
+    AC_DEFINE_UNQUOTED([HOST_VENDOR],["$host_vendor"],[Vendor of Build System])
+    AC_DEFINE_UNQUOTED([HOST_OS],["$host_os"], [OS of Build System])
+    AC_DEFINE_UNQUOTED([HOST_CPU],["$host_cpu"], [CPU of Build System])
+
+    AS_CASE([$host_os],
+      [*mingw*],
+      [HOST_WINDOWS="true"
+      AC_DEFINE([HOST_OS_WINDOWS], [1], [Whether we are building for Windows])
+      AC_DEFINE([EAI_SYSTEM], [11], [Another magical number])
+      AH_BOTTOM([
+#ifndef HAVE_SYS_SOCKET_H
+# define SHUT_RD SD_RECEIVE
+# define SHUT_WR SD_SEND
+# define SHUT_RDWR SD_BOTH
+#endif
+      ])],
+      [*freebsd*],
+      [HOST_FREEBSD="true"
+      AC_DEFINE([HOST_OS_FREEBSD],[1],[Whether we are building for FreeBSD])
+      AC_DEFINE([__APPLE_CC__],[1],[Workaround for bug in FreeBSD headers])],
+      [*solaris*],[AC_DEFINE([HOST_OS_SOLARIS],[1],[Whether we are building for Solaris])],
+      [*darwin*],
+      [HOST_OSX="true"],
+      [*linux*],
+      [HOST_LINUX="true"
+      AC_DEFINE([HOST_OS_LINUX],[1],[Whether we build for Linux])])
+
+    AM_CONDITIONAL([BUILD_WIN32],[test "x${HOST_WINDOWS}" = "xtrue"])
+    AM_CONDITIONAL([HOST_OSX],[test "x${HOST_OSX}" = "xtrue"])
+    AM_CONDITIONAL([HOST_LINUX],[test "x${HOST_LINUX}" = "xtrue"])
+    AM_CONDITIONAL([HOST_FREEBSD],[test "x${HOST_FREEBSD}" = "xtrue"])
+    ])
diff --git a/storage/maria/libmarias3/m4/ax_print_to_file.m4 b/storage/maria/libmarias3/m4/ax_print_to_file.m4
new file mode 100644
index 00000000..5b9d1c39
--- /dev/null
+++ b/storage/maria/libmarias3/m4/ax_print_to_file.m4
@@ -0,0 +1,27 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_print_to_file.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PRINT_TO_FILE([FILE],[DATA])
+#
+# DESCRIPTION
+#
+# Writes the specified data to the specified file.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Tom Howard <tomhoward@users.sf.net>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 7
+
+# AX_PRINT_TO_FILE(FILE, DATA) runs printf with DATA as the format string and
+# redirects the output to FILE, replacing its contents.
+AC_DEFUN([AX_PRINT_TO_FILE],[
+AC_REQUIRE([AX_FILE_ESCAPES])
+printf "$2" > "$1"
+])
diff --git a/storage/maria/libmarias3/m4/ax_prog_sphinx_build.m4 b/storage/maria/libmarias3/m4/ax_prog_sphinx_build.m4
new file mode 100644
index 00000000..653c7bce
--- /dev/null
+++ b/storage/maria/libmarias3/m4/ax_prog_sphinx_build.m4
@@ -0,0 +1,48 @@
+# ===========================================================================
+# https://github.com/BrianAker/ddm4
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_SPHINX_BUILD([ACTION-IF-FOUND], [ACTION-IF-NOT_FOUND])
+#
+# DESCRIPTION
+#
+# Look for sphinx-build and make sure it is a recent version of it.
+#
+# LICENSE
+#
+# Copyright (c) 2012-2013 Brian Aker <brian@tangent.org>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 5
+
+# AX_PROG_SPHINX_BUILD locates sphinx-build through AX_WITH_PROG, which
+# leaves SPHINXBUILD set to ":" when nothing is found.  That placeholder is
+# turned into an empty value so that ACTION-IF-NOT-FOUND runs.
+# For an executable result the version banner is captured in version_file.
+# If "--version" fails, the program is run without arguments and then has to
+# complete a trial man-page build, otherwise SPHINXBUILD is cleared.
+# Output is captured with "> file 2>&1", which every POSIX shell accepts.
+AC_DEFUN([AX_PROG_SPHINX_BUILD],
+  [AX_WITH_PROG([SPHINXBUILD],[sphinx-build],[:])
+  AS_IF([test x"$SPHINXBUILD" = x":"],
+    [SPHINXBUILD=],
+    [AS_IF([test -x "$SPHINXBUILD"],
+      [AC_MSG_CHECKING([Checking to see if $SPHINXBUILD is recent])
+      junk=`$SPHINXBUILD --version > version_file 2>&1`
+      AS_IF([test $? -eq 0],
+        [ax_sphinx_build_version=`head -n 1 version_file`
+        AC_MSG_RESULT([$SPHINXBUILD is version "$ax_sphinx_build_version"])],
+        [junk=`$SPHINXBUILD > version_file 2>&1`
+        ax_sphinx_build_version=`head -n 1 version_file`
+        rm version_file
+        AC_MSG_RESULT([$SPHINXBUILD is version "$ax_sphinx_build_version"])
+        $SPHINXBUILD -Q -C -b man -d conftest.d . . >/dev/null 2>&1
+        AS_IF([test $? -eq 0], ,[SPHINXBUILD=])
+        rm -rf conftest.d ])
+      ])
+    rm -f version_file
+    ])
+
+  AS_IF([test -n "${SPHINXBUILD}"],
+    [AC_SUBST([SPHINXBUILD])
+    ifelse([$1], , :, [$1])],
+    [ifelse([$2], , :, [$2])])
+  ])
diff --git a/storage/maria/libmarias3/m4/ax_prog_valgrind.m4 b/storage/maria/libmarias3/m4/ax_prog_valgrind.m4
new file mode 100644
index 00000000..af12dc36
--- /dev/null
+++ b/storage/maria/libmarias3/m4/ax_prog_valgrind.m4
@@ -0,0 +1,44 @@
+# ===========================================================================
+# https://github.com/BrianAker/ddm4
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_PROG_VALGRIND([tool],[options],[ACTION-IF-FOUND], [ACTION-IF-NOT_FOUND])
+#
+# DESCRIPTION
+#
+# Look for valgrind and make sure it is a recent version of it.
+#
+# LICENSE
+#
+# Copyright (c) 2012-2013 Brian Aker <brian@tangent.org>
+#
+# Copying and distribution of this file, with or without modification, are
+# permitted in any medium without royalty provided the copyright notice
+# and this notice are preserved. This file is offered as-is, without any
+# warranty.
+
+#serial 1
+
+# AX_PROG_VALGRIND(TOOL, OPTIONS, ACTION-IF-FOUND, ACTION-IF-NOT-FOUND)
+# locates valgrind through AX_WITH_PROG, which leaves VALGRIND set to ":"
+# when nothing is found.  That placeholder is turned into an empty value so
+# that ACTION-IF-NOT-FOUND runs and no test wrapper is installed.
+# When valgrind is available, its version banner is reported and
+# TESTS_ENVIRONMENT is set to run it under libtool --mode=execute with the
+# requested TOOL and OPTIONS.
+# Output is captured with "> file 2>&1", which every POSIX shell accepts.
+AC_DEFUN([AX_PROG_VALGRIND],
+  [AX_WITH_PROG([VALGRIND],[valgrind],[:])
+  AS_IF([test x"$VALGRIND" = x":"],
+    [VALGRIND=],
+    [AS_IF([test -x "$VALGRIND"],
+      [AC_MSG_CHECKING([Checking to see if $VALGRIND is recent])
+      junk=`$VALGRIND --version > version_file 2>&1`
+      ax_valgrind_version=`head -n 1 version_file`
+      rm version_file
+      AC_MSG_RESULT([$VALGRIND is version "$ax_valgrind_version"])
+      ])
+    ])
+
+  AS_IF([test -n "${VALGRIND}"],
+    [AC_SUBST([VALGRIND])
+    LIBTOOL_COMMAND="\${LIBTOOL} --mode=execute"
+    AC_SUBST([LIBTOOL_COMMAND])
+    AX_ADD_AM_MACRO([[TESTS_ENVIRONMENT=\"\${LIBTOOL_COMMAND} \${VALGRIND} --tool=$1 $2\"]])
+    ifelse([$3], , :, [$3])],
+    [ifelse([$4], , :, [$4])])
+  ])
diff --git a/storage/maria/libmarias3/m4/ax_vcs_checkout.m4 b/storage/maria/libmarias3/m4/ax_vcs_checkout.m4
new file mode 100644
index 00000000..8047b65e
--- /dev/null
+++ b/storage/maria/libmarias3/m4/ax_vcs_checkout.m4
@@ -0,0 +1,75 @@
+# ===========================================================================
+# http://
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_VCS_CHECKOUT
+#
+# DESCRIPTION
+#
+# Discover whether or not we are operating with a tree which
+# has been checked out of a version control system.
+#
+#
+# LICENSE
+#
+# Copyright (C) 2012 Brian Aker
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#
+# * The names of its contributors may not be used to endorse or
+# promote products derived from this software without specific prior
+# written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#serial 6
+
+# AX_VCS_SYSTEM caches ac_cv_vcs_system.  It starts as "none" and is
+# overwritten for each of the directories .bzr, .svn, .hg and .git that
+# exists under the current directory, tested in that order, so the last
+# match wins.  The result is published as the string define VCS_SYSTEM.
+# Only directories match; a .git that is a plain file is not detected.
+AC_DEFUN([AX_VCS_SYSTEM],
+  [AC_PREREQ([2.63])dnl
+  AC_CACHE_CHECK([for vcs system], [ac_cv_vcs_system],
+    [ac_cv_vcs_system="none"
+    AS_IF([test -d ".bzr"],[ac_cv_vcs_system="bazaar"])
+    AS_IF([test -d ".svn"],[ac_cv_vcs_system="svn"])
+    AS_IF([test -d ".hg"],[ac_cv_vcs_system="mercurial"])
+    AS_IF([test -d ".git"],[ac_cv_vcs_system="git"])
+    ])
+  AC_DEFINE_UNQUOTED([VCS_SYSTEM],["$ac_cv_vcs_system"],[VCS system])
+  ])
+
+# AX_VCS_CHECKOUT caches ac_cv_vcs_checkout: "yes" when ac_cv_vcs_system is
+# anything other than "none", "no" otherwise.  It sets the Automake
+# conditional IS_VCS_CHECKOUT and defines VCS_CHECKOUT to 1 or 0, so the
+# define exists in both cases.
+AC_DEFUN([AX_VCS_CHECKOUT],
+  [AC_PREREQ([2.63])dnl
+  AC_REQUIRE([AX_VCS_SYSTEM])
+  AC_CACHE_CHECK([for vcs checkout],[ac_cv_vcs_checkout],
+    [AS_IF([test "x$ac_cv_vcs_system" != "xnone"],
+      [ac_cv_vcs_checkout=yes],
+      [ac_cv_vcs_checkout=no])
+    ])
+
+  AM_CONDITIONAL([IS_VCS_CHECKOUT],[test "x$ac_cv_vcs_checkout" = "xyes"])
+  AS_IF([test "x$ac_cv_vcs_checkout" = "xyes"],
+    [AC_DEFINE([VCS_CHECKOUT],[1],[Define if the code was built from VCS.])],
+    [AC_DEFINE([VCS_CHECKOUT],[0],[Define if the code was built from VCS.])])
+  ])
diff --git a/storage/maria/libmarias3/m4/ax_with_prog.m4
b/storage/maria/libmarias3/m4/ax_with_prog.m4 new file mode 100644 index 00000000..f337c059 --- /dev/null +++ b/storage/maria/libmarias3/m4/ax_with_prog.m4 @@ -0,0 +1,70 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_with_prog.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_WITH_PROG([VARIABLE],[program],[VALUE-IF-NOT-FOUND],[PATH]) +# +# DESCRIPTION +# +# Locates an installed program binary, placing the result in the precious +# variable VARIABLE. Accepts a present VARIABLE, then --with-program, and +# failing that searches for program in the given path (which defaults to +# the system path). If program is found, VARIABLE is set to the full path +# of the binary; if it is not found VARIABLE is set to VALUE-IF-NOT-FOUND +# if provided, unchanged otherwise. +# +# A typical example could be the following one: +# +# AX_WITH_PROG(PERL,perl) +# +# NOTE: This macro is based upon the original AX_WITH_PYTHON macro from +# Dustin J. Mitchell <dustin@cs.uchicago.edu>. +# +# LICENSE +# +# Copyright (c) 2008 Francesco Salvestrini <salvestrini@users.sourceforge.net> +# Copyright (c) 2008 Dustin J. Mitchell <dustin@cs.uchicago.edu> +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+
+#serial 16
+
+# AX_WITH_PROG(VARIABLE, program, VALUE-IF-NOT-FOUND, PATH)
+# The four arguments are bound to temporary macro names with pushdef and
+# released with popdef at the end, in reverse order.
+# Nothing is probed when VARIABLE is already non-empty.  Otherwise a
+# --with-program=PATH value other than "yes" or "no" is taken as is; "yes"
+# or an absent option triggers a path search that falls back to
+# VALUE-IF-NOT-FOUND; "no" leaves VARIABLE empty.
+AC_DEFUN([AX_WITH_PROG],[
+    AC_PREREQ([2.61])
+
+    pushdef([VARIABLE],$1)
+    pushdef([EXECUTABLE],$2)
+    pushdef([VALUE_IF_NOT_FOUND],$3)
+    pushdef([PATH_PROG],$4)
+
+    AC_ARG_VAR(VARIABLE,Absolute path to EXECUTABLE executable)
+
+    AS_IF(test -z "$VARIABLE",[
+        AC_MSG_CHECKING(whether EXECUTABLE executable path has been provided)
+        AC_ARG_WITH(EXECUTABLE,AS_HELP_STRING([--with-EXECUTABLE=[[[PATH]]]],absolute path to EXECUTABLE executable), [
+            AS_IF([test "$withval" != yes && test "$withval" != no],[
+                VARIABLE="$withval"
+                AC_MSG_RESULT($VARIABLE)
+            ],[
+                VARIABLE=""
+                AC_MSG_RESULT([no])
+                AS_IF([test "$withval" != no], [
+                    AC_PATH_PROG([]VARIABLE[],[]EXECUTABLE[],[]VALUE_IF_NOT_FOUND[],[]PATH_PROG[])
+                ])
+            ])
+        ],[
+            AC_MSG_RESULT([no])
+            AC_PATH_PROG([]VARIABLE[],[]EXECUTABLE[],[]VALUE_IF_NOT_FOUND[],[]PATH_PROG[])
+        ])
+    ])
+
+    popdef([PATH_PROG])
+    popdef([VALUE_IF_NOT_FOUND])
+    popdef([EXECUTABLE])
+    popdef([VARIABLE])
+])
diff --git a/storage/maria/libmarias3/m4/config_extra.m4 b/storage/maria/libmarias3/m4/config_extra.m4
new file mode 100644
index 00000000..6ac9773c
--- /dev/null
+++ b/storage/maria/libmarias3/m4/config_extra.m4
@@ -0,0 +1,61 @@
+# SYNOPSIS
+#
+# CONFIG_EXTRA
+#
+# DESCRIPTION
+#
+# Adds required extras to config.h
+#
+# LICENSE
+#
+# Copyright (C) 2014 Andrew Hutchings
+# Based on bottom.m4 from libdrizzle
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# +# * The names of its contributors may not be used to endorse or +# promote products derived from this software without specific prior +# written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#serial 1 + +AC_DEFUN([CONFIG_EXTRA], [ + +AH_TOP([ +#pragma once + +/* Define to make inttypes.h work on some platforms */ +#define __STDC_FORMAT_MACROS 1 +#define __STDC_LIMIT_MACROS 1 + +/* _SYS_FEATURE_TESTS_H is Solaris, _FEATURES_H is GCC */ +#if defined( _SYS_FEATURE_TESTS_H) || defined(_FEATURES_H) +# error "You should include config.h as your first include file" +#endif + +]) +])dnl CONFIG_EXTRA diff --git a/storage/maria/libmarias3/m4/include.am b/storage/maria/libmarias3/m4/include.am new file mode 100644 index 00000000..f64a1cc4 --- /dev/null +++ b/storage/maria/libmarias3/m4/include.am @@ -0,0 +1,10 @@ +# vim:ft=automake +# Copyright (C) 2012 Data Differential +# All rights reserved. +# +# Use and distribution licensed under the BSD license. See +# the COPYING file in the parent directory for full text. 
+# +# included from Top Level Makefile.am +# All paths should be given relative to the root + diff --git a/storage/maria/libmarias3/m4/visibility.m4 b/storage/maria/libmarias3/m4/visibility.m4 new file mode 100644 index 00000000..75c34b6e --- /dev/null +++ b/storage/maria/libmarias3/m4/visibility.m4 @@ -0,0 +1,77 @@ +# visibility.m4 serial 4 (gettext-0.18.2) +dnl Copyright (C) 2005, 2008, 2010-2011 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl From Bruno Haible. + +dnl Tests whether the compiler supports the command-line option +dnl -fvisibility=hidden and the function and variable attributes +dnl __attribute__((__visibility__("hidden"))) and +dnl __attribute__((__visibility__("default"))). +dnl Does *not* test for __visibility__("protected") - which has tricky +dnl semantics (see the 'vismain' test in glibc) and does not exist e.g. on +dnl MacOS X. +dnl Does *not* test for __visibility__("internal") - which has processor +dnl dependent semantics. +dnl Does *not* test for #pragma GCC visibility push(hidden) - which is +dnl "really only recommended for legacy code". +dnl Set the variable CFLAG_VISIBILITY. +dnl Defines and sets the variable HAVE_VISIBILITY. + +AC_DEFUN([gl_VISIBILITY], +[ + AC_REQUIRE([AC_PROG_CC]) + CFLAG_VISIBILITY= + HAVE_VISIBILITY=0 + if test -n "$GCC"; then + dnl First, check whether -Werror can be added to the command line, or + dnl whether it leads to an error because of some other option that the + dnl user has put into $CC $CFLAGS $CPPFLAGS. 
+ AC_MSG_CHECKING([whether the -Werror option is usable]) + AC_CACHE_VAL([gl_cv_cc_vis_werror], [ + gl_save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS -Werror" + AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM([[]], [[]])], + [gl_cv_cc_vis_werror=yes], + [gl_cv_cc_vis_werror=no]) + CFLAGS="$gl_save_CFLAGS"]) + AC_MSG_RESULT([$gl_cv_cc_vis_werror]) + dnl Now check whether visibility declarations are supported. + AC_MSG_CHECKING([for simple visibility declarations]) + AC_CACHE_VAL([gl_cv_cc_visibility], [ + gl_save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS -fvisibility=hidden" + dnl We use the option -Werror and a function dummyfunc, because on some + dnl platforms (Cygwin 1.7) the use of -fvisibility triggers a warning + dnl "visibility attribute not supported in this configuration; ignored" + dnl at the first function definition in every compilation unit, and we + dnl don't want to use the option in this case. + if test $gl_cv_cc_vis_werror = yes; then + CFLAGS="$CFLAGS -Werror" + fi + AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM( + [[extern __attribute__((__visibility__("hidden"))) int hiddenvar; + extern __attribute__((__visibility__("default"))) int exportedvar; + extern __attribute__((__visibility__("hidden"))) int hiddenfunc (void); + extern __attribute__((__visibility__("default"))) int exportedfunc (void); + void dummyfunc (void) {} + ]], + [[]])], + [gl_cv_cc_visibility=yes], + [gl_cv_cc_visibility=no]) + CFLAGS="$gl_save_CFLAGS"]) + AC_MSG_RESULT([$gl_cv_cc_visibility]) + if test $gl_cv_cc_visibility = yes; then + CFLAG_VISIBILITY="-fvisibility=hidden" + HAVE_VISIBILITY=1 + fi + fi + AC_SUBST([CFLAG_VISIBILITY]) + AC_SUBST([HAVE_VISIBILITY]) + AC_DEFINE_UNQUOTED([HAVE_VISIBILITY], [$HAVE_VISIBILITY], + [Define to 1 or 0, depending whether the compiler supports simple visibility declarations.]) +]) diff --git a/storage/maria/libmarias3/rpm/include.mk b/storage/maria/libmarias3/rpm/include.mk new file mode 100644 index 00000000..0c424813 --- /dev/null +++ 
b/storage/maria/libmarias3/rpm/include.mk @@ -0,0 +1,54 @@ +# vim:ft=automake + +RPM_BUILDDIR= ~/rpmbuild +RPM_SOURCESDIR= $(RPM_BUILDDIR)/SOURCES + +RPM_BUILD_TARGET= @PACKAGE@-@VERSION@-@RPM_RELEASE@.@build_cpu@.rpm +RPM_SOURCE= $(RPM_SOURCESDIR)/$(DIST_ARCHIVES) + +RPMS= +RPMS+= $(RPM_BUILD_TARGET) +RPMS+= @PACKAGE@-devel-@VERSION@-@RPM_RELEASE@.@build_cpu@.rpm +RPMS+= @PACKAGE@-debuginfo-@VERSION@-@RPM_RELEASE@.@build_cpu@.rpm + +SRPMS= @PACKAGE@-@VERSION@-@RPM_RELEASE@.src.rpm + +RPM_DIST= $(RPMS) $(SRPMS) + +BUILD_RPMS= $(foreach rpm_iterator,$(RPMS),$(addprefix $(RPM_BUILDDIR)/RPMS/@build_cpu@/, $(rpm_iterator))) +BUILD_SRPMS= $(foreach srpm_iterator,$(SRPMS),$(addprefix $(RPM_BUILDDIR)/SRPMS/, $(srpm_iterator))) +BUILD_RPM_DIR= $(RPM_BUILDDIR)/BUILD/@PACKAGE@-@VERSION@ + +$(RPM_BUILDDIR): + @@RPMDEV_SETUPTREE@ + +$(DIST_ARCHIVES): $(DISTFILES) + $(MAKE) $(AM_MAKEFLAGS) dist-gzip + +$(RPM_SOURCE): $(DIST_ARCHIVES) $(RPM_BUILDDIR) + @cp $< $@ + +$(RPM_BUILD_TARGET): rpm/@PACKAGE@.spec $(RPM_SOURCE) + -@rm -f $(BUILD_RPMS) $(BUILD_SRPMS) + -@rm -rf $(BUILD_RPM_DIR) + @@RPMBUILD@ -ba $< + @cp $(BUILD_RPMS) $(BUILD_SRPMS) . 
+ +.PHONY: rpm-sign +rpm-sign: $(RPM_BUILD_TARGET) + @@RPM@ --addsign $(RPM_DIST) + @@RPM@ --checksig $(RPM_DIST) + +.PHONY: clean-rpm +clean-rpm: + -@rm -f $(BUILD_RPMS) $(BUILD_SRPMS) $(BUILD_RPM_SOURCE) $(RPM_DIST) $(BUILD_DAEMON_INIT) + -@rm -rf $(BUILD_RPM_DIR) + +dist-rpm: $(RPM_BUILD_TARGET) + +.PHONY: release +release: rpm rpm-sign + +.PHONY: auto-rpmbuild +auto-rpmbuild: rpm/@PACKAGE@.spec + @auto-br-rpmbuild -ba $< diff --git a/storage/maria/libmarias3/rpm/libmarias3.spec.in b/storage/maria/libmarias3/rpm/libmarias3.spec.in new file mode 100644 index 00000000..464dd929 --- /dev/null +++ b/storage/maria/libmarias3/rpm/libmarias3.spec.in @@ -0,0 +1,67 @@ +Summary: libMariaS3 +Name: @PACKAGE@ +Version: @VERSION@ +Release: 1 +License: LGPL v2.1 +Group: System Environment/Libraries +BuildRequires: libcurl +URL: https://github.com/mariadb-corporation/libmarias3 + +Packager: Andrew Hutchings <linuxjedi@mariadb.com> + +Source: https://github.com/mariadb-corporation/libmarias3/releases/tag/%{name}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot + +%description +libMariaS3 is a lightweight C library to read/write to AWS S3 buckets using objects in memory. + +This package provides the library. + +%package devel +Summary: Header files and development libraries for %{name} +Group: Development/Libraries +Requires: %{name} = %{version}-%{release} + +%description devel +This package contains the header files +for %{name}. If you like to develop programs using %{name}, +you will need to install %{name}-devel. 
+ +%prep +%setup -q + +%configure + + +%build +%{__make} %{?_smp_mflags} + +%install +%{__rm} -rf %{buildroot} +%{__make} install DESTDIR="%{buildroot}" AM_INSTALL_PROGRAM_FLAGS="" +mkdir -p $RPM_BUILD_ROOT/ + +%check +%{__make} check + +%clean +%{__rm} -rf %{buildroot} + +%files +%defattr(-,root,root,-) +%doc LICENSE README.rst +%{_libdir}/libmarias3.a +%{_libdir}/libmarias3.la +%{_libdir}/libmarias3.so +%{_libdir}/libmarias3.so.* +%{_bindir}/libmarias3-config + +%files devel +%defattr(-,root,root,-) +%doc LICENSE README.rst +%{_includedir}/libmarias3/*.h +%{_libdir}/pkgconfig/libmarias3.pc + +%changelog +* Mon Mar 25 2019 Andrew Hutchings <linuxjedi@mariadb.com> +- Initial package diff --git a/storage/maria/libmarias3/src/assume_role.c b/storage/maria/libmarias3/src/assume_role.c new file mode 100644 index 00000000..255b1eca --- /dev/null +++ b/storage/maria/libmarias3/src/assume_role.c @@ -0,0 +1,703 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2020 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include "config.h" +#include "common.h" +#include "sha256.h" + +#include <math.h> + +const char *default_iam_domain = "iam.amazonaws.com"; +const char *default_sts_domain = "sts.amazonaws.com"; +const char *iam_request_region = "us-east-1"; + +static void set_error(ms3_st *ms3, const char *error) +{ + ms3_cfree(ms3->last_error); + + if (!error) + { + ms3->last_error = NULL; + return; + } + + ms3->last_error = ms3_cstrdup(error); +} + +static void set_error_nocopy(ms3_st *ms3, char *error) +{ + ms3_cfree(ms3->last_error); + + if (!error) + { + ms3->last_error = NULL; + return; + } + + ms3->last_error = error; +} + +static size_t header_callback(char *buffer, size_t size, + size_t nitems, void *userdata) +{ + ms3debug("%.*s\n", (int)(nitems * size), buffer); + + if (userdata) + { + // HEAD request + if (!strncasecmp(buffer, "Last-Modified", 13)) + { + ms3_status_st *status = (ms3_status_st *) userdata; + // Date/time, format: Fri, 15 Mar 2019 16:58:54 GMT + struct tm ttmp = {0}; + strptime(buffer + 15, "%a, %d %b %Y %H:%M:%S %Z", &ttmp); + status->created = mktime(&ttmp); + } + else if (!strncasecmp(buffer, "Content-Length", 14)) + { + ms3_status_st *status = (ms3_status_st *) userdata; + // Length + status->length = strtoull(buffer + 16, NULL, 10); + } + } + + return nitems * size; +} + +static size_t body_callback(void *buffer, size_t size, + size_t nitems, void *userdata) +{ + uint8_t *ptr; + size_t realsize = nitems * size; + + struct memory_buffer_st *mem = (struct memory_buffer_st *)userdata; + + if (realsize + mem->length >= mem->alloced) + { + size_t additional_size = mem->buffer_chunk_size; + + if (realsize >= mem->buffer_chunk_size) + { + additional_size = (ceil((double)realsize / (double)mem->buffer_chunk_size) + 1) + * 
mem->buffer_chunk_size; + } + + ptr = (uint8_t *)ms3_crealloc(mem->data, mem->alloced + additional_size); + + if (!ptr) + { + ms3debug("Curl response OOM"); + return 0; + } + + mem->alloced += additional_size; + mem->data = ptr; + } + + memcpy(&(mem->data[mem->length]), buffer, realsize); + mem->length += realsize; + mem->data[mem->length] = '\0'; + + ms3debug("Read %zu bytes, buffer %zu bytes", realsize, mem->length); +// ms3debug("Data: %s", (char*)buffer); + return nitems * size; +} + +static uint8_t build_assume_role_request_uri(CURL *curl, const char *base_domain, const char *query, bool use_http) +{ + char uri_buffer[MAX_URI_LENGTH]; + const char *domain; + const uint8_t path_parts = 10; // "https://" + "." + "/" + const char *http_protocol = "http"; + const char *https_protocol = "https"; + const char *protocol; + + if (base_domain) + { + domain = base_domain; + } + else + { + domain = default_sts_domain; + } + + if (use_http) + { + protocol = http_protocol; + } + else + { + protocol = https_protocol; + } + + if (query) + { + if (path_parts + strlen(domain) + strlen(query) >= MAX_URI_LENGTH - 1) + { + return MS3_ERR_URI_TOO_LONG; + } + + snprintf(uri_buffer, MAX_URI_LENGTH - 1, "%s://%s/?%s", protocol, + domain, query); + } + else + { + return MS3_ERR_PARAMETER; + } + + ms3debug("URI: %s", uri_buffer); + curl_easy_setopt(curl, CURLOPT_URL, uri_buffer); + return 0; +} + +static char *generate_assume_role_query(CURL *curl, const char *action, size_t role_duration, + const char *version, const char *role_session_name, const char *role_arn, + const char *continuation, char *query_buffer) +{ + size_t query_buffer_length = 0; + char *encoded; + query_buffer[0] = '\0'; + + if (action) + { + encoded = curl_easy_escape(curl, action, (int)strlen(action)); + query_buffer_length = strlen(query_buffer); + if (query_buffer_length) + { + snprintf(query_buffer + query_buffer_length, 3072 - query_buffer_length, + "&Action=%s", encoded); + } + else + { + 
snprintf(query_buffer, 3072, "Action=%s", encoded); + } + curl_free(encoded); + } + if (role_duration >= 900 && role_duration <= 43200) + { + query_buffer_length = strlen(query_buffer); + if (query_buffer_length) + { + snprintf(query_buffer + query_buffer_length, 3072 - query_buffer_length, + "&DurationSeconds=%zu", role_duration); + } + else + { + snprintf(query_buffer, 3072, "DurationSeconds=%zu", role_duration); + } + } + if (continuation) + { + encoded = curl_easy_escape(curl, continuation, (int)strlen(continuation)); + query_buffer_length = strlen(query_buffer); + if (query_buffer_length) + { + snprintf(query_buffer + query_buffer_length, 3072 - query_buffer_length, + "&Marker=%s", encoded); + } + else + { + snprintf(query_buffer, 3072, "Marker=%s", encoded); + } + curl_free(encoded); + } + if (role_arn) + { + encoded = curl_easy_escape(curl, role_arn, (int)strlen(role_arn)); + query_buffer_length = strlen(query_buffer); + if (query_buffer_length) + { + snprintf(query_buffer + query_buffer_length, 3072 - query_buffer_length, + "&RoleArn=%s", encoded); + } + else + { + snprintf(query_buffer, 3072, "RoleArn=%s", encoded); + } + curl_free(encoded); + } + if (role_session_name) + { + encoded = curl_easy_escape(curl, role_session_name, (int)strlen(role_session_name)); + query_buffer_length = strlen(query_buffer); + if (query_buffer_length) + { + snprintf(query_buffer + query_buffer_length, 3072 - query_buffer_length, + "&RoleSessionName=%s", encoded); + } + else + { + snprintf(query_buffer, 3072, "RoleSessionName=%s", encoded); + } + curl_free(encoded); + } + if (version) + { + encoded = curl_easy_escape(curl, version, (int)strlen(version)); + query_buffer_length = strlen(query_buffer); + if (query_buffer_length) + { + snprintf(query_buffer + query_buffer_length, 3072 - query_buffer_length, + "&Version=%s", encoded); + } + else + { + snprintf(query_buffer, 3072, "Version=%s", encoded); + } + curl_free(encoded); + } + + return query_buffer; +} + + +static uint8_t 
generate_assume_role_request_hash(uri_method_t method, const char *query, char *post_hash, + struct curl_slist *headers, char *return_hash) +{ + char signing_data[3072]; + size_t pos = 0; + uint8_t sha256hash[32]; // SHA_256 binary length + uint8_t hash_pos = 0; + uint8_t i; + struct curl_slist *current_header = headers; + + // Method first + switch (method) + { + case MS3_GET: + { + sprintf(signing_data, "GET\n"); + pos += 4; + break; + } + + case MS3_HEAD: + { + sprintf(signing_data, "HEAD\n"); + pos += 5; + break; + } + + case MS3_PUT: + { + sprintf(signing_data, "PUT\n"); + pos += 4; + break; + } + + case MS3_DELETE: + { + sprintf(signing_data, "DELETE\n"); + pos += 7; + break; + } + + default: + { + ms3debug("Bad method detected"); + return MS3_ERR_IMPOSSIBLE; + } + } + + // URL query (if exists) + if (query) + { + snprintf(signing_data + pos, sizeof(signing_data) - pos, "/\n%s\n", query); + pos += strlen(query) + 3; + } + else + { + sprintf(signing_data + pos, "\n"); + pos++; + } + + do + { + snprintf(signing_data + pos, sizeof(signing_data) - pos, "%s\n", + current_header->data); + pos += strlen(current_header->data) + 1; + } + while ((current_header = current_header->next)); + + // List if header names + // The newline between headers and this is important + snprintf(signing_data + pos, sizeof(signing_data) - pos, + "\nhost;x-amz-content-sha256;x-amz-date\n"); + pos += 38; + + // Hash of post data (can be hash of empty) + snprintf(signing_data + pos, sizeof(signing_data) - pos, "%.*s", 64, post_hash); + //pos+= 64; + + // Hash all of the above + sha256((uint8_t *)signing_data, strlen(signing_data), (uint8_t *)sha256hash); + + for (i = 0; i < 32; i++) + { + sprintf(return_hash + hash_pos, "%.2x", sha256hash[i]); + hash_pos += 2; + } + + ms3debug("Signature data: %s", signing_data); + ms3debug("Signature: %.*s", 64, return_hash); + + return 0; +} + +static uint8_t +build_assume_role_request_headers(CURL *curl, struct curl_slist **head, + const char 
*base_domain, + const char* endpoint_type, + const char *region, const char *key, + const char *secret, const char *query, + uri_method_t method, + struct put_buffer_st *post_data) +{ + uint8_t ret = 0; + time_t now; + struct tm tmp_tm; + char headerbuf[3072]; + char secrethead[45]; + char date[9]; + char sha256hash[65]; + char post_hash[65]; + uint8_t tmp_hash[32]; + // Alternate between these two so hmac doesn't overwrite itself + uint8_t hmac_hash[32]; + uint8_t hmac_hash2[32]; + uint8_t hash_pos = 0; + const char *domain; + const char *type; + struct curl_slist *headers = NULL; + uint8_t offset; + uint8_t i; + struct curl_slist *current_header; + + // Host header + if (base_domain) + { + domain = base_domain; + } + else + { + domain = default_sts_domain; + } + + if (endpoint_type) + { + type = endpoint_type; + } + else + { + type = "sts"; + } + + snprintf(headerbuf, sizeof(headerbuf), "host:%s", domain); + + headers = curl_slist_append(headers, headerbuf); + *head = headers; + + // Hash post data + sha256(post_data->data, post_data->length, tmp_hash); + + for (i = 0; i < 32; i++) + { + sprintf(post_hash + hash_pos, "%.2x", tmp_hash[i]); + hash_pos += 2; + } + + snprintf(headerbuf, sizeof(headerbuf), "x-amz-content-sha256:%.*s", 64, + post_hash); + headers = curl_slist_append(headers, headerbuf); + + // Date/time header + time(&now); + snprintf(headerbuf, sizeof(headerbuf), "x-amz-date:"); + offset = strlen(headerbuf); + gmtime_r(&now, &tmp_tm); + strftime(headerbuf + offset, sizeof(headerbuf) - offset, "%Y%m%dT%H%M%SZ", + &tmp_tm); + headers = curl_slist_append(headers, headerbuf); + + // Builds the request hash + ret = generate_assume_role_request_hash(method, query, post_hash, headers, sha256hash); + + if (ret) + { + return ret; + } + + // User signing key hash + // Date hashed using AWS4:secret_key + snprintf(secrethead, sizeof(secrethead), "AWS4%.*s", 40, secret); + strftime(headerbuf, sizeof(headerbuf), "%Y%m%d", &tmp_tm); + hmac_sha256((uint8_t 
*)secrethead, strlen(secrethead), (uint8_t *)headerbuf, + strlen(headerbuf), hmac_hash); + + // Region signed by above key + hmac_sha256(hmac_hash, 32, (uint8_t *)region, strlen(region), + hmac_hash2); + + // Service signed by above key + hmac_sha256(hmac_hash2, 32, (uint8_t *)type, strlen(type), + hmac_hash); + + // Request version signed by above key (always "aws4_request") + sprintf(headerbuf, "aws4_request"); + hmac_sha256(hmac_hash, 32, (uint8_t *)headerbuf, strlen(headerbuf), + hmac_hash2); + + // Sign everything with the key + snprintf(headerbuf, sizeof(headerbuf), "AWS4-HMAC-SHA256\n"); + offset = strlen(headerbuf); + strftime(headerbuf + offset, sizeof(headerbuf) - offset, "%Y%m%dT%H%M%SZ\n", + &tmp_tm); + offset = strlen(headerbuf); + strftime(date, 9, "%Y%m%d", &tmp_tm); + snprintf(headerbuf + offset, sizeof(headerbuf) - offset, + "%.*s/%s/%s/aws4_request\n%.*s", 8, date, region, type, 64, sha256hash); + ms3debug("Data to sign: %s", headerbuf); + hmac_sha256(hmac_hash2, 32, (uint8_t *)headerbuf, strlen(headerbuf), + hmac_hash); + + hash_pos = 0; + + for (i = 0; i < 32; i++) + { + sprintf(sha256hash + hash_pos, "%.2x", hmac_hash[i]); + hash_pos += 2; + } + + // Make auth header + snprintf(headerbuf, sizeof(headerbuf), + "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/%s/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=%s", + key, date, region, type, sha256hash); + + headers = curl_slist_append(headers, headerbuf); + + // Disable this header or PUT will barf with a 501 + sprintf(headerbuf, "Transfer-Encoding:"); + headers = curl_slist_append(headers, headerbuf); + + current_header = headers; + + do + { + ms3debug("Header: %s", current_header->data); + } + while ((current_header = current_header->next)); + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + return 0; +} + +uint8_t execute_assume_role_request(ms3_st *ms3, command_t cmd, + const uint8_t *data, size_t data_size, + char *continuation) +{ + CURL *curl = 
NULL; + struct curl_slist *headers = NULL; + uint8_t res = 0; + struct memory_buffer_st mem; + uri_method_t method; + char *query = NULL; + struct put_buffer_st post_data; + CURLcode curl_res; + long response_code = 0; + char* endpoint = NULL; + const char* region = iam_request_region; + char endpoint_type[8]; + + mem.data = NULL; + mem.length = 0; + mem.alloced = 1; + mem.buffer_chunk_size = ms3->buffer_chunk_size; + + post_data.data = (uint8_t *) data; + post_data.length = data_size; + post_data.offset = 0; + + curl = ms3->curl; + + if (!ms3->first_run) + { + curl_easy_reset(curl); + } + else + { + ms3->first_run = false; + } + + if (cmd == MS3_CMD_ASSUME_ROLE) + { + query = generate_assume_role_query(curl, "AssumeRole", ms3->role_session_duration, "2011-06-15", "libmariaS3", + ms3->iam_role_arn, continuation, ms3->query_buffer); + endpoint = ms3->sts_endpoint; + region = ms3->sts_region; + sprintf(endpoint_type, "sts"); + method = MS3_GET; + } + else if (cmd == MS3_CMD_LIST_ROLE) + { + query = generate_assume_role_query(curl, "ListRoles", 0, "2010-05-08", NULL, NULL, continuation, ms3->query_buffer); + endpoint = ms3->iam_endpoint; + sprintf(endpoint_type, "iam"); + method = MS3_GET; + } + + res = build_assume_role_request_uri(curl, endpoint, query, ms3->use_http); + + if (res) + { + return res; + } + + res = build_assume_role_request_headers(curl, &headers, endpoint, + endpoint_type, region, + ms3->s3key, ms3->s3secret, query, + method, &post_data); + + if (res) + { + ms3_cfree(mem.data); + curl_slist_free_all(headers); + + return res; + } + + if (ms3->disable_verification) + { + ms3debug("Disabling SSL verification"); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0); + } + + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, header_callback); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, body_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&mem); + curl_easy_setopt(curl, 
CURLOPT_FOLLOWLOCATION, 1); + curl_res = curl_easy_perform(curl); + + if (curl_res != CURLE_OK) + { + ms3debug("Curl error: %s", curl_easy_strerror(curl_res)); + set_error(ms3, curl_easy_strerror(curl_res)); + ms3_cfree(mem.data); + curl_slist_free_all(headers); + + return MS3_ERR_REQUEST_ERROR; + } + + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); + ms3debug("Response code: %ld", response_code); + + if (response_code == 404) + { + char *message = parse_error_message((char *)mem.data, mem.length); + + if (message) + { + ms3debug("Response message: %s", message); + } + + set_error_nocopy(ms3, message); + res = MS3_ERR_NOT_FOUND; + } + else if (response_code == 403) + { + char *message = parse_error_message((char *)mem.data, mem.length); + + if (message) + { + ms3debug("Response message: %s", message); + } + + set_error_nocopy(ms3, message); + res = MS3_ERR_AUTH; + } + else if (response_code >= 400) + { + char *message = parse_error_message((char *)mem.data, mem.length); + + if (message) + { + ms3debug("Response message: %s", message); + } + + set_error_nocopy(ms3, message); + res = MS3_ERR_SERVER; + } + + switch (cmd) + { + case MS3_CMD_LIST_ROLE: + { + char *cont = NULL; + res = parse_role_list_response((const char *)mem.data, mem.length, ms3->iam_role ,ms3->iam_role_arn, &cont); + + if (cont && res) + { + res = execute_assume_role_request(ms3, cmd, data, data_size, cont); + if (res) + { + ms3_cfree(cont); + ms3_cfree(mem.data); + curl_slist_free_all(headers); + return res; + } + ms3_cfree(cont); + } + + ms3_cfree(mem.data); + break; + } + + case MS3_CMD_ASSUME_ROLE: + { + if (res) + { + ms3_cfree(mem.data); + curl_slist_free_all(headers); + return res; + } + res = parse_assume_role_response((const char *)mem.data, mem.length, ms3->role_key, ms3->role_secret, ms3->role_session_token); + ms3_cfree(mem.data); + break; + } + + case MS3_CMD_LIST: + case MS3_CMD_LIST_RECURSIVE: + case MS3_CMD_PUT: + case MS3_CMD_GET: + case MS3_CMD_DELETE: + case 
MS3_CMD_HEAD: + case MS3_CMD_COPY: + default: + { + ms3_cfree(mem.data); + ms3debug("Bad cmd detected"); + res = MS3_ERR_IMPOSSIBLE; + } + } + + curl_slist_free_all(headers); + + return res; +} diff --git a/storage/maria/libmarias3/src/assume_role.h b/storage/maria/libmarias3/src/assume_role.h new file mode 100644 index 00000000..1a2c861a --- /dev/null +++ b/storage/maria/libmarias3/src/assume_role.h @@ -0,0 +1,22 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2020 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#pragma once + +uint8_t execute_assume_role_request(ms3_st *ms3, command_t cmd, const uint8_t *data, size_t data_size, char *continuation); diff --git a/storage/maria/libmarias3/src/common.h b/storage/maria/libmarias3/src/common.h new file mode 100644 index 00000000..9d4c2b08 --- /dev/null +++ b/storage/maria/libmarias3/src/common.h @@ -0,0 +1,43 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#pragma once + +#ifdef __cplusplus +#include <cstddef> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <inttypes.h> + +#include <curl/curl.h> + +#include <libmarias3/marias3.h> + +#include "memory.h" +#include "debug.h" +#include "error.h" +#include "structs.h" +#include "response.h" +#include "request.h" +#include "assume_role.h" + diff --git a/storage/maria/libmarias3/src/debug.c b/storage/maria/libmarias3/src/debug.c new file mode 100644 index 00000000..33fe1764 --- /dev/null +++ b/storage/maria/libmarias3/src/debug.c @@ -0,0 +1,34 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include "config.h" +#include "common.h" +#include <stdbool.h> + +static bool debugging_enabled = false; + +void ms3debug_set(bool enabled) +{ + debugging_enabled = enabled; +} + +bool ms3debug_get(void) +{ + return debugging_enabled; +} diff --git a/storage/maria/libmarias3/src/debug.h b/storage/maria/libmarias3/src/debug.h new file mode 100644 index 00000000..7e4f1d2c --- /dev/null +++ b/storage/maria/libmarias3/src/debug.h @@ -0,0 +1,56 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#pragma once + +#include "config.h" +#include <stdbool.h> + +void ms3debug_set(bool enabled); +bool ms3debug_get(void); + +#define ms3debug(MSG, ...) 
do { \ + if (ms3debug_get()) \ + { \ + fprintf(stderr, "[libmarias3] %s:%d " MSG "\n", __FILE__, __LINE__, ##__VA_ARGS__); \ + } \ +} while(0) +#define ms3debug_hex(DATA, LEN) do { \ + size_t hex_it; \ + fprintf(stderr, "[libmarias3] %s:%d packet hex: ", __FILE__, __LINE__); \ + for (hex_it = 0; hex_it < LEN ; hex_it++) \ + { \ + fprintf(stderr, "%02X ", (unsigned char)DATA[hex_it]); \ + } \ + fprintf(stderr, "\n"); \ + fprintf(stderr, "[libmarias3] %s:%d printable packet data: ", __FILE__, __LINE__); \ + for (hex_it = 0; hex_it < LEN ; hex_it++) \ + { \ + if (((unsigned char)DATA[hex_it] < 0x32) or (((unsigned char)DATA[hex_it] > 0x7e))) \ + { \ + fprintf(stderr, "."); \ + } \ + else \ + { \ + fprintf(stderr, "%c", (unsigned char)DATA[hex_it]); \ + } \ + } \ + fprintf(stderr, "\n"); \ +} while(0) + diff --git a/storage/maria/libmarias3/src/error.c b/storage/maria/libmarias3/src/error.c new file mode 100644 index 00000000..edf95d05 --- /dev/null +++ b/storage/maria/libmarias3/src/error.c @@ -0,0 +1,39 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include "config.h" +#include "common.h" + +// NOTE: for every new error, add an entry to errmsgs here + +const char *errmsgs[] = +{ + "No error", + "Parameter error", + "No data", + "Could not parse response XML", + "Generated URI too long", + "Error making REST request", + "Out of memory", + "Impossible condition detected", + "Authentication error", + "File not found", + "S3 server error", + "Data too big. Maximum data size is 4GB" +}; diff --git a/storage/maria/libmarias3/src/error.h b/storage/maria/libmarias3/src/error.h new file mode 100644 index 00000000..e106b285 --- /dev/null +++ b/storage/maria/libmarias3/src/error.h @@ -0,0 +1,27 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#pragma once + +#include "config.h" + +#define baderror "No such error code" + +// extern and define in C file so we don't get redefinition at link time +extern const char *errmsgs[]; diff --git a/storage/maria/libmarias3/src/include.am b/storage/maria/libmarias3/src/include.am new file mode 100644 index 00000000..b8c07a6f --- /dev/null +++ b/storage/maria/libmarias3/src/include.am @@ -0,0 +1,37 @@ +# vim:ft=automake +# included from Top Level Makefile.am +# All paths should be given relative to the root + +noinst_HEADERS+= src/common.h +noinst_HEADERS+= src/debug.h +noinst_HEADERS+= src/error.h +noinst_HEADERS+= src/structs.h +noinst_HEADERS+= src/request.h +noinst_HEADERS+= src/response.h +noinst_HEADERS+= src/xml.h +noinst_HEADERS+= src/memory.h +noinst_HEADERS+= src/sha256.h +noinst_HEADERS+= src/sha256_i.h +noinst_HEADERS+= src/assume_role.h + +lib_LTLIBRARIES+= src/libmarias3.la +src_libmarias3_la_SOURCES= +src_libmarias3_la_LIBADD= +src_libmarias3_la_LDFLAGS= +src_libmarias3_la_CFLAGS= -DBUILDING_MS3 + +src_libmarias3_la_SOURCES+= src/marias3.c +src_libmarias3_la_SOURCES+= src/request.c +src_libmarias3_la_SOURCES+= src/response.c +src_libmarias3_la_SOURCES+= src/assume_role.c +src_libmarias3_la_SOURCES+= src/error.c +src_libmarias3_la_SOURCES+= src/debug.c + +src_libmarias3_la_SOURCES+= src/sha256.c +src_libmarias3_la_SOURCES+= src/sha256-internal.c + +src_libmarias3_la_SOURCES+= src/xml.c + +src_libmarias3_la_LDFLAGS+= -version-info ${LIBMARIAS3_LIBRARY_VERSION} + +src_libmarias3_la_LIBADD+= @LIBCURL_LIBS@ @LIBM@ diff --git a/storage/maria/libmarias3/src/marias3.c b/storage/maria/libmarias3/src/marias3.c new file mode 100644 index 00000000..74d7233a --- /dev/null +++ b/storage/maria/libmarias3/src/marias3.c @@ -0,0 +1,668 
@@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include "config.h" +#include "common.h" + +#include <pthread.h> +#include <arpa/inet.h> +#include <netinet/in.h> + +ms3_malloc_callback ms3_cmalloc = (ms3_malloc_callback)malloc; +ms3_free_callback ms3_cfree = (ms3_free_callback)free; +ms3_realloc_callback ms3_crealloc = (ms3_realloc_callback)realloc; +ms3_strdup_callback ms3_cstrdup = (ms3_strdup_callback)strdup; +ms3_calloc_callback ms3_ccalloc = (ms3_calloc_callback)calloc; + + +/* Thread locking code for OpenSSL < 1.1.0 */ +#include <dlfcn.h> +#ifndef RTLD_DEFAULT +#define RTLD_DEFAULT ((void *)0) +#endif +static pthread_mutex_t *mutex_buf = NULL; +#define CRYPTO_LOCK 1 +static void (*openssl_set_id_callback)(unsigned long (*func)(void)); +static void (*openssl_set_locking_callback)(void (*func)(int mode,int type, const char *file,int line)); +static int (*openssl_num_locks)(void); + +static void locking_function(int mode, int n, const char *file, int line) +{ + (void) file; + (void) line; + if(mode & CRYPTO_LOCK) + pthread_mutex_lock(&(mutex_buf[n])); + else + pthread_mutex_unlock(&(mutex_buf[n])); +} + +static int 
curl_needs_openssl_locking() +{ + curl_version_info_data *data = curl_version_info(CURLVERSION_NOW); + + if (data->ssl_version == NULL) + { + return 0; + } + + if (strncmp(data->ssl_version, "OpenSSL", 7) != 0) + { + return 0; + } + if (data->ssl_version[8] == '0') + { + return 1; + } + if ((data->ssl_version[8] == '1') && (data->ssl_version[10] == '0')) + { + openssl_set_id_callback = dlsym(RTLD_DEFAULT, "CRYPTO_set_id_callback"); + openssl_set_locking_callback = dlsym(RTLD_DEFAULT, "CRYPTO_set_locking_callback"); + openssl_num_locks = dlsym(RTLD_DEFAULT, "CRYPTO_num_locks"); + return openssl_set_id_callback != NULL && + openssl_set_locking_callback != NULL && + openssl_num_locks != NULL; + } + return 0; +} + +static unsigned long __attribute__((unused)) id_function(void) +{ + return ((unsigned long)pthread_self()); +} + +uint8_t ms3_library_init_malloc(ms3_malloc_callback m, + ms3_free_callback f, ms3_realloc_callback r, + ms3_strdup_callback s, ms3_calloc_callback c) +{ + if (!m || !f || !r || !s || !c) + { + return MS3_ERR_PARAMETER; + } + + ms3_cmalloc = m; + ms3_cfree = f; + ms3_crealloc = r; + ms3_cstrdup = s; + ms3_ccalloc = c; + + if (curl_needs_openssl_locking()) + { + int i; + mutex_buf = ms3_cmalloc(openssl_num_locks() * sizeof(pthread_mutex_t)); + if(mutex_buf) + { + for(i = 0; i < openssl_num_locks(); i++) + pthread_mutex_init(&(mutex_buf[i]), NULL); + openssl_set_id_callback(id_function); + openssl_set_locking_callback(locking_function); + } + } + + if (curl_global_init_mem(CURL_GLOBAL_DEFAULT, m, f, r, s, c)) + { + return MS3_ERR_PARAMETER; + } + + return 0; +} + +void ms3_library_init(void) +{ + if (curl_needs_openssl_locking()) + { + int i; + mutex_buf = malloc(openssl_num_locks() * sizeof(pthread_mutex_t)); + if(mutex_buf) + { + for(i = 0; i < openssl_num_locks(); i++) + pthread_mutex_init(&(mutex_buf[i]), NULL); + openssl_set_id_callback(id_function); + openssl_set_locking_callback(locking_function); + } + } + 
curl_global_init(CURL_GLOBAL_DEFAULT); +} + +void ms3_library_deinit(void) +{ + int i; + if (mutex_buf) + { + openssl_set_id_callback(NULL); + openssl_set_locking_callback(NULL); + for(i = 0; i < openssl_num_locks(); i++) + pthread_mutex_destroy(&(mutex_buf[i])); + ms3_cfree(mutex_buf); + mutex_buf = NULL; + } + curl_global_cleanup(); +} + +ms3_st *ms3_init(const char *s3key, const char *s3secret, + const char *region, + const char *base_domain) +{ + ms3_st *ms3; + + if ((s3key == NULL) || (s3secret == NULL)) + { + return NULL; + } + + ms3 = ms3_cmalloc(sizeof(ms3_st)); + + ms3->s3key = ms3_cstrdup(s3key); + ms3->s3secret = ms3_cstrdup(s3secret); + ms3->region = ms3_cstrdup(region); + ms3->port = 0; /* The default value */ + + if (base_domain && strlen(base_domain)) + { + struct sockaddr_in sa; + ms3->base_domain = ms3_cstrdup(base_domain); + if (inet_pton(AF_INET, base_domain, &(sa.sin_addr))) + { + ms3->list_version = 1; + ms3->protocol_version = 1; + } + else if (strcmp(base_domain, "s3.amazonaws.com") == 0) + { + ms3->list_version = 2; + ms3->protocol_version = 2; + } + else + { + // Assume that S3-compatible APIs can't support v2 list + ms3->list_version = 1; + ms3->protocol_version = 2; + } + } + else + { + ms3->base_domain = NULL; + ms3->list_version = 2; + ms3->protocol_version = 2; + } + + ms3->buffer_chunk_size = READ_BUFFER_DEFAULT_SIZE; + + ms3->curl = curl_easy_init(); + ms3->last_error = NULL; + ms3->use_http = false; + ms3->disable_verification = false; + ms3->first_run = true; + ms3->path_buffer = ms3_cmalloc(sizeof(char) * 1024); + ms3->query_buffer = ms3_cmalloc(sizeof(char) * 3072); + ms3->list_container.pool = NULL; + ms3->list_container.next = NULL; + ms3->list_container.start = NULL; + ms3->list_container.pool_list = NULL; + ms3->list_container.pool_free = 0; + + ms3->iam_role = NULL; + ms3->role_key = NULL; + ms3->role_secret = NULL; + ms3->role_session_token = NULL; + ms3->iam_endpoint = NULL; + ms3->sts_endpoint = NULL; + ms3->sts_region = 
NULL; + ms3->iam_role_arn = NULL; + + return ms3; +} + +uint8_t ms3_init_assume_role(ms3_st *ms3, const char *iam_role, const char *sts_endpoint, const char *sts_region) +{ + uint8_t ret=0; + + if (iam_role == NULL) + { + return MS3_ERR_PARAMETER; + } + ms3->iam_role = ms3_cstrdup(iam_role); + + if (sts_endpoint && strlen(sts_endpoint)) + { + ms3->sts_endpoint = ms3_cstrdup(sts_endpoint); + } + else + { + ms3->sts_endpoint = ms3_cstrdup("sts.amazonaws.com"); + } + + if (sts_region && strlen(sts_region)) + { + ms3->sts_region = ms3_cstrdup(sts_region); + } + else + { + ms3->sts_region = ms3_cstrdup("us-east-1"); + } + + ms3->iam_endpoint = ms3_cstrdup("iam.amazonaws.com"); + + ms3->iam_role_arn = ms3_cmalloc(sizeof(char) * 2048); + ms3->iam_role_arn[0] = '\0'; + ms3->role_key = ms3_cmalloc(sizeof(char) * 128); + ms3->role_key[0] = '\0'; + ms3->role_secret = ms3_cmalloc(sizeof(char) * 1024); + ms3->role_secret[0] = '\0'; + // aws says theres no maximum length here.. 2048 might be overkill + ms3->role_session_token = ms3_cmalloc(sizeof(char) * 2048); + ms3->role_session_token[0] = '\0'; + // 0 will uses the default and not set a value in the request + ms3->role_session_duration = 0; + + ret = ms3_assume_role(ms3); + + return ret; +} + +uint8_t ms3_ec2_set_cred(ms3_st *ms3, const char *iam_role, + const char *s3key, const char *s3secret, + const char *token) +{ + uint8_t ret=0; + + if (iam_role == NULL || token == NULL || s3key == NULL || s3secret == NULL) + { + return MS3_ERR_PARAMETER; + } + ms3->iam_role = ms3_cstrdup(iam_role); + ms3->role_key = ms3_cstrdup(s3key); + ms3->role_secret = ms3_cstrdup(s3secret); + ms3->role_session_token = ms3_cstrdup(token); + + return ret; +} + +static void list_free(ms3_st *ms3) +{ + ms3_list_st *list = ms3->list_container.start; + struct ms3_pool_alloc_list_st *plist = NULL, *next = NULL; + while (list) + { + ms3_cfree(list->key); + list = list->next; + } + plist = ms3->list_container.pool_list; + while (plist) + { + next = 
plist->prev; + ms3_cfree(plist->pool); + ms3_cfree(plist); + plist = next; + } + ms3->list_container.pool = NULL; + ms3->list_container.next = NULL; + ms3->list_container.start = NULL; + ms3->list_container.pool_list = NULL; + ms3->list_container.pool_free = 0; +} + +void ms3_deinit(ms3_st *ms3) +{ + if (!ms3) + { + return; + } + + ms3debug("deinit: 0x%" PRIXPTR, (uintptr_t)ms3); + ms3_cfree(ms3->s3secret); + ms3_cfree(ms3->s3key); + ms3_cfree(ms3->region); + ms3_cfree(ms3->base_domain); + ms3_cfree(ms3->iam_role); + ms3_cfree(ms3->role_key); + ms3_cfree(ms3->role_secret); + ms3_cfree(ms3->role_session_token); + ms3_cfree(ms3->iam_endpoint); + ms3_cfree(ms3->sts_endpoint); + ms3_cfree(ms3->sts_region); + ms3_cfree(ms3->iam_role_arn); + curl_easy_cleanup(ms3->curl); + ms3_cfree(ms3->last_error); + ms3_cfree(ms3->path_buffer); + ms3_cfree(ms3->query_buffer); + list_free(ms3); + ms3_cfree(ms3); +} + +const char *ms3_server_error(ms3_st *ms3) +{ + if (!ms3) + { + return NULL; + } + + return ms3->last_error; +} + +void ms3_debug(void) +{ + bool state = ms3debug_get(); + ms3debug_set(!state); + + if (state) + { + ms3debug("enabling debug"); + } +} + +const char *ms3_error(uint8_t errcode) +{ + if (errcode >= MS3_ERR_MAX) + { + return baderror; + } + + return errmsgs[errcode]; +} + +uint8_t ms3_list_dir(ms3_st *ms3, const char *bucket, const char *prefix, + ms3_list_st **list) +{ + uint8_t res = 0; + + if (!ms3 || !bucket || !list) + { + return MS3_ERR_PARAMETER; + } + + list_free(ms3); + res = execute_request(ms3, MS3_CMD_LIST, bucket, NULL, NULL, NULL, prefix, NULL, + 0, NULL, + NULL); + *list = ms3->list_container.start; + return res; +} + +uint8_t ms3_list(ms3_st *ms3, const char *bucket, const char *prefix, + ms3_list_st **list) +{ + uint8_t res = 0; + + if (!ms3 || !bucket || !list) + { + return MS3_ERR_PARAMETER; + } + + list_free(ms3); + res = execute_request(ms3, MS3_CMD_LIST_RECURSIVE, bucket, NULL, NULL, NULL, + prefix, NULL, + 0, NULL, + NULL); + *list = 
ms3->list_container.start; + return res; +} + +uint8_t ms3_put(ms3_st *ms3, const char *bucket, const char *key, + const uint8_t *data, size_t length) +{ + uint8_t res; + + if (!ms3 || !bucket || !key || !data) + { + return MS3_ERR_PARAMETER; + } + + if (length == 0) + { + return MS3_ERR_NO_DATA; + } + + // mhash can't hash more than 4GB it seems + if (length > UINT32_MAX) + { + return MS3_ERR_TOO_BIG; + } + + res = execute_request(ms3, MS3_CMD_PUT, bucket, key, NULL, NULL, NULL, data, + length, NULL, + NULL); + + return res; +} + +uint8_t ms3_get(ms3_st *ms3, const char *bucket, const char *key, + uint8_t **data, size_t *length) +{ + uint8_t res = 0; + struct memory_buffer_st buf; + + buf.data = NULL; + buf.length = 0; + + if (!ms3 || !bucket || !key || key[0] == '\0' || !data || !length) + { + return MS3_ERR_PARAMETER; + } + + res = execute_request(ms3, MS3_CMD_GET, bucket, key, NULL, NULL, NULL, NULL, 0, + NULL, &buf); + *data = buf.data; + *length = buf.length; + return res; +} + +uint8_t ms3_copy(ms3_st *ms3, const char *source_bucket, const char *source_key, + const char *dest_bucket, const char *dest_key) +{ + uint8_t res = 0; + + if (!ms3 || !source_bucket || !source_key || !dest_bucket || !dest_key) + { + return MS3_ERR_PARAMETER; + } + + res = execute_request(ms3, MS3_CMD_COPY, dest_bucket, dest_key, source_bucket, + source_key, NULL, NULL, 0, NULL, NULL); + return res; +} + +uint8_t ms3_move(ms3_st *ms3, const char *source_bucket, const char *source_key, + const char *dest_bucket, const char *dest_key) +{ + uint8_t res = 0; + + if (!ms3 || !source_bucket || !source_key || !dest_bucket || !dest_key) + { + return MS3_ERR_PARAMETER; + } + + res = ms3_copy(ms3, source_bucket, source_key, dest_bucket, dest_key); + + if (res) + { + return res; + } + + res = ms3_delete(ms3, source_bucket, source_key); + + return res; +} + +uint8_t ms3_delete(ms3_st *ms3, const char *bucket, const char *key) +{ + uint8_t res; + + if (!ms3 || !bucket || !key) + { + return 
MS3_ERR_PARAMETER; + } + + res = execute_request(ms3, MS3_CMD_DELETE, bucket, key, NULL, NULL, NULL, NULL, + 0, NULL, + NULL); + return res; +} + +uint8_t ms3_status(ms3_st *ms3, const char *bucket, const char *key, + ms3_status_st *status) +{ + uint8_t res; + + if (!ms3 || !bucket || !key || !status) + { + return MS3_ERR_PARAMETER; + } + + res = execute_request(ms3, MS3_CMD_HEAD, bucket, key, NULL, NULL, NULL, NULL, 0, + NULL, + status); + return res; +} + +void ms3_list_free(ms3_list_st *list) +{ + // Deprecated + (void) list; +} + +void ms3_free(uint8_t *data) +{ + ms3_cfree(data); +} + +uint8_t ms3_set_option(ms3_st *ms3, ms3_set_option_t option, void *value) +{ + if (!ms3) + { + return MS3_ERR_PARAMETER; + } + + switch (option) + { + case MS3_OPT_USE_HTTP: + { + ms3->use_http = ms3->use_http ? 0 : 1; + break; + } + + case MS3_OPT_DISABLE_SSL_VERIFY: + { + ms3->disable_verification = ms3->disable_verification ? 0 : 1; + break; + } + + case MS3_OPT_BUFFER_CHUNK_SIZE: + { + size_t new_size; + + if (!value) + { + return MS3_ERR_PARAMETER; + } + + new_size = *(size_t *)value; + + if (new_size < 1) + { + return MS3_ERR_PARAMETER; + } + + ms3->buffer_chunk_size = new_size; + break; + } + + case MS3_OPT_FORCE_LIST_VERSION: + { + uint8_t list_version; + + if (!value) + { + return MS3_ERR_PARAMETER; + } + + list_version = *(uint8_t *)value; + + if (list_version < 1 || list_version > 2) + { + return MS3_ERR_PARAMETER; + } + + ms3->list_version = list_version; + break; + } + + case MS3_OPT_FORCE_PROTOCOL_VERSION: + { + uint8_t protocol_version; + + if (!value) + { + return MS3_ERR_PARAMETER; + } + + protocol_version = *(uint8_t *)value; + + if (protocol_version < 1 || protocol_version > 2) + { + return MS3_ERR_PARAMETER; + } + + ms3->list_version = protocol_version; + break; + } + + case MS3_OPT_PORT_NUMBER: + { + int port_number; + + if (!value) + { + return MS3_ERR_PARAMETER; + } + memcpy(&port_number, (void*)value, sizeof(int)); + + ms3->port = port_number; + break; + 
} + default: + return MS3_ERR_PARAMETER; + } + + return 0; +} + +uint8_t ms3_assume_role(ms3_st *ms3) +{ + uint8_t res = 0; + + if (!ms3 || !ms3->iam_role) + { + return MS3_ERR_PARAMETER; + } + + if (!strstr(ms3->iam_role_arn, ms3->iam_role)) + { + ms3debug("Lookup IAM role ARN"); + res = execute_assume_role_request(ms3, MS3_CMD_LIST_ROLE, NULL, 0, NULL); + if(res) + { + return res; + } + + } + ms3debug("Assume IAM role"); + res = execute_assume_role_request(ms3, MS3_CMD_ASSUME_ROLE, NULL, 0, NULL); + + return res; +} + diff --git a/storage/maria/libmarias3/src/memory.h b/storage/maria/libmarias3/src/memory.h new file mode 100644 index 00000000..a676a682 --- /dev/null +++ b/storage/maria/libmarias3/src/memory.h @@ -0,0 +1,26 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#pragma once + +extern ms3_malloc_callback ms3_cmalloc; +extern ms3_free_callback ms3_cfree; +extern ms3_realloc_callback ms3_crealloc; +extern ms3_strdup_callback ms3_cstrdup; +extern ms3_calloc_callback ms3_ccalloc; diff --git a/storage/maria/libmarias3/src/request.c b/storage/maria/libmarias3/src/request.c new file mode 100644 index 00000000..26165474 --- /dev/null +++ b/storage/maria/libmarias3/src/request.c @@ -0,0 +1,967 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include "config.h" +#include "common.h" +#include "sha256.h" + +#include <math.h> + +const char *default_domain = "s3.amazonaws.com"; + +static void set_error(ms3_st *ms3, const char *error) +{ + ms3_cfree(ms3->last_error); + + if (!error) + { + ms3->last_error = NULL; + return; + } + + ms3->last_error = ms3_cstrdup(error); +} + +static void set_error_nocopy(ms3_st *ms3, char *error) +{ + ms3_cfree(ms3->last_error); + + if (!error) + { + ms3->last_error = NULL; + return; + } + + ms3->last_error = error; +} + +static uint8_t build_request_uri(CURL *curl, const char *base_domain, + const char *bucket, const char *object, const char *query, bool use_http, + uint8_t protocol_version) +{ + char uri_buffer[MAX_URI_LENGTH]; + const char *domain; + const uint8_t path_parts = 10; // "https://" + "." 
+ "/" + const char *http_protocol = "http"; + const char *https_protocol = "https"; + const char *protocol; + + if (base_domain) + { + domain = base_domain; + } + else + { + domain = default_domain; + } + + if (use_http) + { + protocol = http_protocol; + } + else + { + protocol = https_protocol; + } + + if (query) + { + if (path_parts + strlen(domain) + strlen(bucket) + strlen(object) + strlen( + query) >= MAX_URI_LENGTH - 1) + { + return MS3_ERR_URI_TOO_LONG; + } + + if (protocol_version == 1) + { + snprintf(uri_buffer, MAX_URI_LENGTH - 1, "%s://%s/%s%s?%s", protocol, + domain, bucket, + object, query); + } + else + { + snprintf(uri_buffer, MAX_URI_LENGTH - 1, "%s://%s.%s%s?%s", protocol, + bucket, domain, + object, query); + } + } + else + { + if (path_parts + strlen(domain) + strlen(bucket) + strlen( + object) >= MAX_URI_LENGTH - 1) + { + return MS3_ERR_URI_TOO_LONG; + } + + if (protocol_version == 1) + { + snprintf(uri_buffer, MAX_URI_LENGTH - 1, "%s://%s/%s%s", protocol, + domain, + bucket, + object); + } + else + { + snprintf(uri_buffer, MAX_URI_LENGTH - 1, "%s://%s.%s%s", protocol, + bucket, domain, + object); + } + } + + ms3debug("URI: %s", uri_buffer); + curl_easy_setopt(curl, CURLOPT_URL, uri_buffer); + return 0; +} + +/* Handles object name to path conversion. + * Must always start with a '/' even if object is empty. + * Object should be urlencoded. Unfortunately curl also urlencodes slashes. + * So this breaks up on slashes and reassembles the encoded parts. + * Not very efficient but works until we write a custom encoder. 
+ */ + +static char *generate_path(CURL *curl, const char *object, char *path_buffer) +{ + char *tok_ptr = NULL; + char *save_ptr = NULL; + char *out_ptr = path_buffer; + char *path; + + // Keep scanbuild happy + path_buffer[0] = '\0'; + + if (!object) + { + sprintf(path_buffer, "/"); + return path_buffer; + } + + path = ms3_cstrdup(object); // Because strtok_r is destructive + + tok_ptr = strtok_r((char *)path, "/", &save_ptr); + + while (tok_ptr != NULL) + { + char *encoded = curl_easy_escape(curl, tok_ptr, (int)strlen(tok_ptr)); + snprintf(out_ptr, 1024 - (out_ptr - path_buffer), "/%s", encoded); + out_ptr += strlen(encoded) + 1; + curl_free(encoded); + tok_ptr = strtok_r(NULL, "/", &save_ptr); + } + + if (path_buffer[0] != '/') + { + sprintf(path_buffer, "/"); + } + + ms3_cfree(path); + return path_buffer; +} + + +/* At a later date we need to make this accept multi-param/values to support + * pagination + */ + +static char *generate_query(CURL *curl, const char *value, + const char *continuation, uint8_t list_version, bool use_delimiter, + char *query_buffer) +{ + char *encoded; + query_buffer[0] = '\0'; + + if (use_delimiter) + { + snprintf(query_buffer, 3072, "delimiter=%%2F"); + } + + if (list_version == 2) + { + if (continuation) + { + encoded = curl_easy_escape(curl, continuation, (int)strlen(continuation)); + + if (strlen(query_buffer)) + { + snprintf(query_buffer + strlen(query_buffer), 3072 - strlen(query_buffer), + "&continuation-token=%s&list-type=2", encoded); + } + else + { + snprintf(query_buffer, 3072, "continuation-token=%s&list-type=2", encoded); + } + + curl_free(encoded); + } + else + { + if (strlen(query_buffer)) + { + snprintf(query_buffer + strlen(query_buffer), 3072 - strlen(query_buffer), + "&list-type=2"); + } + else + { + sprintf(query_buffer, "list-type=2"); + } + } + } + else if (continuation) + { + // Continuation is really marker here + encoded = curl_easy_escape(curl, continuation, (int)strlen(continuation)); + + if 
(strlen(query_buffer)) + { + snprintf(query_buffer + strlen(query_buffer), 3072 - strlen(query_buffer), + "&marker=%s", + encoded); + } + else + { + snprintf(query_buffer, 3072, "marker=%s", encoded); + } + + curl_free(encoded); + } + + if (value) + { + encoded = curl_easy_escape(curl, value, (int)strlen(value)); + + if (strlen(query_buffer)) + { + snprintf(query_buffer + strlen(query_buffer), 3072 - strlen(query_buffer), + "&prefix=%s", + encoded); + } + else + { + snprintf(query_buffer, 3072, "prefix=%s", + encoded); + } + + curl_free(encoded); + } + + return query_buffer; +} + + +/* +<HTTPMethod>\n +<CanonicalURI>\n +<CanonicalQueryString>\n +<CanonicalHeaders>\n +<SignedHeaders>\n - host;x-amz-content-sha256;x-amz-date +<HashedPayload> - empty if no POST data +*/ +static uint8_t generate_request_hash(uri_method_t method, const char *path, + const char *bucket, + const char *query, char *post_hash, struct curl_slist *headers, bool has_source, bool has_token, + char *return_hash) +{ + char signing_data[3072]; + size_t pos = 0; + uint8_t sha256hash[32]; // SHA_256 binary length + uint8_t hash_pos = 0; + uint8_t i; + struct curl_slist *current_header = headers; + + // Method first + switch (method) + { + case MS3_GET: + { + sprintf(signing_data, "GET\n"); + pos += 4; + break; + } + + case MS3_HEAD: + { + sprintf(signing_data, "HEAD\n"); + pos += 5; + break; + } + + case MS3_PUT: + { + sprintf(signing_data, "PUT\n"); + pos += 4; + break; + } + + case MS3_DELETE: + { + sprintf(signing_data, "DELETE\n"); + pos += 7; + break; + } + + default: + { + ms3debug("Bad method detected"); + return MS3_ERR_IMPOSSIBLE; + } + } + + // URL path + if (bucket) + { + snprintf(signing_data + pos, sizeof(signing_data) - pos, "/%s%s\n", bucket, + path); + pos += strlen(path) + strlen(bucket) + 2; + } + else + { + snprintf(signing_data + pos, sizeof(signing_data) - pos, "%s\n", path); + pos += strlen(path) + 1; + } + + // URL query (if exists) + if (query) + { + snprintf(signing_data + 
pos, sizeof(signing_data) - pos, "%s\n", query); + pos += strlen(query) + 1; + } + else + { + sprintf(signing_data + pos, "\n"); + pos++; + } + + do + { + snprintf(signing_data + pos, sizeof(signing_data) - pos, "%s\n", + current_header->data); + pos += strlen(current_header->data) + 1; + } + while ((current_header = current_header->next)); + + // List if header names + // The newline between headers and this is important + if (has_source && has_token) + { + snprintf(signing_data + pos, sizeof(signing_data) - pos, + "\nhost;x-amz-content-sha256;x-amz-copy-source;x-amz-date;x-amz-security-token\n"); + pos += 77; + } + else if (has_source) + { + snprintf(signing_data + pos, sizeof(signing_data) - pos, + "\nhost;x-amz-content-sha256;x-amz-copy-source;x-amz-date\n"); + pos += 56; + } + else if (has_token) + { + snprintf(signing_data + pos, sizeof(signing_data) - pos, + "\nhost;x-amz-content-sha256;x-amz-date;x-amz-security-token\n"); + pos += 59; + } + else + { + snprintf(signing_data + pos, sizeof(signing_data) - pos, + "\nhost;x-amz-content-sha256;x-amz-date\n"); + pos += 38; + } + + // Hash of post data (can be hash of empty) + snprintf(signing_data + pos, sizeof(signing_data) - pos, "%.*s", 64, post_hash); + //pos+= 64; + ms3debug("Signature data1: %s", signing_data); + + // Hash all of the above + sha256((uint8_t *)signing_data, strlen(signing_data), (uint8_t *)sha256hash); + + for (i = 0; i < 32; i++) + { + sprintf(return_hash + hash_pos, "%.2x", sha256hash[i]); + hash_pos += 2; + } + + ms3debug("Signature data: %s", signing_data); + ms3debug("Signature: %.*s", 64, return_hash); + + return 0; +} + +static uint8_t build_request_headers(CURL *curl, struct curl_slist **head, + const char *base_domain, const char *region, const char *key, + const char *secret, const char *object, const char *query, + uri_method_t method, const char *bucket, const char *source_bucket, + const char *source_key, struct put_buffer_st *post_data, + uint8_t protocol_version, const char 
*session_token) +{ + uint8_t ret = 0; + time_t now; + struct tm tmp_tm; + char headerbuf[3072]; + char secrethead[45]; + char date[9]; + char sha256hash[65]; + char post_hash[65]; + uint8_t tmp_hash[32]; + // Alternate between these two so hmac doesn't overwrite itself + uint8_t hmac_hash[32]; + uint8_t hmac_hash2[32]; + uint8_t hash_pos = 0; + const char *domain; + struct curl_slist *headers = NULL; + uint8_t offset; + uint8_t i; + bool has_source = false; + bool has_token = false; + struct curl_slist *current_header; + + // Host header + if (base_domain) + { + domain = base_domain; + } + else + { + domain = default_domain; + } + + if (protocol_version == 2) + { + snprintf(headerbuf, sizeof(headerbuf), "host:%s.%s", bucket, domain); + } + else + { + snprintf(headerbuf, sizeof(headerbuf), "host:%s", domain); + } + headers = curl_slist_append(headers, headerbuf); + *head = headers; + + // Hash post data + sha256(post_data->data, post_data->length, tmp_hash); + + for (i = 0; i < 32; i++) + { + sprintf(post_hash + hash_pos, "%.2x", tmp_hash[i]); + hash_pos += 2; + } + + snprintf(headerbuf, sizeof(headerbuf), "x-amz-content-sha256:%.*s", 64, + post_hash); + headers = curl_slist_append(headers, headerbuf); + + if (source_bucket) + { + char *bucket_escape; + char *key_escape; + bucket_escape = curl_easy_escape(curl, source_bucket, (int)strlen(source_bucket)); + key_escape = curl_easy_escape(curl, source_key, (int)strlen(source_key)); + snprintf(headerbuf, sizeof(headerbuf), "x-amz-copy-source:/%s/%s", + bucket_escape, key_escape); + headers = curl_slist_append(headers, headerbuf); + ms3_cfree(bucket_escape); + ms3_cfree(key_escape); + } + + // Date/time header + time(&now); + snprintf(headerbuf, sizeof(headerbuf), "x-amz-date:"); + offset = strlen(headerbuf); + gmtime_r(&now, &tmp_tm); + strftime(headerbuf + offset, sizeof(headerbuf) - offset, "%Y%m%dT%H%M%SZ", + &tmp_tm); + headers = curl_slist_append(headers, headerbuf); + + // Temp Credentials Security Token + if 
(session_token) + { + snprintf(headerbuf, sizeof(headerbuf), "x-amz-security-token:%s",session_token); + headers = curl_slist_append(headers, headerbuf); + has_token = true; + } + + if (source_bucket) + { + has_source = true; + } + + // Builds the request hash + if (protocol_version == 1) + { + ret = generate_request_hash(method, object, bucket, query, post_hash, headers, + has_source, has_token, + sha256hash); + } + else + { + ret = generate_request_hash(method, object, NULL, query, post_hash, headers, + has_source, has_token, + sha256hash); + } + + if (ret) + { + return ret; + } + + // User signing key hash + // Date hashed using AWS4:secret_key + snprintf(secrethead, sizeof(secrethead), "AWS4%.*s", 40, secret); + strftime(headerbuf, sizeof(headerbuf), "%Y%m%d", &tmp_tm); + hmac_sha256((uint8_t *)secrethead, strlen(secrethead), (uint8_t *)headerbuf, + strlen(headerbuf), hmac_hash); + + // Region signed by above key + hmac_sha256(hmac_hash, 32, (uint8_t *)region, strlen(region), + hmac_hash2); + + // Service signed by above key (s3 always) + sprintf(headerbuf, "s3"); + hmac_sha256(hmac_hash2, 32, (uint8_t *)headerbuf, strlen(headerbuf), + hmac_hash); + + // Request version signed by above key (always "aws4_request") + sprintf(headerbuf, "aws4_request"); + hmac_sha256(hmac_hash, 32, (uint8_t *)headerbuf, strlen(headerbuf), + hmac_hash2); + + // Sign everything with the key + snprintf(headerbuf, sizeof(headerbuf), "AWS4-HMAC-SHA256\n"); + offset = strlen(headerbuf); + strftime(headerbuf + offset, sizeof(headerbuf) - offset, "%Y%m%dT%H%M%SZ\n", + &tmp_tm); + offset = strlen(headerbuf); + strftime(date, 9, "%Y%m%d", &tmp_tm); + snprintf(headerbuf + offset, sizeof(headerbuf) - offset, + "%.*s/%s/s3/aws4_request\n%.*s", 8, date, region, 64, sha256hash); + ms3debug("Data to sign: %s", headerbuf); + hmac_sha256(hmac_hash2, 32, (uint8_t *)headerbuf, strlen(headerbuf), + hmac_hash); + + hash_pos = 0; + + for (i = 0; i < 32; i++) + { + sprintf(sha256hash + hash_pos, "%.2x", 
hmac_hash[i]); + hash_pos += 2; + } + + // Make auth header + if (source_bucket && session_token) + { + snprintf(headerbuf, sizeof(headerbuf), + "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-copy-source;x-amz-date;x-amz-security-token;x-amz-copy-source, Signature=%s", + key, date, region, sha256hash); + } + else if (source_bucket) + { + snprintf(headerbuf, sizeof(headerbuf), + "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-copy-source;x-amz-date, Signature=%s", + key, date, region, sha256hash); + } + else if (session_token) + { + snprintf(headerbuf, sizeof(headerbuf), + "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=%s", + key, date, region, sha256hash); + } + else + { + snprintf(headerbuf, sizeof(headerbuf), + "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=%s", + key, date, region, sha256hash); + } + + headers = curl_slist_append(headers, headerbuf); + + // Disable this header or PUT will barf with a 501 + sprintf(headerbuf, "Transfer-Encoding:"); + headers = curl_slist_append(headers, headerbuf); + + if ((method == MS3_PUT) && !source_bucket) + { + snprintf(headerbuf, sizeof(headerbuf), "Content-Length:%zu", post_data->length); + headers = curl_slist_append(headers, headerbuf); + } + + current_header = headers; + + do + { + ms3debug("Header: %s", current_header->data); + } + while ((current_header = current_header->next)); + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + switch (method) + { + case MS3_GET: + { + // Nothing extra to do here + break; + } + + case MS3_HEAD: + { + curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); + break; + } + + case MS3_PUT: + { + curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "PUT"); + break; + } + + 
case MS3_DELETE: + { + curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "DELETE"); + break; + } + + default: + ms3debug("Bad method detected"); + return MS3_ERR_IMPOSSIBLE; + } + + return 0; +} +static size_t header_callback(char *buffer, size_t size, + size_t nitems, void *userdata) +{ + ms3debug("%.*s\n", (int)(nitems * size), buffer); + + if (userdata) + { + // HEAD request + if (!strncasecmp(buffer, "Last-Modified", 13)) + { + ms3_status_st *status = (ms3_status_st *) userdata; + // Date/time, format: Fri, 15 Mar 2019 16:58:54 GMT + struct tm ttmp = {0}; + strptime(buffer + 15, "%a, %d %b %Y %H:%M:%S %Z", &ttmp); + status->created = mktime(&ttmp); + } + else if (!strncasecmp(buffer, "Content-Length", 14)) + { + ms3_status_st *status = (ms3_status_st *) userdata; + // Length + status->length = strtoull(buffer + 16, NULL, 10); + } + } + + return nitems * size; +} + +static size_t body_callback(void *buffer, size_t size, + size_t nitems, void *userdata) +{ + uint8_t *ptr; + size_t realsize = nitems * size; + + struct memory_buffer_st *mem = (struct memory_buffer_st *)userdata; + + if (realsize + mem->length >= mem->alloced) + { + size_t additional_size = mem->buffer_chunk_size; + + if (realsize >= mem->buffer_chunk_size) + { + additional_size = (ceil((double)realsize / (double)mem->buffer_chunk_size) + 1) + * mem->buffer_chunk_size; + } + + ptr = (uint8_t *)ms3_crealloc(mem->data, mem->alloced + additional_size); + + if (!ptr) + { + ms3debug("Curl response OOM"); + return 0; + } + + mem->alloced += additional_size; + mem->data = ptr; + } + + memcpy(&(mem->data[mem->length]), buffer, realsize); + mem->length += realsize; + mem->data[mem->length] = '\0'; + + ms3debug("Read %zu bytes, buffer %zu bytes", realsize, mem->length); +// ms3debug("Data: %s", (char*)buffer); + return nitems * size; +} + +uint8_t execute_request(ms3_st *ms3, command_t cmd, const char *bucket, + const char *object, const char *source_bucket, const char *source_object, + const char *filter, const 
uint8_t *data, size_t data_size, + char *continuation, + void *ret_ptr) +{ + CURL *curl = NULL; + struct curl_slist *headers = NULL; + uint8_t res = 0; + struct memory_buffer_st mem; + uri_method_t method; + char *path = NULL; + char *query = NULL; + struct put_buffer_st post_data; + CURLcode curl_res; + long response_code = 0; + + mem.data = NULL; + mem.length = 0; + mem.alloced = 1; + mem.buffer_chunk_size = ms3->buffer_chunk_size; + + post_data.data = (uint8_t *) data; + post_data.length = data_size; + post_data.offset = 0; + + curl = ms3->curl; + + if (!ms3->first_run) + { + curl_easy_reset(curl); + } + else + { + ms3->first_run = false; + } + + path = generate_path(curl, object, ms3->path_buffer); + + if (cmd == MS3_CMD_LIST_RECURSIVE) + { + query = generate_query(curl, filter, continuation, ms3->list_version, false, + ms3->query_buffer); + } + else if (cmd == MS3_CMD_LIST) + { + query = generate_query(curl, filter, continuation, ms3->list_version, true, + ms3->query_buffer); + } + + res = build_request_uri(curl, ms3->base_domain, bucket, path, query, + ms3->use_http, ms3->protocol_version); + + if (res) + { + return res; + } + + switch (cmd) + { + case MS3_CMD_COPY: + case MS3_CMD_PUT: + method = MS3_PUT; + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, (char *)data); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, data_size); + break; + + case MS3_CMD_DELETE: + method = MS3_DELETE; + break; + + case MS3_CMD_HEAD: + method = MS3_HEAD; + curl_easy_setopt(curl, CURLOPT_HEADERDATA, ret_ptr); + break; + + case MS3_CMD_LIST: + case MS3_CMD_LIST_RECURSIVE: + case MS3_CMD_GET: + case MS3_CMD_LIST_ROLE: + method = MS3_GET; + break; + + case MS3_CMD_ASSUME_ROLE: + default: + ms3debug("Bad cmd detected"); + ms3_cfree(mem.data); + + return MS3_ERR_IMPOSSIBLE; + } + + if (ms3->iam_role) + { + ms3debug("Using assumed role: %s",ms3->iam_role); + res = build_request_headers(curl, &headers, ms3->base_domain, ms3->region, + ms3->role_key, ms3->role_secret, path, query, method, 
bucket, source_bucket, + source_object, &post_data, ms3->protocol_version, ms3->role_session_token); + } + else + { + res = build_request_headers(curl, &headers, ms3->base_domain, ms3->region, + ms3->s3key, ms3->s3secret, path, query, method, bucket, source_bucket, + source_object, &post_data, ms3->protocol_version, NULL); + } + if (res) + { + ms3_cfree(mem.data); + curl_slist_free_all(headers); + + return res; + } + + if (ms3->disable_verification) + { + ms3debug("Disabling SSL verification"); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0); + } + + if (ms3->port) + curl_easy_setopt(curl, CURLOPT_PORT, (long)ms3->port); + + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, header_callback); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, body_callback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&mem); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_res = curl_easy_perform(curl); + + if (curl_res != CURLE_OK) + { + ms3debug("Curl error: %s", curl_easy_strerror(curl_res)); + set_error(ms3, curl_easy_strerror(curl_res)); + ms3_cfree(mem.data); + curl_slist_free_all(headers); + + return MS3_ERR_REQUEST_ERROR; + } + + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); + ms3debug("Response code: %ld", response_code); + + if (response_code == 404) + { + char *message = parse_error_message((char *)mem.data, mem.length); + + if (message) + { + ms3debug("Response message: %s", message); + } + + set_error_nocopy(ms3, message); + res = MS3_ERR_NOT_FOUND; + } + else if (response_code == 403) + { + char *message = parse_error_message((char *)mem.data, mem.length); + + if (message) + { + ms3debug("Response message: %s", message); + } + + set_error_nocopy(ms3, message); + res = MS3_ERR_AUTH; + } + else if (response_code >= 400) + { + char *message = parse_error_message((char *)mem.data, mem.length); + + if (message) + { + ms3debug("Response message: %s", message); + } + + 
set_error_nocopy(ms3, message); + res = MS3_ERR_SERVER; + if (ms3->iam_role) + { + res = MS3_ERR_AUTH_ROLE; + } + } + + switch (cmd) + { + case MS3_CMD_LIST_RECURSIVE: + case MS3_CMD_LIST: + { + char *cont = NULL; + parse_list_response((const char *)mem.data, mem.length, &ms3->list_container, ms3->list_version, + &cont); + + if (cont) + { + res = execute_request(ms3, cmd, bucket, object, source_bucket, source_object, + filter, data, data_size, cont, + NULL); + if (res) + { + return res; + } + + ms3_cfree(cont); + } + + ms3_cfree(mem.data); + break; + } + + case MS3_CMD_COPY: + case MS3_CMD_PUT: + { + ms3_cfree(mem.data); + break; + } + + case MS3_CMD_GET: + { + struct memory_buffer_st *buf = (struct memory_buffer_st *) ret_ptr; + + if (res) + { + ms3_cfree(mem.data); + } + else + { + buf->data = mem.data; + buf->length = mem.length; + } + + break; + } + + case MS3_CMD_DELETE: + { + ms3_cfree(mem.data); + break; + } + + case MS3_CMD_HEAD: + { + ms3_cfree(mem.data); + break; + } + + case MS3_CMD_LIST_ROLE: + case MS3_CMD_ASSUME_ROLE: + default: + { + ms3_cfree(mem.data); + ms3debug("Bad cmd detected"); + res = MS3_ERR_IMPOSSIBLE; + } + } + + curl_slist_free_all(headers); + + return res; +} diff --git a/storage/maria/libmarias3/src/request.h b/storage/maria/libmarias3/src/request.h new file mode 100644 index 00000000..9ce8bb5c --- /dev/null +++ b/storage/maria/libmarias3/src/request.h @@ -0,0 +1,63 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#pragma once + +#include "config.h" +#include <stdint.h> +#include <stddef.h> + +// Maxmum S3 file size is 1024 bytes so for protection we make the maximum +// URI length this +#define MAX_URI_LENGTH 1024 + +#define READ_BUFFER_DEFAULT_SIZE 1024*1024 + +enum uri_method_t +{ + MS3_GET, + MS3_HEAD, + MS3_PUT, + MS3_DELETE +}; + +typedef enum uri_method_t uri_method_t; + +enum command_t +{ + MS3_CMD_LIST, + MS3_CMD_LIST_RECURSIVE, + MS3_CMD_PUT, + MS3_CMD_GET, + MS3_CMD_DELETE, + MS3_CMD_HEAD, + MS3_CMD_COPY, + MS3_CMD_LIST_ROLE, + MS3_CMD_ASSUME_ROLE +}; + +typedef enum command_t command_t; + +struct ms3_st; + +uint8_t execute_request(ms3_st *ms3, command_t command, const char *bucket, + const char *object, const char *source_bucket, const char *source_object, + const char *filter, const uint8_t *data, size_t data_size, + char *continuation, + void *ret_ptr); diff --git a/storage/maria/libmarias3/src/response.c b/storage/maria/libmarias3/src/response.c new file mode 100644 index 00000000..4e976aba --- /dev/null +++ b/storage/maria/libmarias3/src/response.c @@ -0,0 +1,512 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include "config.h" +#include "common.h" + +#include "xml.h" + +char *parse_error_message(const char *data, size_t length) +{ + struct xml_document *doc = NULL; + struct xml_node *node = NULL; + struct xml_node *child = NULL; + struct xml_node *root = NULL; + + uint64_t node_it = 0; + + if (!data || !length) + { + return NULL; + } + + doc = xml_parse_document((uint8_t*)data, length); + + if (!doc) + { + return NULL; + } + + root = xml_document_root(doc); + + // First node is Error + child = xml_node_child(root, node_it); + // IAM / STS This will be Error and we need next child + if (!xml_node_name_cmp(child, "Error")) + { + node = xml_node_child(child, node_it); + } + else + { + node = child; + child = root; + } + + if (!node) + { + xml_document_free(doc, false); + return NULL; + } + + while(node) + { + if (!xml_node_name_cmp(node, "Message")) + { + struct xml_string *content = xml_node_content(node); + uint8_t *message = ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, message, xml_string_length(content)); + xml_document_free(doc, false); + return (char *)message; + } + + node_it++; + node = xml_node_child(child, node_it); + } + + xml_document_free(doc, false); + return NULL; +} + +static ms3_list_st *get_next_list_ptr(struct ms3_list_container_st *container) +{ + ms3_list_st *new_alloc = NULL; + struct ms3_pool_alloc_list_st *new_pool_next = NULL; + struct ms3_pool_alloc_list_st *new_pool_prev = NULL; + ms3_list_st *ret = NULL; + if 
(container->pool_free == 0) + { + new_alloc = (ms3_list_st*)ms3_cmalloc(sizeof(ms3_list_st) * 1024); + new_pool_next = (struct ms3_pool_alloc_list_st*)ms3_cmalloc(sizeof(struct ms3_pool_alloc_list_st)); + + if (!new_alloc || !new_pool_next) + { + ms3debug("List realloc OOM"); + return NULL; + } + + new_pool_prev = container->pool_list; + container->pool_list = new_pool_next; + if (new_pool_prev) + { + container->pool_list->prev = new_pool_prev; + } + else + { + container->pool_list->prev = NULL; + } + container->pool_list->pool = new_alloc; + + container->pool_free = 1024; + if (!container->start) + { + container->start = new_alloc; + } + container->pool = container->next = new_alloc; + } + else + { + container->next++; + } + ret = container->next; + container->pool_free--; + return ret; +} + +uint8_t parse_list_response(const char *data, size_t length, struct ms3_list_container_st *list_container, + uint8_t list_version, + char **continuation) +{ + struct xml_document *doc; + struct xml_node *root; + struct xml_node *node; + struct xml_node *child; + char *filename = NULL; + char *filesize = NULL; + char *filedate = NULL; + size_t size = 0; + struct tm ttmp = {0}; + time_t tout = 0; + bool truncated = false; + const char *last_key = NULL; + ms3_list_st *nextptr = NULL, *lastptr = list_container->next; + uint64_t node_it = 0; + + // Empty list + if (!data || !length) + { + return 0; + } + + doc = xml_parse_document((uint8_t*)data, length); + + if (!doc) + { + return MS3_ERR_RESPONSE_PARSE; + } + + /* For version 1: + * If IsTruncated is set, get the last key in the list, this will be used as + * "marker" in the next request. 
+ * For version 2: + * If NextContinuationToken is set, use this for the next request + * + * We use the "continuation" return value for both + */ + + root = xml_document_root(doc); + // First node is ListBucketResponse + node = xml_node_child(root, 0); + + do + { + if (!xml_node_name_cmp(node, "NextContinuationToken")) + { + struct xml_string *content = xml_node_content(node); + *continuation = ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, (uint8_t*)*continuation, xml_string_length(content)); + continue; + } + + if (list_version == 1) + { + if (!xml_node_name_cmp(node, "IsTruncated")) + { + struct xml_string *content = xml_node_content(node); + char *trunc_value = ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, (uint8_t*)trunc_value, xml_string_length(content)); + + if (!strcmp(trunc_value, "true")) + { + truncated = true; + } + + ms3_cfree(trunc_value); + continue; + } + } + + if (!xml_node_name_cmp(node, "Contents")) + { + bool skip = false; + uint64_t child_it = 0; + // Found contents + child = xml_node_child(node, 0); + + do + { + if (!xml_node_name_cmp(child, "Key")) + { + struct xml_string *content = xml_node_content(child); + filename = ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, (uint8_t*)filename, xml_string_length(content)); + + ms3debug("Filename: %s", filename); + + if (filename[strlen((const char *)filename) - 1] == '/') + { + skip = true; + ms3_cfree(filename); + break; + } + + continue; + } + + if (!xml_node_name_cmp(child, "Size")) + { + struct xml_string *content = xml_node_content(child); + filesize = ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, (uint8_t*)filesize, xml_string_length(content)); + + ms3debug("Size: %s", filesize); + size = strtoull((const char *)filesize, NULL, 10); + ms3_cfree(filesize); + continue; + } + + if (!xml_node_name_cmp(child, "LastModified")) + { + struct xml_string *content = xml_node_content(child); + filedate = 
ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, (uint8_t*)filedate, xml_string_length(content)); + + ms3debug("Date: %s", filedate); + strptime((const char *)filedate, "%Y-%m-%dT%H:%M:%SZ", &ttmp); + tout = mktime(&ttmp); + ms3_cfree(filedate); + continue; + } + } + while ((child = xml_node_child(node, ++child_it))); + + if (!skip) + { + nextptr = get_next_list_ptr(list_container); + nextptr->next = NULL; + + if (lastptr) + { + lastptr->next = nextptr; + } + lastptr = nextptr; + + if (filename) + { + nextptr->key = (char *)filename; + + if (list_version == 1) + { + last_key = nextptr->key; + } + } + else + { + nextptr->key = NULL; + } + + nextptr->length = size; + nextptr->created = tout; + } + + continue; + } + + if (!xml_node_name_cmp(node, "CommonPrefixes")) + { + child = xml_node_child(node, 0); + + if (!xml_node_name_cmp(child, "Prefix")) + { + struct xml_string *content = xml_node_content(child); + filename = ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, (uint8_t*)filename, xml_string_length(content)); + + ms3debug("Filename: %s", filename); + nextptr = get_next_list_ptr(list_container); + nextptr->next = NULL; + + if (lastptr) + { + lastptr->next = nextptr; + } + lastptr = nextptr; + + nextptr->key = (char *)filename; + nextptr->length = 0; + nextptr->created = 0; + } + } + + } + while ((node = xml_node_child(root, ++node_it))); + + if (list_version == 1 && truncated && last_key) + { + *continuation = ms3_cstrdup(last_key); + } + + xml_document_free(doc, false); + return 0; +} + +uint8_t parse_role_list_response(const char *data, size_t length, char *role_name, char *arn, char **continuation) +{ + struct xml_document *doc; + struct xml_node *root; + struct xml_node *list_role_result; + struct xml_node *child; + struct xml_node *roles; + struct xml_node *member; + + char *response_role_name = NULL; + char *response_role_arn = NULL; + uint64_t node_it = 0; + + // Empty list + if (!data || !length) + { + return 
0; + } + + doc = xml_parse_document((uint8_t*)data, length); + + if (!doc) + { + return MS3_ERR_RESPONSE_PARSE; + } + + root = xml_document_root(doc); + // First node is listRoleResponse + list_role_result = xml_node_child(root, 0); + child = xml_node_child(list_role_result, 0); + + do + { + + if (!xml_node_name_cmp(child, "Marker")) + { + struct xml_string *content = xml_node_content(child); + *continuation = ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, (uint8_t*)*continuation, xml_string_length(content)); + continue; + } + + if (!xml_node_name_cmp(child, "Roles")) + { + uint64_t child_it = 0; + // Found contents + roles = xml_node_child(child, 0); + do + { + // go down one more child to get members + uint64_t roles_it = 0; + member = xml_node_child(roles, 0); + do + { + if (!xml_node_name_cmp(member, "RoleName")) + { + struct xml_string *content = xml_node_content(member); + response_role_name = ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, (uint8_t*)response_role_name, xml_string_length(content)); + continue; + } + if (!xml_node_name_cmp(member, "Arn")) + { + struct xml_string *content = xml_node_content(member); + response_role_arn = ms3_cmalloc(xml_string_length(content) + 1); + xml_string_copy(content, (uint8_t*)response_role_arn, xml_string_length(content)); + continue; + } + } + while ((member = xml_node_child(roles, ++roles_it))); + if (!strcmp(response_role_name, role_name)) + { + ms3debug("Role Found ARN = %s",response_role_arn); + sprintf(arn, "%s", response_role_arn); + ms3_cfree(response_role_name); + ms3_cfree(response_role_arn); + xml_document_free(doc, false); + return MS3_ERR_NONE; + } + ms3_cfree(response_role_name); + ms3_cfree(response_role_arn); + } + while ((roles = xml_node_child(child, ++child_it))); + } + } + while ((child = xml_node_child(list_role_result, ++node_it))); + + xml_document_free(doc, false); + return MS3_ERR_NOT_FOUND; +} + +uint8_t parse_assume_role_response(const char 
*data, size_t length, char *assume_role_key, char *assume_role_secret, char *assume_role_token) +{ + struct xml_document *doc; + struct xml_node *root; + struct xml_node *assume_role_result; + struct xml_node *child; + struct xml_node *credentials; + uint64_t node_it = 0; + + // Empty list + if (!data || !length) + { + return 0; + } + + doc = xml_parse_document((uint8_t*)data, length); + + if (!doc) + { + return MS3_ERR_RESPONSE_PARSE; + } + + root = xml_document_root(doc); + // First node is AssumeRoleResponse + assume_role_result = xml_node_child(root, 0); + child = xml_node_child(assume_role_result, 0); + + do + { + if (!xml_node_name_cmp(child, "Credentials")) + { + uint64_t child_it = 0; + // Found contents + credentials = xml_node_child(child, 0); + do + { + if (!xml_node_name_cmp(credentials, "AccessKeyId")) + { + struct xml_string *content = xml_node_content(credentials); + size_t content_length = xml_string_length(content); + assume_role_key[0] = '\0'; + + if (content_length >= 128) + { + ms3debug("AccessKeyId error length = %zu", content_length); + xml_document_free(doc, false); + return MS3_ERR_AUTH_ROLE; + } + xml_string_copy(content, (uint8_t*)assume_role_key, content_length); + + continue; + } + if (!xml_node_name_cmp(credentials, "SecretAccessKey")) + { + struct xml_string *content = xml_node_content(credentials); + size_t content_length = xml_string_length(content); + assume_role_secret[0] = '\0'; + + if (content_length >= 1024) + { + ms3debug("SecretAccessKey error length = %zu", content_length); + xml_document_free(doc, false); + return MS3_ERR_AUTH_ROLE; + } + xml_string_copy(content, (uint8_t*)assume_role_secret, content_length); + + continue; + } + if (!xml_node_name_cmp(credentials, "SessionToken")) + { + struct xml_string *content = xml_node_content(credentials); + size_t content_length = xml_string_length(content); + assume_role_token[0] = '\0'; + + if (content_length >= 2048) + { + ms3debug("SessionToken error length = %zu", 
content_length); + xml_document_free(doc, false); + return MS3_ERR_AUTH_ROLE; + } + xml_string_copy(content, (uint8_t*)assume_role_token, content_length); + + continue; + } + } + while ((credentials = xml_node_child(child, ++child_it))); + } + } + while ((child = xml_node_child(assume_role_result, ++node_it))); + + xml_document_free(doc, false); + + return MS3_ERR_NONE; +} diff --git a/storage/maria/libmarias3/src/response.h b/storage/maria/libmarias3/src/response.h new file mode 100644 index 00000000..1d8dd907 --- /dev/null +++ b/storage/maria/libmarias3/src/response.h @@ -0,0 +1,32 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#pragma once + +#include "config.h" +#include <stdint.h> + +char *parse_error_message(const char *data, size_t length); + +uint8_t parse_list_response(const char *data, size_t length, + struct ms3_list_container_st *list_container, uint8_t list_version, char **continuation); + +uint8_t parse_role_list_response(const char *data, size_t length, char *role_name, char* arn, char **continuation); + +uint8_t parse_assume_role_response(const char *data, size_t length, char *assume_role_key, char *assume_role_secret, char *assume_role_token); diff --git a/storage/maria/libmarias3/src/sha256-internal.c b/storage/maria/libmarias3/src/sha256-internal.c new file mode 100644 index 00000000..3be8aa3c --- /dev/null +++ b/storage/maria/libmarias3/src/sha256-internal.c @@ -0,0 +1,251 @@ +/* + * SHA-256 hash implementation and interface functions + * Copyright (c) 2003-2011, Jouni Malinen <j@w1.fi> + * + * This software may be distributed under the terms of the BSD license. + * See README for more details. 
+ */ + +#include "sha256.h" +#include "sha256_i.h" + + +/** + * sha256_vector - SHA256 hash for data vector + * @num_elem: Number of elements in the data vector + * @addr: Pointers to the data areas + * @len: Lengths of the data blocks + * @mac: Buffer for the hash + * Returns: 0 on success, -1 of failure + */ +int sha256_vector(size_t num_elem, const uint8_t *addr[], const size_t *len, + uint8_t *mac) +{ + struct sha256_state ctx; + size_t i; + + sha256_init(&ctx); + + for (i = 0; i < num_elem; i++) + if (sha256_process(&ctx, addr[i], len[i])) + return -1; + + if (sha256_done(&ctx, mac)) + return -1; + + return 0; +} + + +/* ===== start - public domain SHA256 implementation ===== */ + +/* This is based on SHA256 implementation in LibTomCrypt that was released into + * public domain by Tom St Denis. */ + +/* the K array */ +static const uint32_t K[64] = +{ + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, + 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL, + 0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, + 0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 0x983e5152UL, + 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL, + 0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, + 0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL, + 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 0x19a4c116UL, 0x1e376c08UL, + 0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, + 0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL +}; + + +/* Various logical functions */ +#define RORc(x, y) \ +( ((((uint32_t) (x) & 0xFFFFFFFFUL) >> (uint32_t) ((y) & 31)) | \ + ((uint32_t) (x) << (uint32_t) (32 - ((y) & 31)))) & 0xFFFFFFFFUL) +#define 
Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S(x, n) RORc((x), (n)) +#define R(x, n) (((x)&0xFFFFFFFFUL)>>(n)) +#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) +#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) +#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) +#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) +#ifndef MIN +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#endif + +/* compress 512-bits */ +static int sha256_compress(struct sha256_state *md, unsigned char *buf) +{ + uint32_t S[8], W[64]; + int i; + + /* copy state into S */ + for (i = 0; i < 8; i++) + { + S[i] = md->state[i]; + } + + /* copy the state into 512-bits into W[0..15] */ + for (i = 0; i < 16; i++) + W[i] = WPA_GET_BE32(buf + (4 * i)); + + /* fill W[16..63] */ + for (i = 16; i < 64; i++) + { + W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + + W[i - 16]; + } + + /* Compress */ +#define RND(a,b,c,d,e,f,g,h,i) \ + uint32_t t0, t1; \ + t0 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i]; \ + t1 = Sigma0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + + for (i = 0; i < 64; ++i) + { + uint32_t t; + RND(S[0], S[1], S[2], S[3], S[4], S[5], S[6], S[7], i); + t = S[7]; + S[7] = S[6]; + S[6] = S[5]; + S[5] = S[4]; + S[4] = S[3]; + S[3] = S[2]; + S[2] = S[1]; + S[1] = S[0]; + S[0] = t; + } + + /* feedback */ + for (i = 0; i < 8; i++) + { + md->state[i] = md->state[i] + S[i]; + } + + return 0; +} + + +/* Initialize the hash state */ +void sha256_init(struct sha256_state *md) +{ + md->curlen = 0; + md->length = 0; + md->state[0] = 0x6A09E667UL; + md->state[1] = 0xBB67AE85UL; + md->state[2] = 0x3C6EF372UL; + md->state[3] = 0xA54FF53AUL; + md->state[4] = 0x510E527FUL; + md->state[5] = 0x9B05688CUL; + md->state[6] = 0x1F83D9ABUL; + md->state[7] = 0x5BE0CD19UL; +} + +/** + Process a block of memory though the hash + @param md The hash state + @param in The data to hash + @param inlen The length of the data (octets) + @return CRYPT_OK if successful +*/ +int 
sha256_process(struct sha256_state *md, const unsigned char *in, + unsigned long inlen) +{ + unsigned long n; + + if (md->curlen >= sizeof(md->buf)) + return -1; + + while (inlen > 0) + { + if (md->curlen == 0 && inlen >= SHA256_BLOCK_SIZE) + { + if (sha256_compress(md, (unsigned char *) in) < 0) + return -1; + + md->length += SHA256_BLOCK_SIZE * 8; + in += SHA256_BLOCK_SIZE; + inlen -= SHA256_BLOCK_SIZE; + } + else + { + n = MIN(inlen, (SHA256_BLOCK_SIZE - md->curlen)); + memcpy(md->buf + md->curlen, in, n); + md->curlen += n; + in += n; + inlen -= n; + + if (md->curlen == SHA256_BLOCK_SIZE) + { + if (sha256_compress(md, md->buf) < 0) + return -1; + + md->length += 8 * SHA256_BLOCK_SIZE; + md->curlen = 0; + } + } + } + + return 0; +} + + +/** + Terminate the hash to get the digest + @param md The hash state + @param out [out] The destination of the hash (32 bytes) + @return CRYPT_OK if successful +*/ +int sha256_done(struct sha256_state *md, unsigned char *out) +{ + int i; + + if (md->curlen >= sizeof(md->buf)) + return -1; + + /* increase the length of the message */ + md->length += md->curlen * 8; + + /* append the '1' bit */ + md->buf[md->curlen++] = (unsigned char) 0x80; + + /* if the length is currently above 56 bytes we append zeros + * then compress. Then we can fall back to padding zeros and length + * encoding like normal. 
+ */ + if (md->curlen > 56) + { + while (md->curlen < SHA256_BLOCK_SIZE) + { + md->buf[md->curlen++] = (unsigned char) 0; + } + + sha256_compress(md, md->buf); + md->curlen = 0; + } + + /* pad up to 56 bytes of zeroes */ + while (md->curlen < 56) + { + md->buf[md->curlen++] = (unsigned char) 0; + } + + /* store length */ + WPA_PUT_BE64(md->buf + 56, md->length); + sha256_compress(md, md->buf); + + /* copy output */ + for (i = 0; i < 8; i++) + WPA_PUT_BE32(out + (4 * i), md->state[i]); + + return 0; +} + +/* ===== end - public domain SHA256 implementation ===== */ diff --git a/storage/maria/libmarias3/src/sha256.c b/storage/maria/libmarias3/src/sha256.c new file mode 100644 index 00000000..8a28f906 --- /dev/null +++ b/storage/maria/libmarias3/src/sha256.c @@ -0,0 +1,131 @@ +/* + * SHA-256 hash implementation and interface functions + * Copyright (c) 2003-2012, Jouni Malinen <j@w1.fi> + * + * This software may be distributed under the terms of the BSD license. + * See README for more details. 
+ */ + +#include "sha256.h" +#include "sha256_i.h" + +/** + * sha256 - SHA256 hash for data vector + * @num_elem: Number of elements in the data vector + * @addr: Pointer to the data areas + * @len: Length of the data blocks + * @mac: Buffer for the hash + * Returns: 0 on success, -1 of failure + */ +int sha256(const uint8_t *addr, const size_t len, uint8_t *mac) +{ + struct sha256_state ctx; + + sha256_init(&ctx); + + if (sha256_process(&ctx, addr, len)) + return -1; + + if (sha256_done(&ctx, mac)) + return -1; + + return 0; +} + +/** + * hmac_sha256_vector - HMAC-SHA256 over data vector (RFC 2104) + * @key: Key for HMAC operations + * @key_len: Length of the key in bytes + * @num_elem: Number of elements in the data vector + * @addr: Pointers to the data areas + * @len: Lengths of the data blocks + * @mac: Buffer for the hash (32 bytes) + * Returns: 0 on success, -1 on failure + */ +int hmac_sha256_vector(const uint8_t *key, size_t key_len, size_t num_elem, + const uint8_t *addr[], const size_t *len, uint8_t *mac) +{ + unsigned char k_pad[64]; /* padding - key XORd with ipad/opad */ + unsigned char tk[32]; + const uint8_t *_addr[6]; + size_t _len[6], i; + + if (num_elem > 5) + { + /* + * Fixed limit on the number of fragments to avoid having to + * allocate memory (which could fail). 
+ */ + return -1; + } + + /* if key is longer than 64 bytes reset it to key = SHA256(key) */ + if (key_len > 64) + { + if (sha256_vector(1, &key, &key_len, tk) < 0) + return -1; + + key = tk; + key_len = 32; + } + + /* the HMAC_SHA256 transform looks like: + * + * SHA256(K XOR opad, SHA256(K XOR ipad, text)) + * + * where K is an n byte key + * ipad is the byte 0x36 repeated 64 times + * opad is the byte 0x5c repeated 64 times + * and text is the data being protected */ + + /* start out by storing key in ipad */ + memset(k_pad, 0, sizeof(k_pad)); + memcpy(k_pad, key, key_len); + + /* XOR key with ipad values */ + for (i = 0; i < 64; i++) + k_pad[i] ^= 0x36; + + /* perform inner SHA256 */ + _addr[0] = k_pad; + _len[0] = 64; + + for (i = 0; i < num_elem; i++) + { + _addr[i + 1] = addr[i]; + _len[i + 1] = len[i]; + } + + if (sha256_vector(1 + num_elem, _addr, _len, mac) < 0) + return -1; + + memset(k_pad, 0, sizeof(k_pad)); + memcpy(k_pad, key, key_len); + + /* XOR key with opad values */ + for (i = 0; i < 64; i++) + k_pad[i] ^= 0x5c; + + /* perform outer SHA256 */ + _addr[0] = k_pad; + _len[0] = 64; + _addr[1] = mac; + _len[1] = SHA256_MAC_LEN; + return sha256_vector(2, _addr, _len, mac); +} + + +/** + * hmac_sha256 - HMAC-SHA256 over data buffer (RFC 2104) + * @key: Key for HMAC operations + * @key_len: Length of the key in bytes + * @data: Pointers to the data area + * @data_len: Length of the data area + * @mac: Buffer for the hash (32 bytes) + * Returns: 0 on success, -1 on failure + */ +int hmac_sha256(const uint8_t *key, size_t key_len, const uint8_t *data, + size_t data_len, uint8_t *mac) +{ + return hmac_sha256_vector(key, key_len, 1, &data, &data_len, mac); +} diff --git a/storage/maria/libmarias3/src/sha256.h b/storage/maria/libmarias3/src/sha256.h new file mode 100644 index 00000000..45457b9b --- /dev/null +++ b/storage/maria/libmarias3/src/sha256.h @@ -0,0 +1,25 @@ +/* + * SHA256 hash implementation and interface functions + * Copyright (c) 2003-2016, 
Jouni Malinen <j@w1.fi> + * + * This software may be distributed under the terms of the BSD license. + * See README for more details. + */ + +#ifndef SHA256_H +#define SHA256_H + +#define SHA256_MAC_LEN 32 + +#include <stdint.h> +#include <stddef.h> +#include <string.h> + +int hmac_sha256_vector(const uint8_t *key, size_t key_len, size_t num_elem, + const uint8_t *addr[], const size_t *len, uint8_t *mac); +int hmac_sha256(const uint8_t *key, size_t key_len, const uint8_t *data, + size_t data_len, uint8_t *mac); + +int sha256(const uint8_t *addr, const size_t len, uint8_t *mac); + +#endif /* SHA256_H */ diff --git a/storage/maria/libmarias3/src/sha256_i.h b/storage/maria/libmarias3/src/sha256_i.h new file mode 100644 index 00000000..9b8aa2b7 --- /dev/null +++ b/storage/maria/libmarias3/src/sha256_i.h @@ -0,0 +1,66 @@ +/* + * SHA-256 internal definitions + * Copyright (c) 2003-2011, Jouni Malinen <j@w1.fi> + * + * This software may be distributed under the terms of the BSD license. + * See README for more details. 
+ */ + +#ifndef SHA256_I_H +#define SHA256_I_H + +#define SHA256_BLOCK_SIZE 64 + +#include <stdint.h> + +struct sha256_state +{ + uint64_t length; + uint32_t state[8], curlen; + uint8_t buf[SHA256_BLOCK_SIZE]; +}; + +void sha256_init(struct sha256_state *md); +int sha256_process(struct sha256_state *md, const unsigned char *in, + unsigned long inlen); +int sha256_done(struct sha256_state *md, unsigned char *out); + +/** + * sha256_vector - SHA256 hash for data vector + * @num_elem: Number of elements in the data vector + * @addr: Pointers to the data areas + * @len: Lengths of the data blocks + * @mac: Buffer for the hash + * Returns: 0 on success, -1 on failure + */ +int sha256_vector(size_t num_elem, const uint8_t *addr[], const size_t *len, + uint8_t *mac); + +static inline void WPA_PUT_BE64(uint8_t *a, uint64_t val) +{ + a[0] = val >> 56; + a[1] = val >> 48; + a[2] = val >> 40; + a[3] = val >> 32; + a[4] = val >> 24; + a[5] = val >> 16; + a[6] = val >> 8; + a[7] = val & 0xff; +} + + +static inline uint32_t WPA_GET_BE32(const uint8_t *a) +{ + return ((uint32_t) a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]; +} + +static inline void WPA_PUT_BE32(uint8_t *a, uint32_t val) +{ + a[0] = (val >> 24) & 0xff; + a[1] = (val >> 16) & 0xff; + a[2] = (val >> 8) & 0xff; + a[3] = val & 0xff; +} + + +#endif /* SHA256_I_H */ diff --git a/storage/maria/libmarias3/src/structs.h b/storage/maria/libmarias3/src/structs.h new file mode 100644 index 00000000..34cbd817 --- /dev/null +++ b/storage/maria/libmarias3/src/structs.h @@ -0,0 +1,83 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#pragma once + +#include "config.h" + +struct ms3_pool_alloc_list_st +{ + struct ms3_list_st *pool; + struct ms3_pool_alloc_list_st *prev; +}; + +struct ms3_list_container_st +{ + struct ms3_list_st *pool; + struct ms3_list_st *start; + struct ms3_pool_alloc_list_st *pool_list; + struct ms3_list_st *next; + size_t pool_free; +}; + +struct ms3_st +{ + char *s3key; + char *s3secret; + char *region; + char *base_domain; + int port; // 0 means "Use default" + + char *sts_endpoint; + char *sts_region; + char *iam_endpoint; + char *iam_role; + char *role_key; + char *role_secret; + char *role_session_token; + char *iam_role_arn; + size_t role_session_duration; + + size_t buffer_chunk_size; + CURL *curl; + char *last_error; + bool use_http; + bool disable_verification; + uint8_t list_version; + uint8_t protocol_version; + bool first_run; + char *path_buffer; + char *query_buffer; + struct ms3_list_container_st list_container; +}; + +struct memory_buffer_st +{ + uint8_t *data; + size_t length; + size_t alloced; + size_t buffer_chunk_size; +}; + +struct put_buffer_st +{ + const uint8_t *data; + size_t length; + size_t offset; +}; diff --git a/storage/maria/libmarias3/src/xml.c b/storage/maria/libmarias3/src/xml.c new file mode 100644 index 00000000..2c48a4ea --- /dev/null +++ b/storage/maria/libmarias3/src/xml.c @@ -0,0 +1,1157 @@ +/** + * Copyright (c) 2012 ooxi/xml.c + * https://github.com/ooxi/xml.c + * + * This software is provided 'as-is', without any express or implied warranty. 
+ * In no event will the authors be held liable for any damages arising from the + * use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software in a + * product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * + * 3. This notice may not be removed or altered from any source distribution. + */ + +#include "config.h" + +#ifdef XML_PARSER_VERBOSE +#include <alloca.h> +#endif + +#include <ctype.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#include "common.h" +#include "xml.h" + + +/* + * public domain strtok_r() by Charlie Gordon + * + * from comp.lang.c 9/14/2007 + * + * http://groups.google.com/group/comp.lang.c/msg/2ab1ecbb86646684 + * + * (Declaration that it's public domain): + * http://groups.google.com/group/comp.lang.c/msg/7c7b39328fefab9c + */ +static char* xml_strtok_r(char *str, const char *delim, char **nextp) { + char *ret; + + if (str == NULL) { + str = *nextp; + } + + str += strspn(str, delim); + + if (*str == '\0') { + return NULL; + } + + ret = str; + + str += strcspn(str, delim); + + if (*str) { + *str++ = '\0'; + } + + *nextp = str; + + return ret; +} + + + + + + +/** + * [OPAQUE API] + * + * UTF-8 text + */ +struct xml_string { + uint8_t const* buffer; + size_t length; +}; + +/** + * [OPAQUE API] + * + * An xml_attribute may contain text content. 
+ */ +struct xml_attribute { + struct xml_string* name; + struct xml_string* content; +}; + +/** + * [OPAQUE API] + * + * An xml_node will always contain a tag name, a 0-terminated list of attributes + * and a 0-terminated list of children. Moreover it may contain text content. + */ +struct xml_node { + struct xml_string* name; + struct xml_string* content; + struct xml_attribute** attributes; + struct xml_node** children; +}; + +/** + * [OPAQUE API] + * + * An xml_document simply contains the root node and the underlying buffer + */ +struct xml_document { + struct { + uint8_t* buffer; + size_t length; + } buffer; + + struct xml_node* root; +}; + + + + + +/** + * [PRIVATE] + * + * Parser context + */ +struct xml_parser { + uint8_t* buffer; + size_t position; + size_t length; +}; + +/** + * [PRIVATE] + * + * Character offsets + */ +enum xml_parser_offset { + NO_CHARACTER = -1, + CURRENT_CHARACTER = 0, + NEXT_CHARACTER = 1, +}; + + + + + +/** + * [PRIVATE] + * + * @return Number of attributes in 0-terminated array + */ +static size_t get_zero_terminated_array_attributes(struct xml_attribute** attributes) { + size_t elements = 0; + + while (attributes[elements]) { + ++elements; + } + + return elements; +} + + + +/** + * [PRIVATE] + * + * @return Number of nodes in 0-terminated array + */ +static size_t get_zero_terminated_array_nodes(struct xml_node** nodes) { + size_t elements = 0; + + while (nodes[elements]) { + ++elements; + } + + return elements; +} + + + +/** + * [PRIVATE] + * + * @warning No UTF conversions will be attempted + * + * @return true iff a == b + */ +static _Bool xml_string_equals(struct xml_string* a, struct xml_string* b) { + + size_t i = 0; + if (a->length != b->length) { + return false; + } + + for (; i < a->length; ++i) { + if (a->buffer[i] != b->buffer[i]) { + return false; + } + } + + return true; +} + + + +/** + * [PRIVATE] + */ +static uint8_t* xml_string_clone(struct xml_string* s) { + uint8_t* clone; + if (!s) { + return 0; + } + + clone = 
ms3_ccalloc(s->length + 1, sizeof(uint8_t)); + + xml_string_copy(s, clone, s->length); + clone[s->length] = 0; + + return clone; +} + + + +/** + * [PRIVATE] + * + * Frees the resources allocated by the string + * + * @warning `buffer` must _not_ be freed, since it is a reference to the + * document's buffer + */ +static void xml_string_free(struct xml_string* string) { + ms3_cfree(string); +} + + + +/** + * [PRIVATE] + * + * Frees the resources allocated by the attribute + */ +static void xml_attribute_free(struct xml_attribute* attribute) { + if(attribute->name) { + xml_string_free(attribute->name); + } + if(attribute->content) { + xml_string_free(attribute->content); + } + ms3_cfree(attribute); +} + +/** + * [PRIVATE] + * + * Frees the resources allocated by the node + */ +static void xml_node_free(struct xml_node* node) { + struct xml_attribute** at; + struct xml_node** it; + + xml_string_free(node->name); + + if (node->content) { + xml_string_free(node->content); + } + + at = node->attributes; + while(*at) { + xml_attribute_free(*at); + ++at; + } + ms3_cfree(node->attributes); + + it = node->children; + while (*it) { + xml_node_free(*it); + ++it; + } + ms3_cfree(node->children); + + ms3_cfree(node); +} + + + +/** + * [PRIVATE] + * + * Echos the parsers call stack for debugging purposes + */ +#ifdef XML_PARSER_VERBOSE +static void xml_parser_info(struct xml_parser* parser, char const* message) { + fprintf(stdout, "xml_parser_info %s\n", message); +} +#else +#define xml_parser_info(parser, message) {} +#endif + + + +/** + * [PRIVATE] + * + * Echos an error regarding the parser's source to the console + */ + +#define tmp_min(X,Y) ((X) < (Y) ? 
(X) : (Y)) + + +static void xml_parser_error(struct xml_parser* parser, enum xml_parser_offset offset, char const* message) { + int row = 0; + int column = 0; + + size_t character = tmp_min(parser->length, parser->position + offset); + size_t position = 0; for (; position < character; ++position) { + column++; + + if ('\n' == parser->buffer[position]) { + row++; + column = 0; + } + } + + if (NO_CHARACTER != offset) { + fprintf(stderr, "xml_parser_error at %i:%i (is %c): %s\n", + row + 1, column, parser->buffer[character], message + ); + } else { + fprintf(stderr, "xml_parser_error at %i:%i: %s\n", + row + 1, column, message + ); + } +} + + + +/** + * [PRIVATE] + * + * Returns the n-th not-whitespace byte in parser and 0 if such a byte does not + * exist + */ +static uint8_t xml_parser_peek(struct xml_parser* parser, size_t n) { + size_t position = parser->position; + + while (position < parser->length) { + if (!isspace(parser->buffer[position])) { + if (n == 0) { + return parser->buffer[position]; + } else { + --n; + } + } + + position++; + } + + return 0; +} + + + +/** + * [PRIVATE] + * + * Moves the parser's position n bytes. If the new position would be out of + * bounds, it will be converted to the bounds itself + */ +static void xml_parser_consume(struct xml_parser* parser, size_t n) { + + /* Debug information + */ + #ifdef XML_PARSER_VERBOSE + #define min(X,Y) ((X) < (Y) ? 
(X) : (Y)) + char* consumed = alloca((n + 1) * sizeof(char)); + memcpy(consumed, &parser->buffer[parser->position], min(n, parser->length - parser->position)); + consumed[n] = 0; + #undef min + + size_t message_buffer_length = 512; + char* message_buffer = alloca(512 * sizeof(char)); + snprintf(message_buffer, message_buffer_length, "Consuming %li bytes \"%s\"", (long)n, consumed); + message_buffer[message_buffer_length - 1] = 0; + + xml_parser_info(parser, message_buffer); + #endif + + + /* Move the position forward + */ + parser->position += n; + + /* Don't go too far + * + * @warning Valid because parser->length must be greater than 0 + */ + if (parser->position >= parser->length) { + parser->position = parser->length - 1; + } +} + + + +/** + * [PRIVATE] + * + * Skips to the next non-whitespace character + */ +static void xml_skip_whitespace(struct xml_parser* parser) { + xml_parser_info(parser, "whitespace"); + + while (isspace(parser->buffer[parser->position])) { + if (parser->position + 1 >= parser->length) { + return; + } else { + parser->position++; + } + } +} + + + +/** + * [PRIVATE] + * + * Finds and creates all attributes on the given node. 
+ * + * @author Blake Felt + * @see https://github.com/Molorius + */ +static struct xml_attribute** xml_find_attributes(struct xml_parser* parser, struct xml_string* tag_open) { + char* tmp; + char* rest = NULL; + char* token; + char* str_name; + char* str_content; + const unsigned char* start_name; + const unsigned char* start_content; + size_t old_elements; + size_t new_elements; + struct xml_attribute* new_attribute; + struct xml_attribute** attributes; + long position; + + (void) parser; // clang for some reason thinks this isn't used + xml_parser_info(parser, "find_attributes"); + attributes = ms3_ccalloc(1, sizeof(struct xml_attribute*)); + attributes[0] = 0; + + tmp = (char*) xml_string_clone(tag_open); + + token = xml_strtok_r(tmp, " ", &rest); // skip the first value + if(token == NULL) { + goto cleanup; + } + tag_open->length = strlen(token); + + for(token=xml_strtok_r(NULL," ", &rest); token!=NULL; token=xml_strtok_r(NULL," ", &rest)) { + str_name = ms3_cmalloc(strlen(token)+1); + str_content = ms3_cmalloc(strlen(token)+1); + // %s=\"%s\" wasn't working for some reason, ugly hack to make it work + if(sscanf(token, "%[^=]=\"%[^\"]", str_name, str_content) != 2) { + if(sscanf(token, "%[^=]=\'%[^\']", str_name, str_content) != 2) { + ms3_cfree(str_name); + ms3_cfree(str_content); + continue; + } + } + position = token-tmp; + start_name = &tag_open->buffer[position]; + start_content = &tag_open->buffer[position + strlen(str_name) + 2]; + + new_attribute = ms3_cmalloc(sizeof(struct xml_attribute)); + new_attribute->name = ms3_cmalloc(sizeof(struct xml_string)); + new_attribute->name->buffer = (unsigned char*)start_name; + new_attribute->name->length = strlen(str_name); + new_attribute->content = ms3_cmalloc(sizeof(struct xml_string)); + new_attribute->content->buffer = (unsigned char*)start_content; + new_attribute->content->length = strlen(str_content); + + old_elements = get_zero_terminated_array_attributes(attributes); + new_elements = old_elements + 1; + 
attributes = ms3_crealloc(attributes, (new_elements+1)*sizeof(struct xml_attributes*)); + + attributes[new_elements-1] = new_attribute; + attributes[new_elements] = 0; + + + ms3_cfree(str_name); + ms3_cfree(str_content); + } + +cleanup: + ms3_cfree(tmp); + return attributes; +} + + + +/** + * [PRIVATE] + * + * Parses the name out of the an XML tag's ending + * + * ---( Example )--- + * tag_name> + * --- + */ +static struct xml_string* xml_parse_tag_end(struct xml_parser* parser) { + size_t start; + size_t length = 0; + struct xml_string* name; + + xml_parser_info(parser, "tag_end"); + start = parser->position; + + /* Parse until `>' or a whitespace is reached + */ + while (start + length < parser->length) { + uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER); + + if (('>' == current) || isspace(current)) { + break; + } else { + xml_parser_consume(parser, 1); + length++; + } + } + + /* Consume `>' + */ + if ('>' != xml_parser_peek(parser, CURRENT_CHARACTER)) { + xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_end::expected tag end"); + return 0; + } + xml_parser_consume(parser, 1); + + /* Return parsed tag name + */ + name = ms3_cmalloc(sizeof(struct xml_string)); + name->buffer = &parser->buffer[start]; + name->length = length; + return name; +} + +/** + * [PRIVATE] + * + * Parses an opening XML tag without attributes + * + * ---( Example )--- + * <tag_name> + * --- + */ +static struct xml_string* xml_parse_tag_open(struct xml_parser* parser) { + xml_parser_info(parser, "tag_open"); + xml_skip_whitespace(parser); + + /* Consume `<' + */ + if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) { + xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_open::expected opening tag"); + return 0; + } + xml_parser_consume(parser, 1); + + /* Consume tag name + */ + return xml_parse_tag_end(parser); +} + + + +/** + * [PRIVATE] + * + * Parses an closing XML tag without attributes + * + * ---( Example )--- + * </tag_name> + * --- + */ +static struct 
xml_string* xml_parse_tag_close(struct xml_parser* parser) { + xml_parser_info(parser, "tag_close"); + xml_skip_whitespace(parser); + + /* Consume `</' + */ + if ( ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) + || ('/' != xml_parser_peek(parser, NEXT_CHARACTER))) { + + if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) { + xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_tag_close::expected closing tag `<'"); + } + if ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) { + xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_tag_close::expected closing tag `/'"); + } + + return 0; + } + xml_parser_consume(parser, 2); + + /* Consume tag name + */ + return xml_parse_tag_end(parser); +} + + + +/** + * [PRIVATE] + * + * Parses a tag's content + * + * ---( Example )--- + * this is + * a + * tag {} content + * --- + * + * @warning CDATA etc. is _not_ and will never be supported + */ +static struct xml_string* xml_parse_content(struct xml_parser* parser) { + size_t start; + size_t length = 0; + struct xml_string* content; + + xml_parser_info(parser, "content"); + + /* Whitespace will be ignored + */ + xml_skip_whitespace(parser); + + start = parser->position; + + /* Consume until `<' is reached + */ + while (start + length < parser->length) { + uint8_t current = xml_parser_peek(parser, CURRENT_CHARACTER); + + if ('<' == current) { + break; + } else { + xml_parser_consume(parser, 1); + length++; + } + } + + /* Next character must be an `<' or we have reached end of file + */ + if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) { + xml_parser_error(parser, CURRENT_CHARACTER, "xml_parse_content::expected <"); + return 0; + } + + /* Ignore tailing whitespace + */ + while ((length > 0) && isspace(parser->buffer[start + length - 1])) { + length--; + } + + /* Return text + */ + content = ms3_cmalloc(sizeof(struct xml_string)); + content->buffer = &parser->buffer[start]; + content->length = length; + return content; +} + + + +/** + * [PRIVATE] + * + * Parses an 
XML fragment node + * + * ---( Example without children )--- + * <Node>Text</Node> + * --- + * + * ---( Example with children )--- + * <Parent> + * <Child>Text</Child> + * <Child>Text</Child> + * <Test>Content</Test> + * </Parent> + * --- + */ +static struct xml_node* xml_parse_node(struct xml_parser* parser) { + /* Setup variables + */ + struct xml_string* tag_open = 0; + struct xml_string* tag_close = 0; + struct xml_string* content = 0; + struct xml_node* node; + struct xml_node** it; + size_t original_length; + struct xml_attribute** attributes; + struct xml_node** children = ms3_ccalloc(1, sizeof(struct xml_node*)); + children[0] = 0; + + xml_parser_info(parser, "node"); + + /* Parse open tag + */ + tag_open = xml_parse_tag_open(parser); + if (!tag_open) { + xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_open"); + goto exit_failure; + } + + original_length = tag_open->length; + attributes = xml_find_attributes(parser, tag_open); + + /* If tag ends with `/' it's self closing, skip content lookup */ + if (tag_open->length > 0 && '/' == tag_open->buffer[original_length - 1]) { + /* Drop `/' + */ + goto node_creation; + } + + /* If the content does not start with '<', a text content is assumed + */ + if ('<' != xml_parser_peek(parser, CURRENT_CHARACTER)) { + content = xml_parse_content(parser); + + if (!content) { + xml_parser_error(parser, 0, "xml_parse_node::content"); + goto exit_failure; + } + + + /* Otherwise children are to be expected + */ + } else while ('/' != xml_parser_peek(parser, NEXT_CHARACTER)) { + + /* Parse child node + */ + struct xml_node* child = xml_parse_node(parser); + size_t old_elements, new_elements; + + if (!child) { + xml_parser_error(parser, NEXT_CHARACTER, "xml_parse_node::child"); + goto exit_failure; + } + + /* Grow child array :) + */ + old_elements = get_zero_terminated_array_nodes(children); + new_elements = old_elements + 1; + children = ms3_crealloc(children, (new_elements + 1) * sizeof(struct xml_node*)); + + /* 
Save child + */ + children[new_elements - 1] = child; + children[new_elements] = 0; + } + + + /* Parse close tag + */ + tag_close = xml_parse_tag_close(parser); + if (!tag_close) { + xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag_close"); + goto exit_failure; + } + + + /* Close tag has to match open tag + */ + if (!xml_string_equals(tag_open, tag_close)) { + xml_parser_error(parser, NO_CHARACTER, "xml_parse_node::tag mismatch"); + goto exit_failure; + } + + + /* Return parsed node + */ + xml_string_free(tag_close); + +node_creation:; + node = ms3_cmalloc(sizeof(struct xml_node)); + node->name = tag_open; + node->content = content; + node->attributes = attributes; + node->children = children; + return node; + + + /* A failure occured, so free all allocalted resources + */ +exit_failure: + if (tag_open) { + xml_string_free(tag_open); + } + if (tag_close) { + xml_string_free(tag_close); + } + if (content) { + xml_string_free(content); + } + + it = children; + while (*it) { + xml_node_free(*it); + ++it; + } + ms3_cfree(children); + + return 0; +} + + +/** + * [PRIVATE] + * Skips XML headers in <? text ?> format + */ +static void xml_parse_skip_meta(struct xml_parser* parser) { + if ('<' == xml_parser_peek(parser, CURRENT_CHARACTER) && + '?' == xml_parser_peek(parser, NEXT_CHARACTER)) { + size_t pos = parser->position; + while (pos < parser->length) { + if ('?' 
== parser->buffer[pos] && + '>' == parser->buffer[pos + 1]) { + parser->position = pos + 2; + return; + } + pos++; + } + } +} + +/** + * [PUBLIC API] + */ +struct xml_document* xml_parse_document(uint8_t* buffer, size_t length) { + + /* Initialize parser + */ + struct xml_parser parser = { + .buffer = buffer, + .position = 0, + .length = length + }; + struct xml_node* root; + struct xml_document* document; + + /* An empty buffer can never contain a valid document + */ + if (!length) { + xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::length equals zero"); + return 0; + } + + /* Parse the root node + */ + xml_parse_skip_meta(&parser); + root = xml_parse_node(&parser); + if (!root) { + xml_parser_error(&parser, NO_CHARACTER, "xml_parse_document::parsing document failed"); + return 0; + } + + /* Return parsed document + */ + document = ms3_cmalloc(sizeof(struct xml_document)); + document->buffer.buffer = buffer; + document->buffer.length = length; + document->root = root; + + return document; +} + + + +/** + * [PUBLIC API] + */ +struct xml_document* xml_open_document(FILE* source) { + + /* Prepare buffer + */ + size_t const read_chunk = 1; // TODO 4096; + + size_t document_length = 0; + size_t buffer_size = 1; // TODO 4069 + struct xml_document* document; + uint8_t* buffer = ms3_cmalloc(buffer_size * sizeof(uint8_t)); + + + /* Read hole file into buffer + */ + while (!feof(source)) { + size_t read; + /* Reallocate buffer + */ + if (buffer_size - document_length < read_chunk) { + buffer = ms3_crealloc(buffer, buffer_size + 2 * read_chunk); + buffer_size += 2 * read_chunk; + } + + read = fread(&buffer[document_length], + sizeof(uint8_t), read_chunk, + source + ); + + document_length += read; + } + fclose(source); + + /* Try to parse buffer + */ + document = xml_parse_document(buffer, document_length); + + if (!document) { + ms3_cfree(buffer); + return 0; + } + return document; +} + + + +/** + * [PUBLIC API] + */ +void xml_document_free(struct xml_document* 
document, bool free_buffer) { + xml_node_free(document->root); + + if (free_buffer) { + ms3_cfree(document->buffer.buffer); + } + ms3_cfree(document); +} + + + +/** + * [PUBLIC API] + */ +struct xml_node* xml_document_root(struct xml_document* document) { + return document->root; +} + + + +/** + * [PUBLIC API] + */ +struct xml_string* xml_node_name(struct xml_node* node) { + return node->name; +} + +int xml_node_name_cmp(struct xml_node* node, const char *name) { + return strncmp((char*)node->name->buffer, name, node->name->length); +} + +/** + * [PUBLIC API] + */ +struct xml_string* xml_node_content(struct xml_node* node) { + return node->content; +} + + + +/** + * [PUBLIC API] + * + * @warning O(n) + */ +size_t xml_node_children(struct xml_node* node) { + return get_zero_terminated_array_nodes(node->children); +} + + + +/** + * [PUBLIC API] + */ +struct xml_node* xml_node_child(struct xml_node* node, size_t child) { + if (child >= xml_node_children(node)) { + return 0; + } + + return node->children[child]; +} + + + +/** + * [PUBLIC API] + */ +size_t xml_node_attributes(struct xml_node* node) { + return get_zero_terminated_array_attributes(node->attributes); +} + + + +/** + * [PUBLIC API] + */ +struct xml_string* xml_node_attribute_name(struct xml_node* node, size_t attribute) { + if(attribute >= xml_node_attributes(node)) { + return 0; + } + + return node->attributes[attribute]->name; +} + + + +/** + * [PUBLIC API] + */ +struct xml_string* xml_node_attribute_content(struct xml_node* node, size_t attribute) { + if(attribute >= xml_node_attributes(node)) { + return 0; + } + + return node->attributes[attribute]->content; +} + + + +/** + * [PUBLIC API] + */ +struct xml_node* xml_easy_child(struct xml_node* node, uint8_t const* child_name, ...) 
{ + + /* Find children, one by one + */ + struct xml_node* current = node; + + va_list arguments; + va_start(arguments, child_name); + + + /* Descent to current.child + */ + while (child_name) { + + /* Convert child_name to xml_string for easy comparison + */ + struct xml_string cn = { + .buffer = child_name, + .length = strlen((const char*)child_name) + }; + + /* Interate through all children + */ + struct xml_node* next = 0; + + size_t i = 0; for (; i < xml_node_children(current); ++i) { + struct xml_node* child = xml_node_child(current, i); + + if (xml_string_equals(xml_node_name(child), &cn)) { + if (!next) { + next = child; + + /* Two children with the same name + */ + } else { + va_end(arguments); + return 0; + } + } + } + + /* No child with that name found + */ + if (!next) { + va_end(arguments); + return 0; + } + current = next; + + /* Find name of next child + */ + child_name = va_arg(arguments, uint8_t const*); + } + va_end(arguments); + + + /* Return current element + */ + return current; +} + + + +/** + * [PUBLIC API] + */ +uint8_t* xml_easy_name(struct xml_node* node) { + if (!node) { + return 0; + } + + return xml_string_clone(xml_node_name(node)); +} + + + +/** + * [PUBLIC API] + */ +uint8_t* xml_easy_content(struct xml_node* node) { + if (!node) { + return 0; + } + + return xml_string_clone(xml_node_content(node)); +} + + + +/** + * [PUBLIC API] + */ +size_t xml_string_length(struct xml_string* string) { + if (!string) { + return 0; + } + return string->length; +} + + + +/** + * [PUBLIC API] + */ +void xml_string_copy(struct xml_string* string, uint8_t* buffer, size_t length) { + if (!string) { + return; + } + + #define min(X,Y) ((X) < (Y) ? 
(X) : (Y)) + length = min(length, string->length); + #undef min + + memcpy(buffer, string->buffer, length); + buffer[length]= '\0'; +} diff --git a/storage/maria/libmarias3/src/xml.h b/storage/maria/libmarias3/src/xml.h new file mode 100644 index 00000000..9574b2bc --- /dev/null +++ b/storage/maria/libmarias3/src/xml.h @@ -0,0 +1,197 @@ +/** + * Copyright (c) 2012 ooxi/xml.c + * https://github.com/ooxi/xml.c + * + * This software is provided 'as-is', without any express or implied warranty. + * In no event will the authors be held liable for any damages arising from the + * use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software in a + * product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * + * 3. This notice may not be removed or altered from any source distribution. 
+ */ +#ifndef HEADER_XML +#define HEADER_XML + + +/** + * Includes + */ +#include <stdint.h> +#include <string.h> +#include <stdbool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Opaque structure holding the parsed xml document + */ +struct xml_document; +struct xml_node; +struct xml_attribute; + +/** + * Internal character sequence representation + */ +struct xml_string; + + + +/** + * Tries to parse the XML fragment in buffer + * + * @param buffer Chunk to parse + * @param length Size of the buffer + * + * @warning `buffer` will be referenced by the document, you may not free it + * until you free the xml_document + * @warning You have to call xml_document_free after you finished using the + * document + * + * @return The parsed xml fragment iff parsing was successful, 0 otherwise + */ +struct xml_document* xml_parse_document(uint8_t* buffer, size_t length); + + + +/** + * Tries to read an XML document from disk + * + * @param source File that will be read into an xml document. Will be closed + * + * @warning You have to call xml_document_free with free_buffer = true after you + * finished using the document + * + * @return The parsed xml fragment iff parsing was successful, 0 otherwise + */ +struct xml_document* xml_open_document(FILE* source); + + + +/** + * Frees all resources associated with the document. 
All xml_node and xml_string + * references obtained through the document will be invalidated + * + * @param document xml_document to free + * @param free_buffer iff true the internal buffer supplied via xml_parse_buffer + * will be freed with the `free` system call + */ +void xml_document_free(struct xml_document* document, bool free_buffer); + + +/** + * @return xml_node representing the document root + */ +struct xml_node* xml_document_root(struct xml_document* document); + + + +/** + * @return The xml_node's tag name + */ +struct xml_string* xml_node_name(struct xml_node* node); + + + +/** + * @return The xml_node's string content (if available, otherwise NULL) + */ +struct xml_string* xml_node_content(struct xml_node* node); + + + +/** + * @return Number of child nodes + */ +size_t xml_node_children(struct xml_node* node); + + + +/** + * @return The n-th child or 0 if out of range + */ +struct xml_node* xml_node_child(struct xml_node* node, size_t child); + + + +/** + * @return Number of attribute nodes + */ +size_t xml_node_attributes(struct xml_node* node); + + + +/** + * @return the n-th attribute name or 0 if out of range + */ +struct xml_string* xml_node_attribute_name(struct xml_node* node, size_t attribute); + + + +/** + * @return the n-th attribute content or 0 if out of range + */ +struct xml_string* xml_node_attribute_content(struct xml_node* node, size_t attribute); + + + +/** + * @return The node described by the path or 0 if child cannot be found + * @warning Each element on the way must be unique + * @warning Last argument must be 0 + */ +struct xml_node* xml_easy_child(struct xml_node* node, uint8_t const* child, ...); + + + +/** + * @return 0-terminated copy of node name + * @warning User must free the result + */ +uint8_t* xml_easy_name(struct xml_node* node); + + + +/** + * @return 0-terminated copy of node content + * @warning User must free the result + */ +uint8_t* xml_easy_content(struct xml_node* node); + + + +/** + * @return Length of 
the string + */ +size_t xml_string_length(struct xml_string* string); + + + +/** + * Copies the string into the supplied buffer + * + * @warning String will not be 0-terminated + * @warning Will write at most length bytes, even if the string is longer + */ +void xml_string_copy(struct xml_string* string, uint8_t* buffer, size_t length); + +int xml_node_name_cmp(struct xml_node* node, const char *name); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/storage/maria/libmarias3/tests/basic.c b/storage/maria/libmarias3/tests/basic.c new file mode 100644 index 00000000..309c77bb --- /dev/null +++ b/storage/maria/libmarias3/tests/basic.c @@ -0,0 +1,180 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +/* Tests basic put, list, get, status, delete using the thread calls */ + +int main(int argc, char *argv[]) +{ + int res; + ms3_list_st *list = NULL, *list_it = NULL; + uint8_t *data; + size_t length; + int i; + bool found; + uint8_t list_version; + const char *test_string = "Another one bites the dust"; + ms3_status_st status; + ms3_st *ms3; + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + + (void) argc; + (void) argv; + + ms3_library_init(); + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + + if (s3noverify && !strcmp(s3noverify, "1")) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (s3usehttp && !strcmp(s3usehttp, "1")) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + { + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + +// ms3_debug(); + ASSERT_NOT_NULL(ms3); + + res = ms3_put(ms3, s3bucket, "test/basic_thread.txt", + (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ_(res, 0, "Result: %u", res); + + // A prefix that will give no results; + res = ms3_list(ms3, s3bucket, "asdfghjkl", &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_NULL_(list, "List not empty"); + 
+ res = ms3_list(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "test/basic_thread.txt", 21)) + { + found = true; + break; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Created file not found"); + + if (list_it) + { + ASSERT_EQ_(list_it->length, 26, "Created file is unexpected length"); + ASSERT_NEQ_(list_it->created, 0, "Created file timestamp is bad"); + } + else + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + // Retry list with V1 API + list_version = 1; + list = NULL; + ms3_set_option(ms3, MS3_OPT_FORCE_LIST_VERSION, &list_version); + res = ms3_list(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "test/basic_thread.txt", 21)) + { + found = true; + break; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Created file not found"); + + if (list_it) + { + ASSERT_EQ_(list_it->length, 26, "Created file is unexpected length"); + ASSERT_NEQ_(list_it->created, 0, "Created file timestamp is bad"); + } + else + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + res = ms3_get(ms3, s3bucket, "test/basic_thread.txt", &data, &length); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_EQ(length, 26); + ASSERT_STREQ((char *)data, test_string); + + for (i = 0; i <= 3; i++) + { + res = ms3_status(ms3, s3bucket, "test/basic_thread.txt", &status); + + if (res == MS3_ERR_NOT_FOUND) + { + continue; + } + + ASSERT_EQ_(res, 0, "Result: %u", res); + + if (res == 0) + { + break; + } + } + + ASSERT_EQ(status.length, 26); + ASSERT_NEQ(status.created, 0); + res = ms3_delete(ms3, s3bucket, "test/basic_thread.txt"); + ASSERT_EQ_(res, 0, "Result: %u", res); + ms3_free(data); + res = ms3_get(ms3, s3bucket, "test/basic_thread.txt", &data, &length); + ASSERT_NEQ_(res, 0, "Object should error"); + ASSERT_NULL_(data, "Data should be 
NULL"); + ASSERT_EQ_(length, 0, "There should be no data"); + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/basic_host.c b/storage/maria/libmarias3/tests/basic_host.c new file mode 100644 index 00000000..7b9aa3da --- /dev/null +++ b/storage/maria/libmarias3/tests/basic_host.c @@ -0,0 +1,158 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +/* Tests basic calls with an expicit hostname */ + +int main(int argc, char *argv[]) +{ + int res; + ms3_list_st *list = NULL, *list_it = NULL; + uint8_t *data; + size_t length; + int i; + bool found; + uint8_t protocol_version; + const char *test_string = "Another one bites the dust"; + ms3_status_st status; + ms3_st *ms3; + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + 
SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + + SKIP_IF_(s3host, "Test is for non-explicit hostnames"); + + (void) argc; + (void) argv; + + if (!s3host || s3host[0] == '\0') + { + const char *default_host = "s3.amazonaws.com"; + s3host = (char *)default_host; + } + + ms3_library_init(); + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + protocol_version = 2; + ms3_set_option(ms3, MS3_OPT_FORCE_PROTOCOL_VERSION, &protocol_version); + + if (s3noverify && !strcmp(s3noverify, "1")) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (s3usehttp && !strcmp(s3usehttp, "1")) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + { + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + +// ms3_debug(); + ASSERT_NOT_NULL(ms3); + + res = ms3_put(ms3, s3bucket, "test/basic_host.txt", + (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ_(res, 0, "Result: %u", res); + + // A prefix that will give no results; + res = ms3_list(ms3, s3bucket, "asdfghjkl", &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_NULL_(list, "List not empty"); + + res = ms3_list(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "test/basic_host.txt", 19)) + { + found = true; + break; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Created file not found"); + + if (list_it) + { + ASSERT_EQ_(list_it->length, 26, "Created file is unexpected length"); + ASSERT_NEQ_(list_it->created, 0, "Created file timestamp is bad"); + } + else + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + res = ms3_get(ms3, s3bucket, "test/basic_host.txt", &data, &length); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_EQ(length, 26); + ASSERT_STREQ((char *)data, 
test_string); + + for (i = 0; i <= 3; i++) + { + res = ms3_status(ms3, s3bucket, "test/basic_host.txt", &status); + + if (res == MS3_ERR_NOT_FOUND) + { + continue; + } + + ASSERT_EQ_(res, 0, "Result: %u", res); + + if (res == 0) + { + break; + } + } + + ASSERT_EQ(status.length, 26); + ASSERT_NEQ(status.created, 0); + res = ms3_delete(ms3, s3bucket, "test/basic_host.txt"); + ASSERT_EQ_(res, 0, "Result: %u", res); + ms3_free(data); + res = ms3_get(ms3, s3bucket, "test/basic_host.txt", &data, &length); + ASSERT_NEQ_(res, 0, "Object should error"); + ASSERT_NULL_(data, "Data should be NULL"); + ASSERT_EQ_(length, 0, "There should be no data"); + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/copy.c b/storage/maria/libmarias3/tests/copy.c new file mode 100644 index 00000000..9587e214 --- /dev/null +++ b/storage/maria/libmarias3/tests/copy.c @@ -0,0 +1,181 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +/* Tests copy and move calls */ + +int main(int argc, char *argv[]) +{ + int res; + ms3_list_st *list = NULL, *list_it = NULL; + uint8_t *data; + size_t length; + bool found, found_orig, found_new; + ms3_st *ms3; + const char *test_string = "Another one bites the dust"; + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + + (void) argc; + (void) argv; + + ms3_library_init(); + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + + if (s3noverify && !strcmp(s3noverify, "1")) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (s3usehttp && !strcmp(s3usehttp, "1")) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + { + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + +// ms3_debug(); + ASSERT_NOT_NULL(ms3); + + res = ms3_put(ms3, s3bucket, "test/copy_test.txt", + (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ_(res, 0, "Result: %u", res); + + res = ms3_copy(ms3, s3bucket, "test/copy_test.txt", s3bucket, + "test/copied.txt"); + ASSERT_EQ_(res, 0, "Result: %u", res); + + res = ms3_list(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + 
+ while (list_it) + { + if (!strncmp(list_it->key, "test/copied.txt", 12)) + { + found = true; + break; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Copied file not found"); + + // Test with hash chars in filename + + res = ms3_put(ms3, s3bucket, "test/copy_###_test.txt", + (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ_(res, 0, "Result: %u", res); + + res = ms3_copy(ms3, s3bucket, "test/copy_###_test.txt", s3bucket, + "test/copied###.txt"); + ASSERT_EQ_(res, 0, "Result: %u", res); + + res = ms3_list(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "test/copied###.txt", 12)) + { + found = true; + break; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Copied file not found"); + + if (list_it) + { + ASSERT_EQ_(list_it->length, 26, "Copied file is unexpected length"); + ASSERT_NEQ_(list_it->created, 0, "Copied file timestamp is bad"); + } + else + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + res = ms3_move(ms3, s3bucket, "test/copy_test.txt", s3bucket, "test/moved.txt"); + ASSERT_EQ_(res, 0, "Result: %u", res); + + res = ms3_list(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found_orig = false; + found_new = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "test/moved.txt", 12)) + { + found_new = true; + } + else if (!strncmp(list_it->key, "test/copy_test.txt", 12)) + { + found_orig = true; + } + + + list_it = list_it->next; + } + + ASSERT_EQ_(found_new, 1, "Copied file not found"); + ASSERT_EQ_(found_orig, 0, "Original file still exists after move"); + + res = ms3_get(ms3, s3bucket, "test/moved.txt", &data, &length); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_EQ(length, 26); + ASSERT_STREQ((char *)data, test_string); + + res = ms3_delete(ms3, s3bucket, "test/moved.txt"); + ASSERT_EQ_(res, 0, "Result: %u", res); + res = ms3_delete(ms3, 
s3bucket, "test/copied.txt"); + res = ms3_delete(ms3, s3bucket, "test/copy_###_test.txt"); + res = ms3_delete(ms3, s3bucket, "test/copied###.txt"); + + ms3_free(data); + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/custom_malloc.c b/storage/maria/libmarias3/tests/custom_malloc.c new file mode 100644 index 00000000..80f1b393 --- /dev/null +++ b/storage/maria/libmarias3/tests/custom_malloc.c @@ -0,0 +1,207 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +/* Tests functions using custom allocators */ + +static void *cust_malloc(size_t size) +{ + printf("Malloc of %zu bytes\n", size); + return malloc(size); +} + +static void cust_free(void *ptr) +{ + printf("Free called\n"); + free(ptr); +} + +static void *cust_realloc(void *ptr, size_t size) +{ + printf("Realloc of %zu bytes\n", size); + return realloc(ptr, size); +} + +static char *cust_strdup(const char *str) +{ + printf("Strdup called\n"); + return strdup(str); +} + +static void *cust_calloc(size_t nmemb, size_t size) +{ + printf("Calloc of %zu elements, %zu size\n", nmemb, size); + return calloc(nmemb, size); +} + +int main(int argc, char *argv[]) +{ + int res; + int i; + ms3_list_st *list = NULL, *list_it = NULL; + uint8_t *data; + size_t length; + bool found; + uint8_t list_version; + const char *test_string = "Another one bites the dust"; + ms3_status_st status; + ms3_st *ms3; + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + + (void) argc; + (void) argv; + + ms3_library_init_malloc(cust_malloc, cust_free, cust_realloc, cust_strdup, + cust_calloc); + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + + if (s3noverify && !strcmp(s3noverify, "1")) + { + 
ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (s3usehttp && !strcmp(s3usehttp, "1")) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + { + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + +// ms3_debug(true); + ASSERT_NOT_NULL(ms3); + + res = ms3_put(ms3, s3bucket, "test/custom_malloc.txt", + (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ_(res, 0, "Result: %u", res); + + // A prefix that will give no results; + res = ms3_list(ms3, s3bucket, "asdfghjkl", &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_NULL_(list, "List not empty"); + + res = ms3_list(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "test/custom_malloc.txt", 12)) + { + found = true; + break; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Created file not found"); + + if (list_it) + { + ASSERT_EQ_(list_it->length, 26, "Created file is unexpected length"); + ASSERT_NEQ_(list_it->created, 0, "Created file timestamp is bad"); + } + else + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + // Retry list with V1 API + list_version = 1; + list = NULL; + ms3_set_option(ms3, MS3_OPT_FORCE_LIST_VERSION, &list_version); + res = ms3_list(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "test/custom_malloc.txt", 12)) + { + found = true; + break; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Created file not found"); + + if (list_it) + { + ASSERT_EQ_(list_it->length, 26, "Created file is unexpected length"); + ASSERT_NEQ_(list_it->created, 0, "Created file timestamp is bad"); + } + else + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + res = ms3_get(ms3, s3bucket, "test/custom_malloc.txt", &data, &length); + ASSERT_EQ_(res, 0, "Result: %u", res); + 
ASSERT_EQ(length, 26); + ASSERT_STREQ((char *)data, test_string); + + for (i = 0; i <= 3; i++) + { + res = ms3_status(ms3, s3bucket, "test/custom_malloc.txt", &status); + + if (res == MS3_ERR_NOT_FOUND) + { + continue; + } + + ASSERT_EQ_(res, 0, "Result: %u", res); + + if (res == 0) + { + break; + } + } + + ASSERT_EQ(status.length, 26); + ASSERT_NEQ(status.created, 0); + res = ms3_delete(ms3, s3bucket, "test/custom_malloc.txt"); + ASSERT_EQ_(res, 0, "Result: %u", res); + ms3_free(data); + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/error.c b/storage/maria/libmarias3/tests/error.c new file mode 100644 index 00000000..9be45d44 --- /dev/null +++ b/storage/maria/libmarias3/tests/error.c @@ -0,0 +1,58 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +/* Tests basic error handling */ + +int main(int argc, char *argv[]) +{ + uint8_t *data; + size_t length; + const char *errmsg; + uint8_t res; + ms3_st *ms3 = ms3_init("12345678901234567890", + "1234567890123456789012345678901234567890", "us-east-1", NULL); + + (void) argc; + (void) argv; + + // Enable here so cppcheck shows coverage + ms3_debug(); + ASSERT_NOT_NULL(ms3); + errmsg = ms3_error(255); + ASSERT_STREQ(errmsg, "No such error code"); + errmsg = ms3_error(0); + ASSERT_STREQ(errmsg, "No error"); + res = ms3_get(ms3, "bad", "bad/file.txt", &data, &length); + printf("%d\n", res); + printf("%s\n", ms3_server_error(ms3)); + ASSERT_EQ(res, MS3_ERR_AUTH); // Bad auth + free(data); + ms3_deinit(ms3); + ms3 = ms3_init("12345678901234567890", + "1234567890123456789012345678901234567890", "us-east-1", "bad-domain"); + res = ms3_get(ms3, "bad", "bad/file.txt", &data, &length); + ASSERT_EQ(res, MS3_ERR_REQUEST_ERROR); + free(data); + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/include.am b/storage/maria/libmarias3/tests/include.am new file mode 100644 index 00000000..1cb25a1c --- /dev/null +++ b/storage/maria/libmarias3/tests/include.am @@ -0,0 +1,69 @@ +# vim:ft=automake +# Copyright (C) 2012 Data Differential +# All rights reserved. +# +# Use and distribution licensed under the BSD license. See +# the COPYING file in the parent directory for full text. 
+# +# included from Top Level Makefile.am +# All paths should be given relative to the root + +LIBTOOL_COMMAND= ${abs_top_builddir}/libtool --mode=execute +GDB_COMMAND= $(LIBTOOL_COMMAND) gdb -f -x support/run.gdb + +t_error_SOURCES= tests/error.c +t_error_LDADD= src/libmarias3.la +check_PROGRAMS+= t/error +noinst_PROGRAMS+= t/error + +t_basic_SOURCES= tests/basic.c +t_basic_LDADD= src/libmarias3.la +check_PROGRAMS+= t/basic +noinst_PROGRAMS+= t/basic + +t_snowman_SOURCES= tests/snowman.c +t_snowman_LDADD= src/libmarias3.la +check_PROGRAMS+= t/snowman +noinst_PROGRAMS+= t/snowman + +t_basic_host_SOURCES= tests/basic_host.c +t_basic_host_LDADD= src/libmarias3.la +check_PROGRAMS+= t/basic_host +noinst_PROGRAMS+= t/basic_host + +t_copy_SOURCES= tests/copy.c +t_copy_LDADD= src/libmarias3.la +check_PROGRAMS+= t/copy +noinst_PROGRAMS+= t/copy + +t_large_file_SOURCES= tests/large_file.c +t_large_file_LDADD= src/libmarias3.la +check_PROGRAMS+= t/large_file +noinst_PROGRAMS+= t/large_file + +t_small_buffer_SOURCES= tests/small_buffer.c +t_small_buffer_LDADD= src/libmarias3.la +check_PROGRAMS+= t/small_buffer +noinst_PROGRAMS+= t/small_buffer + +t_prefix_SOURCES= tests/prefix.c +t_prefix_LDADD= src/libmarias3.la +check_PROGRAMS+= t/prefix +noinst_PROGRAMS+= t/prefix + +t_longlist_SOURCES= tests/longlist.c +t_longlist_LDADD= src/libmarias3.la +t_longlist_LDADD+= -lpthread +check_PROGRAMS+= t/longlist +noinst_PROGRAMS+= t/longlist + +t_custom_malloc_SOURCES= tests/custom_malloc.c +t_custom_malloc_LDADD= src/libmarias3.la +check_PROGRAMS+= t/custom_malloc +noinst_PROGRAMS+= t/custom_malloc + +t_list_SOURCES= tests/list.c +t_list_LDADD= src/libmarias3.la +check_PROGRAMS+= t/list +noinst_PROGRAMS+= t/list + diff --git a/storage/maria/libmarias3/tests/large_file.c b/storage/maria/libmarias3/tests/large_file.c new file mode 100644 index 00000000..b25213f8 --- /dev/null +++ b/storage/maria/libmarias3/tests/large_file.c @@ -0,0 +1,90 @@ +/* 
vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +/* Tests basic PUT/GET a 64MB file */ + +int main(int argc, char *argv[]) +{ + int res; + uint8_t *data; + size_t length; + size_t new_buffer_size; + ms3_st *ms3; + char *test_string = malloc(64 * 1024 * 1024); + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + memset(test_string, 'a', 64 * 1024 * 1024); + + (void) argc; + (void) argv; + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + + ms3_library_init(); + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + + if (s3noverify && !strcmp(s3noverify, "1")) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (s3usehttp && 
!strcmp(s3usehttp, "1")) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + { + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + +// ms3_debug(true); + ASSERT_NOT_NULL(ms3); + + res = ms3_put(ms3, s3bucket, "test/large_file.dat", + (const uint8_t *)test_string, + 64 * 1024 * 1024); + ASSERT_EQ_(res, 0, "Result: %u", res); + new_buffer_size = 4 * 1024 * 1024; + res = ms3_set_option(ms3, MS3_OPT_BUFFER_CHUNK_SIZE, &new_buffer_size); + ASSERT_EQ_(res, 0, "Result: %u", res); + res = ms3_get(ms3, s3bucket, "test/large_file.dat", &data, &length); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_EQ(length, 64 * 1024 * 1024); + res = ms3_delete(ms3, s3bucket, "test/large_file.dat"); + ASSERT_EQ_(res, 0, "Result: %u", res); + free(test_string); + ms3_free(data); + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/list.c b/storage/maria/libmarias3/tests/list.c new file mode 100644 index 00000000..ef268730 --- /dev/null +++ b/storage/maria/libmarias3/tests/list.c @@ -0,0 +1,156 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +/* Tests list command */ + +int main(int argc, char *argv[]) +{ + int res; + ms3_list_st *list = NULL, *list_it = NULL; + ms3_st *ms3; + bool found, found_bad; + uint8_t list_version; + const char *test_string = "Another one bites the dust"; + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + + (void) argc; + (void) argv; + + ms3_library_init(); + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + + if (s3noverify && !strcmp(s3noverify, "1")) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (s3usehttp && !strcmp(s3usehttp, "1")) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + { + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + +// ms3_debug(); + ASSERT_NOT_NULL(ms3); + + res = ms3_put(ms3, s3bucket, "list1/test1.txt", + (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ_(res, 0, "Result: %u", res); + + res = ms3_put(ms3, s3bucket, "list2/test2.txt", + (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ_(res, 0, "Result: %u", res); + + // A prefix that will give no results; + res = ms3_list_dir(ms3, s3bucket, "asdfghjkl", &list); + ASSERT_EQ_(res, 0, "Result: %u", 
res); + ASSERT_NULL_(list, "List not empty"); + + res = ms3_list_dir(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + found_bad = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "list1/test1.txt", 15)) + { + found_bad = true; + } + + if (!strncmp(list_it->key, "list2/", 6)) + { + found = true; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Created file not found"); + ASSERT_NEQ_(found_bad, 1, "File listed should not be here"); + + if (!list) + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + // Retry list with V1 API + list_version = 1; + list = NULL; + ms3_set_option(ms3, MS3_OPT_FORCE_LIST_VERSION, &list_version); + res = ms3_list_dir(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "list1/test1.txt", 15)) + { + found_bad = true; + } + + if (!strncmp(list_it->key, "list2/", 6)) + { + found = true; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Created file not found"); + ASSERT_NEQ_(found_bad, 1, "File listed should not be here"); + + if (!list) + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + res = ms3_delete(ms3, s3bucket, "list1/test1.txt"); + ASSERT_EQ_(res, 0, "Result: %u", res); + res = ms3_delete(ms3, s3bucket, "list2/test2.txt"); + ASSERT_EQ_(res, 0, "Result: %u", res); + + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/longlist.c b/storage/maria/libmarias3/tests/longlist.c new file mode 100644 index 00000000..5a14031c --- /dev/null +++ b/storage/maria/libmarias3/tests/longlist.c @@ -0,0 +1,283 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> +#include <pthread.h> + +/* Tests listing 1500 items + * This test creates 1500 items using 10 threads, lists them to check + * we can get them all due to the 1000 item pagination limit and then + * deletes them using 10 threads. 
+ */ + +struct thread_info +{ + pthread_t thread_id; + int thread_num; + int start_count; + char *s3bucket; + char *s3key; + char *s3secret; + char *s3region; + char *s3host; + char *s3port; + bool usehttp; + bool noverify; +}; + +const char *test_string = "Another one bites the dust"; + +static void *put_thread(void *arg) +{ + int i; + struct thread_info *tinfo = arg; + ms3_st *ms3 = ms3_init(tinfo->s3key, tinfo->s3secret, tinfo->s3region, + tinfo->s3host); + + if (tinfo->noverify) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (tinfo->usehttp) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (tinfo->s3port) + { + int port = atol(tinfo->s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + + for (i = tinfo->start_count; i < tinfo->start_count + 150; i++) + { + uint8_t res; + char fname[64]; + snprintf(fname, 64, "listtest/list-%d.dat", i); + res = ms3_put(ms3, tinfo->s3bucket, fname, (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ(res, 0); + } + + ms3_deinit(ms3); + + return NULL; +} + +static void *delete_thread(void *arg) +{ + int i; + struct thread_info *tinfo = arg; + ms3_st *ms3 = ms3_init(tinfo->s3key, tinfo->s3secret, tinfo->s3region, + tinfo->s3host); + + if (tinfo->noverify) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (tinfo->usehttp) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (tinfo->s3port) + { + int port = atol(tinfo->s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + + for (i = tinfo->start_count; i < tinfo->start_count + 150; i++) + { + uint8_t res; + char fname[64]; + snprintf(fname, 64, "listtest/list-%d.dat", i); + res = ms3_delete(ms3, tinfo->s3bucket, fname); + ASSERT_EQ(res, 0); + } + + ms3_deinit(ms3); + return NULL; +} + + +int main(int argc, char *argv[]) +{ + + + int tnum; + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = 
getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + + bool noverify = false; + bool usehttp = false; + struct thread_info *tinfo; + int start_count; + uint8_t res; + uint8_t list_version; + pthread_attr_t attr; + ms3_st *ms3; + int res_count; + ms3_list_st *list = NULL, *list_it = NULL; + + if (s3noverify && !strcmp(s3noverify, "1")) + { + noverify = true; + } + + if (s3usehttp && !strcmp(s3usehttp, "1")) + { + usehttp = true; + } + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + (void) argc; + (void) argv; + ms3_library_init(); + +// ms3_debug(true); + + tinfo = calloc(10, sizeof(struct thread_info)); + + start_count = 1000; + + pthread_attr_init(&attr); + + // Write 1500 files using 10 threads + printf("Writing 1500 items\n"); + + for (tnum = 0; tnum < 10; tnum++) + { + tinfo[tnum].thread_num = tnum + 1; + tinfo[tnum].start_count = start_count; + start_count += 150; + tinfo[tnum].s3key = s3key; + tinfo[tnum].s3secret = s3secret; + tinfo[tnum].s3region = s3region; + tinfo[tnum].s3host = s3host; + tinfo[tnum].s3bucket = s3bucket; + tinfo[tnum].s3port = s3port; + tinfo[tnum].noverify = noverify; + tinfo[tnum].usehttp = usehttp; + pthread_create(&tinfo[tnum].thread_id, &attr, + &put_thread, &tinfo[tnum]); + } + + for (tnum = 0; tnum < 10; tnum++) + { + pthread_join(tinfo[tnum].thread_id, NULL); + } + + free(tinfo); + + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + + if (noverify) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (usehttp) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + { + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + + res = ms3_list(ms3, 
s3bucket, "listtest/", &list); + ASSERT_EQ(res, 0); + list_it = list; + res_count = 0; + + while (list_it) + { + res_count++; + list_it = list_it->next; + } + + printf("Found %d items\n", res_count); + ASSERT_EQ(res_count, 1500); + + // Reattempt with list version 1 + list_version = 1; + list = NULL; + ms3_set_option(ms3, MS3_OPT_FORCE_LIST_VERSION, &list_version); + res = ms3_list(ms3, s3bucket, "listtest/", &list); + ASSERT_EQ(res, 0); + list_it = list; + res_count = 0; + + while (list_it) + { + res_count++; + list_it = list_it->next; + } + + printf("V1 Found %d items\n", res_count); + ASSERT_EQ(res_count, 1500); + + ms3_deinit(ms3); + + tinfo = calloc(10, sizeof(struct thread_info)); + start_count = 1000; + + // Destroy 1500 files using 10 threads + printf("Deleting 1500 items"); + + for (tnum = 0; tnum < 10; tnum++) + { + tinfo[tnum].thread_num = tnum + 1; + tinfo[tnum].start_count = start_count; + start_count += 150; + tinfo[tnum].s3key = s3key; + tinfo[tnum].s3secret = s3secret; + tinfo[tnum].s3region = s3region; + tinfo[tnum].s3host = s3host; + tinfo[tnum].s3bucket = s3bucket; + tinfo[tnum].noverify = noverify; + tinfo[tnum].usehttp = usehttp; + tinfo[tnum].s3port = s3port; + pthread_create(&tinfo[tnum].thread_id, &attr, + &delete_thread, &tinfo[tnum]); + } + + for (tnum = 0; tnum < 10; tnum++) + { + pthread_join(tinfo[tnum].thread_id, NULL); + } + + free(tinfo); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/prefix.c b/storage/maria/libmarias3/tests/prefix.c new file mode 100644 index 00000000..672120dd --- /dev/null +++ b/storage/maria/libmarias3/tests/prefix.c @@ -0,0 +1,129 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +#include <unistd.h> +/* Test adds two files and checks that list prefix filters one out */ + +int main(int argc, char *argv[]) +{ + int res; + ms3_list_st *list = NULL, *list_it = NULL; + int i; + ms3_st *ms3; + bool found_good, found_bad; + const char *test_string = "Another one bites the dust"; + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + + (void) argc; + (void) argv; + + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + + if (s3noverify && !strcmp(s3noverify, "1")) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (s3usehttp && !strcmp(s3usehttp, "1")) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + 
{ + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + +// ms3_debug(true); + ASSERT_NOT_NULL(ms3); + + res = ms3_put(ms3, s3bucket, "test/prefix.txt", (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ(res, 0); + res = ms3_put(ms3, s3bucket, "other/prefix.txt", (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ(res, 0); + + found_good = false; + found_bad = false; + + for (i = 0; i <= 3; i++) + { + uint8_t file_count; + res = ms3_list(ms3, s3bucket, "test", &list); + ASSERT_EQ(res, 0); + list_it = list; + file_count = 0; + + while (list_it) + { + if (!strncmp(list_it->key, "test/prefix.txt", 12)) + { + found_good = true; + } + + if (!strncmp(list_it->key, "other/prefix.txt", 12)) + { + found_bad = true; + } + + list_it = list_it->next; + file_count++; + } + + if ((file_count == 0) || !found_good) + { + sleep(1); + printf("Bad file count, retrying"); + found_good = false; + found_bad = false; + continue; + } + else + { + break; + } + } + + ASSERT_EQ_(found_good, 1, "Created file not found"); + ASSERT_EQ_(found_bad, 0, "Filter found file it shouldn't"); + res = ms3_delete(ms3, s3bucket, "test/prefix.txt"); + ASSERT_EQ(res, 0); + res = ms3_delete(ms3, s3bucket, "other/prefix.txt"); + ASSERT_EQ(res, 0); + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/small_buffer.c b/storage/maria/libmarias3/tests/small_buffer.c new file mode 100644 index 00000000..4e9900b4 --- /dev/null +++ b/storage/maria/libmarias3/tests/small_buffer.c @@ -0,0 +1,98 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +/* Tests basic GET with a small buffer */ + +int main(int argc, char *argv[]) +{ + int res; + uint8_t *data; + size_t length; + size_t new_buffer_size; + ms3_st *ms3; + char *test_string = malloc(64 * 1024); + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + memset(test_string, 'a', 64 * 1024); + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + + (void) argc; + (void) argv; + + ms3_library_init(); + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + + if (s3noverify && !strcmp(s3noverify, "1")) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (s3usehttp && !strcmp(s3usehttp, "1")) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + { + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + +// ms3_debug(true); + ASSERT_NOT_NULL(ms3); + + res = ms3_put(ms3, s3bucket, "test/small_buffer.dat", + (const uint8_t *)test_string, + 64 * 1024); + ASSERT_EQ_(res, 0, "Result: %u", res); + new_buffer_size = 64 
* 1024; + res = ms3_set_option(ms3, MS3_OPT_BUFFER_CHUNK_SIZE, &new_buffer_size); + ASSERT_EQ_(res, 0, "Result: %u", res); + res = ms3_get(ms3, s3bucket, "test/small_buffer.dat", &data, &length); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_EQ(length, 64 * 1024); + ms3_free(data); + + new_buffer_size = 1024; + res = ms3_set_option(ms3, MS3_OPT_BUFFER_CHUNK_SIZE, &new_buffer_size); + ASSERT_EQ_(res, 0, "Result: %u", res); + res = ms3_get(ms3, s3bucket, "test/small_buffer.dat", &data, &length); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_EQ(length, 64 * 1024); + res = ms3_delete(ms3, s3bucket, "test/small_buffer.dat"); + ASSERT_EQ_(res, 0, "Result: %u", res); + free(test_string); + ms3_free(data); + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git a/storage/maria/libmarias3/tests/snowman.c b/storage/maria/libmarias3/tests/snowman.c new file mode 100644 index 00000000..cd611c7d --- /dev/null +++ b/storage/maria/libmarias3/tests/snowman.c @@ -0,0 +1,180 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * Copyright 2019 MariaDB Corporation Ab. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301 USA + */ + +#include <yatl/lite.h> +#include <libmarias3/marias3.h> + +/* Tests basic calls using UTF-8 */ + +int main(int argc, char *argv[]) +{ + int res; + ms3_list_st *list = NULL, *list_it = NULL; + uint8_t *data; + size_t length; + int i; + bool found; + uint8_t list_version; + const char *test_string = "Another ☃☃☃ bites the dust"; + ms3_status_st status; + ms3_st *ms3; + char *s3key = getenv("S3KEY"); + char *s3secret = getenv("S3SECRET"); + char *s3region = getenv("S3REGION"); + char *s3bucket = getenv("S3BUCKET"); + char *s3host = getenv("S3HOST"); + char *s3noverify = getenv("S3NOVERIFY"); + char *s3usehttp = getenv("S3USEHTTP"); + char *s3port = getenv("S3PORT"); + + SKIP_IF_(!s3key, "Environemnt variable S3KEY missing"); + SKIP_IF_(!s3secret, "Environemnt variable S3SECRET missing"); + SKIP_IF_(!s3region, "Environemnt variable S3REGION missing"); + SKIP_IF_(!s3bucket, "Environemnt variable S3BUCKET missing"); + + (void) argc; + (void) argv; + + ms3_library_init(); + ms3 = ms3_init(s3key, s3secret, s3region, s3host); + + if (s3noverify && !strcmp(s3noverify, "1")) + { + ms3_set_option(ms3, MS3_OPT_DISABLE_SSL_VERIFY, NULL); + } + + if (s3usehttp && !strcmp(s3usehttp, "1")) + { + ms3_set_option(ms3, MS3_OPT_USE_HTTP, NULL); + } + + if (s3port) + { + int port = atol(s3port); + ms3_set_option(ms3, MS3_OPT_PORT_NUMBER, &port); + } + +// ms3_debug(); + ASSERT_NOT_NULL(ms3); + + res = ms3_put(ms3, s3bucket, "☃/☃.☃", + (const uint8_t *)test_string, + strlen(test_string)); + ASSERT_EQ_(res, 0, "Result: %u", res); + + // A prefix that will give no results; + res = ms3_list(ms3, s3bucket, "asdfghjkl", &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_NULL_(list, "List not empty"); + + res = ms3_list(ms3, s3bucket, NULL, &list); + 
ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "☃/☃.☃", 11)) + { + found = true; + break; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Created file not found"); + + if (list_it) + { + ASSERT_EQ_(list_it->length, 32, "Created file is unexpected length"); + ASSERT_NEQ_(list_it->created, 0, "Created file timestamp is bad"); + } + else + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + // Retry list with V1 API + list_version = 1; + list = NULL; + ms3_set_option(ms3, MS3_OPT_FORCE_LIST_VERSION, &list_version); + res = ms3_list(ms3, s3bucket, NULL, &list); + ASSERT_EQ_(res, 0, "Result: %u", res); + found = false; + list_it = list; + + while (list_it) + { + if (!strncmp(list_it->key, "☃/☃.☃", 11)) + { + found = true; + break; + } + + list_it = list_it->next; + } + + ASSERT_EQ_(found, 1, "Created file not found"); + + if (list_it) + { + ASSERT_EQ_(list_it->length, 32, "Created file is unexpected length"); + ASSERT_NEQ_(list_it->created, 0, "Created file timestamp is bad"); + } + else + { + ASSERT_TRUE_(false, "No resuts from list"); + } + + res = ms3_get(ms3, s3bucket, "☃/☃.☃", &data, &length); + ASSERT_EQ_(res, 0, "Result: %u", res); + ASSERT_EQ(length, 32); + ASSERT_STREQ((char *)data, test_string); + + for (i = 0; i <= 3; i++) + { + res = ms3_status(ms3, s3bucket, "☃/☃.☃", &status); + + if (res == MS3_ERR_NOT_FOUND) + { + continue; + } + + ASSERT_EQ_(res, 0, "Result: %u", res); + + if (res == 0) + { + break; + } + } + + ASSERT_EQ(status.length, 32); + ASSERT_NEQ(status.created, 0); + res = ms3_delete(ms3, s3bucket, "☃/☃.☃"); + ASSERT_EQ_(res, 0, "Result: %u", res); + ms3_free(data); + res = ms3_get(ms3, s3bucket, "☃/☃.☃", &data, &length); + ASSERT_NEQ_(res, 0, "Object should error"); + ASSERT_NULL_(data, "Data should be NULL"); + ASSERT_EQ_(length, 0, "There should be no data"); + ms3_deinit(ms3); + ms3_library_deinit(); + return 0; +} diff --git 
a/storage/maria/libmarias3/version.h.in b/storage/maria/libmarias3/version.h.in new file mode 100644 index 00000000..c4da559e --- /dev/null +++ b/storage/maria/libmarias3/version.h.in @@ -0,0 +1,4 @@ +#pragma once + +#define LIBMARIAS3_VERSION_STRING "@LIBMARIAS3_VERSION_STRING@" +#define LIBMARIAS3_VERSION_HEX @LIBMARIAS3_VERSION_HEX@ diff --git a/storage/maria/libmarias3/yatl/include.am b/storage/maria/libmarias3/yatl/include.am new file mode 100644 index 00000000..933ab9c2 --- /dev/null +++ b/storage/maria/libmarias3/yatl/include.am @@ -0,0 +1,11 @@ +# vim:ft=automake +# Copyright (C) 2012 Data Differential +# All rights reserved. +# +# Use and distribution licensed under the BSD license. See +# the COPYING file in the parent directory for full text. +# +# included from Top Level Makefile.am +# All paths should be given relative to the root + +noinst_HEADERS+= yatl/lite.h diff --git a/storage/maria/libmarias3/yatl/lite.h b/storage/maria/libmarias3/yatl/lite.h new file mode 100644 index 00000000..4e2f4a1a --- /dev/null +++ b/storage/maria/libmarias3/yatl/lite.h @@ -0,0 +1,424 @@ +/* vim:expandtab:shiftwidth=2:tabstop=2:smarttab: + * + * Data Differential YATL (i.e. libtest) library + * + * Copyright (C) 2012 Data Differential, http://datadifferential.com/ + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * * The names of its contributors may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#pragma once + +#ifdef __cplusplus +# include <cstdarg> +# include <cstddef> +# include <cstdio> +# include <cstdlib> +# include <cstring> +#else +# include <stdarg.h> +# include <stdbool.h> +# include <stddef.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> +#endif + +#ifdef _WIN32 +# include <malloc.h> +#else +# include <alloca.h> +#endif + +#ifndef __PRETTY_FUNCTION__ +# define __PRETTY_FUNCTION__ __func__ +#endif + +#ifndef EXIT_SKIP +# define EXIT_SKIP 77 +#endif + +#ifndef YATL_FULL +# define YATL_FULL 0 +#endif + +#ifndef FAIL +# define FAIL(__message_format, ...) +#endif + +#ifndef SKIP +# define SKIP(__message_format, ...) 
+#endif + +static inline bool valgrind_is_caller(void) +{ + if (getenv("TESTS_ENVIRONMENT") + && strstr(getenv("TESTS_ENVIRONMENT"), "valgrind")) + { + return true; + } + + return false; +} + +static inline size_t yatl_strlen(const char *s) +{ + if (s) + { + return strlen(s); + } + + return 0UL; +} + +static inline int yatl_strcmp(const char *s1, const char *s2, size_t *s1_length, + size_t *s2_length) +{ + *s1_length = yatl_strlen(s1); + *s2_length = yatl_strlen(s2); + + if (*s1_length == 0 && *s1_length == *s2_length) + { + return 0; + } + + if (*s1_length == 0 && *s2_length) + { + return 1; + } + + if (*s1_length && *s2_length == 0) + { + return 1; + } + + return strcmp(s1, s2); +} + +#define SKIP_IF(__expression) \ +do \ +{ \ + if ((__expression)) { \ + if (YATL_FULL) { \ + SKIP(#__expression); \ + } \ + fprintf(stdout, "\n%s:%d: %s SKIP '!(%s)'\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expression); \ + exit(EXIT_SKIP); \ + } \ +} while (0) + +#define SKIP_IF_(__expression, ...) \ +do \ +{ \ + if ((__expression)) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + char *buffer; \ + ask++; \ + buffer= (char*)alloca(sizeof(char) * ask); \ + snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + SKIP(#__expression, buffer); \ + } \ + fprintf(stdout, "\n%s:%d: %s SKIP '%s' [ %s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expression, buffer); \ + exit(EXIT_SKIP); \ + } \ +} while (0) + +#define ASSERT_TRUE(__expression) \ +do \ +{ \ + if (! 
(__expression)) { \ + if (YATL_FULL) { \ + FAIL("Assertion '%s'", #__expression); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%s'\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expression);\ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_FALSE(__expression) \ +do \ +{ \ + if ((__expression)) { \ + if (YATL_FULL) { \ + FAIL("Assertion '!%s'", #__expression); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '!%s'\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expression);\ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_NULL_(__expression, ...) \ +do \ +{ \ + if ((__expression) != NULL) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + char *buffer; \ + ask++; \ + buffer= (char*)alloca(sizeof(char) * ask); \ + snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + FAIL("Assertion '%s' != NULL [ %s ]", #__expression, buffer);\ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%s' != NULL [ %s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expression, buffer);\ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_NOT_NULL(__expression) \ +do \ +{ \ + if ((__expression) == NULL) { \ + if (YATL_FULL) { \ + FAIL("Assertion '%s' == NULL", #__expression);\ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%s' == NULL\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expression);\ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_NOT_NULL_(__expression, ...) \ +do \ +{ \ + if ((__expression) == NULL) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + ask++; \ + char *buffer= (char*)alloca(sizeof(char) * ask); \ + snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + FAIL("Assertion '%s' == NULL [ %s ]", #__expression, buffer);\ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%s' == NULL [ %s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expression, buffer);\ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_TRUE_(__expression, ...) \ +do \ +{ \ + if (! 
(__expression)) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + char *buffer; \ + ask++; \ + buffer= (char*)alloca(sizeof(char) * ask); \ + snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + FAIL("Assertion '%s' [ %s ]", #__expression, buffer); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%s' [ %s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expression, buffer); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_EQ(__expected, __actual) \ +do \ +{ \ + if ((__expected) != (__actual)) { \ + if (YATL_FULL) { \ + FAIL("Assertion '%s' != '%s'", #__expected, #__actual); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%s' != '%s'\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expected, #__actual); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_EQ_(__expected, __actual, ...) \ +do \ +{ \ + if ((__expected) != (__actual)) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + char *buffer; \ + ask++; \ + buffer= (char*)alloca(sizeof(char) * ask); \ + snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + FAIL("Assertion '%s' != '%s' [ %s ]", #__expected, #__actual, buffer); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%s' != '%s' [ %s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expected, #__actual, buffer); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_STREQ(__expected_str, __actual_str) \ +do \ +{ \ + size_t __expected_length; \ + size_t __actual_length; \ + int ret= yatl_strcmp(__expected_str, __actual_str, &__expected_length, &__actual_length); \ + if (ret) { \ + if (YATL_FULL) { \ + FAIL("Assertion '%.*s' != '%.*s'\n", \ + (int)(__expected_length), (__expected_str), \ + (int)__actual_length, (__actual_str)) ; \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%.*s' != '%.*s'\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, \ + (int)(__expected_length), (__expected_str), \ + (int)__actual_length, (__actual_str)) ; \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define 
ASSERT_STREQ_(__expected_str, __actual_str, ...) \ +do \ +{ \ + size_t __expected_length; \ + size_t __actual_length; \ + int ret= yatl_strcmp(__expected_str, __actual_str, &__expected_length, &__actual_length); \ + if (ret) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + ask++; \ + char *buffer= (char*)alloca(sizeof(char) * ask); \ + ask= snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + FAIL("Assertion '%.*s' != '%.*s' [ %.*s ]", \ + (int)(__expected_length), (__expected_str), \ + (int)(__actual_length), (__actual_str), \ + (int)(ask), buffer); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%.*s' != '%.*s' [ %.*s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, \ + (int)(__expected_length), (__expected_str), \ + (int)(__actual_length), (__actual_str), \ + (int)(ask), buffer); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_STREQL_(__expected_str, __actual_str, __length,...) \ +do \ +{ \ + int ret= strncmp(__expected_str, __actual_str, __length); \ + if (ret) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + ask++; \ + char *buffer= (char*)alloca(sizeof(char) * ask); \ + ask= snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + FAIL("Assertion '%.*s' != '%.*s' [ %.*s ]", \ + (int)(__length), (__expected_str), \ + (int)(__length), (__actual_str), \ + (int)(ask), buffer); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%.*s' != '%.*s' [ %.*s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, \ + (int)(__length), (__expected_str), \ + (int)(__length), (__actual_str), \ + (int)(ask), buffer); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + + +#define ASSERT_STRNE(__expected_str, __actual_str) \ +do \ +{ \ + size_t __expected_length; \ + size_t __actual_length; \ + int ret= yatl_strcmp(__expected_str, __actual_str, &__expected_length, &__actual_length); \ + if (ret == 0) { \ + if (YATL_FULL) { \ + FAIL("Assertion '%.*s' == '%.*s'", \ + (int)(__expected_length), (__expected_str), \ + (int)__actual_length, (__actual_str)) ; \ + } \ + 
fprintf(stderr, "\n%s:%d: %s Assertion '%.*s' == '%.*s'\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, \ + (int)(__expected_length), (__expected_str), \ + (int)__actual_length, (__actual_str)) ; \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_STRNE_(__expected_str, __actual_str, ...) \ +do \ +{ \ + size_t __expected_length; \ + size_t __actual_length; \ + int ret= yatl_strcmp(__expected_str, __actual_str, &__expected_length, &__actual_length); \ + if (ret == 0) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + ask++; \ + char *buffer= (char*)alloca(sizeof(char) * ask); \ + ask= snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + FAIL("Assertion '%.*s' == '%.*s' [ %.*s ]", \ + (int)(__expected_length), (__expected_str), \ + (int)(__actual_length), (__actual_str), \ + (int)(ask), buffer); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%.*s' == '%.*s' [ %.*s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, \ + (int)(__expected_length), (__expected_str), \ + (int)(__actual_length), (__actual_str), \ + (int)(ask), buffer); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_NEQ(__expected, __actual) \ +do \ +{ \ + if ((__expected) == (__actual)) { \ + if (YATL_FULL) { \ + FAIL("Assertion '%s' == '%s'", #__expected, #__actual); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%s' == '%s'\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expected, #__actual); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_NEQ_(__expected, __actual, ...) 
\ +do \ +{ \ + if ((__expected) == (__actual)) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + char *buffer; \ + ask++; \ + buffer= (char*)alloca(sizeof(char) * ask); \ + snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + FAIL("Assertion '%s' == '%s' [ %s ]", #__expected, #__actual, buffer); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '%s' == '%s' [ %s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expected, #__actual, buffer); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define ASSERT_FALSE_(__expression, ...) \ +do \ +{ \ + if ((__expression)) { \ + size_t ask= snprintf(0, 0, __VA_ARGS__); \ + ask++; \ + char *buffer= (char*)alloca(sizeof(char) * ask); \ + snprintf(buffer, ask, __VA_ARGS__); \ + if (YATL_FULL) { \ + FAIL("Assertion '!%s' [ %s ]", #__expression, buffer); \ + } \ + fprintf(stderr, "\n%s:%d: %s Assertion '!%s' [ %s ]\n", __FILE__, __LINE__, __PRETTY_FUNCTION__, #__expression, buffer); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) diff --git a/storage/maria/lockman.c b/storage/maria/lockman.c new file mode 100644 index 00000000..4cf6a46e --- /dev/null +++ b/storage/maria/lockman.c @@ -0,0 +1,776 @@ +/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */ +/* QQ: TODO instant duration locks */ +/* QQ: #warning automatically place S instead of LS if possible */ + +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Generic Lock Manager + + Lock manager handles locks on "resources", a resource must be uniquely + identified by a 64-bit number. Lock manager itself does not imply + anything about the nature of a resource - it can be a row, a table, a + database, or just anything. + + Locks belong to "lock owners". A Lock owner is uniquely identified by a + 16-bit number. A function loid_to_lo must be provided by the application + that takes such a number as an argument and returns a LOCK_OWNER + structure. + + Lock levels are completely defined by three tables. Lock compatibility + matrix specifies which locks can be held at the same time on a resource. + Lock combining matrix specifies what lock level has the same behaviour as + a pair of two locks of given levels. getlock_result matrix simplifies + intention locking and lock escalation for an application, basically it + defines which locks are intention locks and which locks are "loose" + locks. It is only used to provide better diagnostics for the + application, lock manager itself does not differentiate between normal, + intention, and loose locks. + + Internally lock manager is based on a lock-free hash, see lf_hash.c for + details. All locks are stored in a hash, with a resource id as a search + key, so all locks for the same resource will be considered collisions and + will be put in one (lock-free) linked list. The main lock-handling + logic is in the inner loop that searches for a lock in such a linked + list - lockfind(). + + This works as follows. Locks generally are added to the end of the list + (with one exception, see below). 
When scanning the list it is always + possible to determine what locks are granted (active) and what locks are + waiting - first lock is obviously active, the second is active if it's + compatible with the first, and so on, a lock is active if it's compatible + with all previous locks and all locks before it are also active. + To calculate the "compatible with all previous locks" all locks are + accumulated in prev_lock variable using lock_combining_matrix. + + Lock upgrades: when a thread that has a lock on a given resource, + requests a new lock on the same resource and the old lock is not enough + to satisfy new lock requirements (which is defined by + lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock is + placed in the list. Depending on other locks it is immediately active or + it will wait for other locks. Here's an exception to "locks are added + to the end" rule - upgraded locks are added after the last active lock + but before all waiting locks. Old lock (the one we upgraded from) is + not removed from the list, indeed it may be needed if the new lock was + in a savepoint that gets rolled back. So old lock is marked as "ignored" + (IGNORE_ME flag). New lock gets an UPGRADED flag. + + Loose locks add an important exception to the above. Loose locks do not + always commute with other locks. In the list IX-LS both locks are active, + while in the LS-IX list only the first lock is active. This creates a + problem in lock upgrades. If the list was IX-LS and the owner of the + first lock wants to place LS lock (which can be immediately granted), the + IX lock is upgraded to LSIX and the list becomes IX-LS-LSIX, which, + according to the lock compatibility matrix means that the last lock is + waiting - of course it all happened because IX and LS were swapped and + they don't commute. 
To work around this there's ACTIVE flag which is set + in every lock that never waited (was placed active), and this flag + overrides "compatible with all previous locks" rule. + + When a lock is placed to the end of the list it's either compatible with + all locks and all locks are active - new lock becomes active at once, or + it conflicts with some of the locks, in this case in the 'blocker' + variable a conflicting lock is returned and the calling thread waits on a + pthread condition in the LOCK_OWNER structure of the owner of the + conflicting lock. Or a new lock is compatible with all locks, but some + existing locks are not compatible with each other (example: request IS, + when the list is S-IX) - that is not all locks are active. In this case a + first waiting lock is returned in the 'blocker' variable, lockman_getlock() + notices that a "blocker" does not conflict with the requested lock, and + "dereferences" it, to find the lock that it's waiting on. The calling + thread then begins to wait on the same lock. + + To better support table-row relations where one needs to lock the table + with an intention lock before locking the row, extended diagnostics are + provided. When an intention lock (presumably on a table) is granted, + lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row, + perhaps the thread already has a normal lock on this table), + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual), + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check + whether it's possible to lock the row, but no need to lock it - perhaps + the thread has a loose lock on this table). This is defined by + getlock_result[] table. +*/ + +#include <my_global.h> +#include <my_sys.h> +#include <my_bit.h> +#include <lf.h> +#include "my_cpu.h" +#include "lockman.h" + +/* + Lock compatibility matrix. + + It's asymmetric. 
Read it as "Somebody has the lock <value in the row + label>, can I set the lock <value in the column label> ?" + + ') Though you can take LS lock while somebody has S lock, it makes no + sense - it's simpler to take S lock too. + + 1 - compatible + 0 - incompatible + -1 - "impossible", so that we can assert the impossibility. +*/ +static int lock_compatibility_matrix[10][10]= +{ /* N S X IS IX SIX LS LX SLX LSIX */ + { -1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* S */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* X */ + { -1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, /* IS */ + { -1, 0, 0, 1, 1, 0, 1, 1, 0, 1 }, /* IX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 }, /* SIX */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* LS */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* LX */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* SLX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 } /* LSIX */ +}; + +/* + Lock combining matrix. + + It's symmetric. Read it as "what lock level L is identical to the + set of two locks A and B" + + One should never get N from it, we assert the impossibility +*/ +static enum lockman_lock_type lock_combining_matrix[10][10]= +{/* N S X IS IX SIX LS LX SLX LSIX */ + { N, S, X, IS, IX, SIX, S, SLX, SLX, SIX}, /* N */ + { S, S, X, S, SIX, SIX, S, SLX, SLX, SIX}, /* S */ + { X, X, X, X, X, X, X, X, X, X}, /* X */ + { IS, S, X, IS, IX, SIX, LS, LX, SLX, LSIX}, /* IS */ + { IX, SIX, X, IX, IX, SIX, LSIX, LX, SLX, LSIX}, /* IX */ + { SIX, SIX, X, SIX, SIX, SIX, SIX, SLX, SLX, SIX}, /* SIX */ + { LS, S, X, LS, LSIX, SIX, LS, LX, SLX, LSIX}, /* LS */ + { LX, SLX, X, LX, LX, SLX, LX, LX, SLX, LX}, /* LX */ + { SLX, SLX, X, SLX, SLX, SLX, SLX, SLX, SLX, SLX}, /* SLX */ + { LSIX, SIX, X, LSIX, LSIX, SIX, LSIX, LX, SLX, LSIX} /* LSIX */ +}; + +#define REPEAT_ONCE_MORE 0 +#define OK_TO_PLACE_THE_LOCK 1 +#define OK_TO_PLACE_THE_REQUEST 2 +#define ALREADY_HAVE_THE_LOCK 4 +#define ALREADY_HAVE_THE_REQUEST 8 +#define PLACE_NEW_DISABLE_OLD 16 +#define REQUEST_NEW_DISABLE_OLD 32 
+#define RESOURCE_WAS_UNLOCKED 64 + +#define NEED_TO_WAIT (OK_TO_PLACE_THE_REQUEST | ALREADY_HAVE_THE_REQUEST |\ + REQUEST_NEW_DISABLE_OLD) +#define ALREADY_HAVE (ALREADY_HAVE_THE_LOCK | ALREADY_HAVE_THE_REQUEST) +#define LOCK_UPGRADE (PLACE_NEW_DISABLE_OLD | REQUEST_NEW_DISABLE_OLD) + + +/* + the return codes for lockman_getlock + + It's asymmetric. Read it as "I have the lock <value in the row label>, + what value should be returned for <value in the column label> ?" + + 0 means impossible combination (assert!) + + Defines below help to preserve the table structure. + I/L/A values are self explanatory + x means the combination is possible (assert should not crash) + but it cannot happen in row locks, only in table locks (S,X), + or lock escalations (LS,LX) +*/ +#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE +#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +#define A GOT_THE_LOCK +#define x GOT_THE_LOCK +static enum lockman_getlock_result getlock_result[10][10]= +{/* N S X IS IX SIX LS LX SLX LSIX */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* N */ + { 0, x, 0, A, 0, 0, x, 0, 0, 0}, /* S */ + { 0, x, x, A, A, 0, x, x, 0, 0}, /* X */ + { 0, 0, 0, I, 0, 0, 0, 0, 0, 0}, /* IS */ + { 0, 0, 0, I, I, 0, 0, 0, 0, 0}, /* IX */ + { 0, x, 0, A, I, 0, x, 0, 0, 0}, /* SIX */ + { 0, 0, 0, L, 0, 0, x, 0, 0, 0}, /* LS */ + { 0, 0, 0, L, L, 0, x, x, 0, 0}, /* LX */ + { 0, x, 0, A, L, 0, x, x, 0, 0}, /* SLX */ + { 0, 0, 0, L, I, 0, x, 0, 0, 0} /* LSIX */ +}; +#undef I +#undef L +#undef A +#undef x + +typedef struct lockman_lock { + uint64 resource; + struct lockman_lock *lonext; + intptr volatile link; + uint32 hashnr; + /* QQ: TODO - remove hashnr from LOCK */ + uint16 loid; + uchar lock; /* sizeof(uchar) <= sizeof(enum) */ + uchar flags; +} LOCK; + +#define IGNORE_ME 1 +#define UPGRADED 2 +#define ACTIVE 4 + +typedef struct { + intptr volatile *prev; + LOCK *curr, *next; + LOCK *blocker, *upgrade_from; +} CURSOR; + +#define PTR(V) (LOCK *)((V) & (~(intptr)1)) +#define 
DELETED(V) ((V) & 1) + +/* + NOTE + cursor is positioned in either case + pins[0..3] are used, they are NOT removed on return +*/ +static int lockfind(LOCK * volatile *head, LOCK *node, + CURSOR *cursor, LF_PINS *pins) +{ + uint32 hashnr, cur_hashnr; + uint64 resource, cur_resource; + intptr cur_link; + my_bool cur_active, compatible, upgrading, prev_active; + enum lockman_lock_type lock, prev_lock, cur_lock; + uint16 loid, cur_loid; + int cur_flags, flags; + + hashnr= node->hashnr; + resource= node->resource; + lock= node->lock; + loid= node->loid; + flags= node->flags; + +retry: + cursor->prev= (intptr *)head; + prev_lock= N; + cur_active= TRUE; + compatible= TRUE; + upgrading= FALSE; + cursor->blocker= cursor->upgrade_from= 0; + lf_unpin(pins, 3); + do { + cursor->curr= PTR(*cursor->prev); + lf_pin(pins, 1, cursor->curr); + } while(*cursor->prev != (intptr)cursor->curr && LF_BACKOFF()); + for (;;) + { + if (!cursor->curr) + break; + do { + cur_link= cursor->curr->link; + cursor->next= PTR(cur_link); + lf_pin(pins, 0, cursor->next); + } while (cur_link != cursor->curr->link && LF_BACKOFF()); + cur_hashnr= cursor->curr->hashnr; + cur_resource= cursor->curr->resource; + cur_lock= cursor->curr->lock; + cur_loid= cursor->curr->loid; + cur_flags= cursor->curr->flags; + if (*cursor->prev != (intptr)cursor->curr) + { + (void)LF_BACKOFF(); + goto retry; + } + if (!DELETED(cur_link)) + { + if (cur_hashnr > hashnr || + (cur_hashnr == hashnr && cur_resource >= resource)) + { + if (cur_hashnr > hashnr || cur_resource > resource) + break; + /* ok, we have a lock for this resource */ + DBUG_ASSERT(lock_compatibility_matrix[prev_lock][cur_lock] >= 0); + DBUG_ASSERT(lock_compatibility_matrix[cur_lock][lock] >= 0); + if ((cur_flags & IGNORE_ME) && ! 
(flags & IGNORE_ME)) + { + DBUG_ASSERT(cur_active); + if (cur_loid == loid) + cursor->upgrade_from= cursor->curr; + } + else + { + prev_active= cur_active; + if (cur_flags & ACTIVE) + DBUG_ASSERT(prev_active == TRUE); + else + cur_active&= lock_compatibility_matrix[prev_lock][cur_lock]; + if (upgrading && !cur_active /*&& !(cur_flags & UPGRADED)*/) + break; + if (prev_active && !cur_active) + { + cursor->blocker= cursor->curr; + lf_pin(pins, 3, cursor->curr); + } + if (cur_loid == loid) + { + /* we already have a lock on this resource */ + DBUG_ASSERT(lock_combining_matrix[cur_lock][lock] != N); + DBUG_ASSERT(!upgrading || (flags & IGNORE_ME)); + if (lock_combining_matrix[cur_lock][lock] == cur_lock) + { + /* new lock is compatible */ + if (cur_active) + { + cursor->blocker= cursor->curr; /* loose-locks! */ + lf_unpin(pins, 3); /* loose-locks! */ + return ALREADY_HAVE_THE_LOCK; + } + else + return ALREADY_HAVE_THE_REQUEST; + } + /* not compatible, upgrading */ + upgrading= TRUE; + cursor->upgrade_from= cursor->curr; + } + else + { + if (!lock_compatibility_matrix[cur_lock][lock]) + { + compatible= FALSE; + cursor->blocker= cursor->curr; + lf_pin(pins, 3, cursor->curr); + } + } + prev_lock= lock_combining_matrix[prev_lock][cur_lock]; + DBUG_ASSERT(prev_lock != N); + } + } + cursor->prev= &(cursor->curr->link); + lf_pin(pins, 2, cursor->curr); + } + else + { + if (my_atomic_casptr((void **)cursor->prev, + (void **)(char*) &cursor->curr, cursor->next)) + lf_alloc_free(pins, cursor->curr); + else + { + (void)LF_BACKOFF(); + goto retry; + } + } + cursor->curr= cursor->next; + lf_pin(pins, 1, cursor->curr); + } + /* + either the end of lock list - no more locks for this resource, + or upgrading and the end of active lock list + */ + if (upgrading) + { + if (compatible /*&& prev_active*/) + return PLACE_NEW_DISABLE_OLD; + else + return REQUEST_NEW_DISABLE_OLD; + } + if (cur_active && compatible) + { + /* + either no locks for this resource or all are compatible. 
+ ok to place the lock in any case. + */ + return prev_lock == N ? RESOURCE_WAS_UNLOCKED + : OK_TO_PLACE_THE_LOCK; + } + /* we have a lock conflict. ok to place a lock request. And wait */ + return OK_TO_PLACE_THE_REQUEST; +} + +/* + NOTE + it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays +*/ +static int lockinsert(LOCK * volatile *head, LOCK *node, LF_PINS *pins, + LOCK **blocker) +{ + CURSOR cursor; + int res; + + do + { + res= lockfind(head, node, &cursor, pins); + DBUG_ASSERT(res != ALREADY_HAVE_THE_REQUEST); + if (!(res & ALREADY_HAVE)) + { + if (res & LOCK_UPGRADE) + { + node->flags|= UPGRADED; + node->lock= lock_combining_matrix[cursor.upgrade_from->lock][node->lock]; + } + if (!(res & NEED_TO_WAIT)) + node->flags|= ACTIVE; + node->link= (intptr)cursor.curr; + DBUG_ASSERT(node->link != (intptr)node); + DBUG_ASSERT(cursor.prev != &node->link); + if (!my_atomic_casptr((void **)cursor.prev, + (void **)(char*) &cursor.curr, node)) + { + res= REPEAT_ONCE_MORE; + node->flags&= ~ACTIVE; + } + if (res & LOCK_UPGRADE) + cursor.upgrade_from->flags|= IGNORE_ME; + /* + QQ: is this OK ? if a reader has already read upgrade_from, + it may find it conflicting with node :( + - see the last test from test_lockman_simple() + */ + } + + } while (res == REPEAT_ONCE_MORE); + lf_unpin(pins, 0); + lf_unpin(pins, 1); + lf_unpin(pins, 2); + /* + note that blocker is not necessarily pinned here (when it's == curr). 
+ this is ok as in such a case it's either a dummy node for + initialize_bucket() and dummy nodes don't need pinning, + or it's a lock of the same transaction for lockman_getlock, + and it cannot be removed by another thread + */ + *blocker= cursor.blocker; + return res; +} + +/* + NOTE + it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays +*/ +static int lockpeek(LOCK * volatile *head, LOCK *node, LF_PINS *pins, + LOCK **blocker) +{ + CURSOR cursor; + int res; + + res= lockfind(head, node, &cursor, pins); + + lf_unpin(pins, 0); + lf_unpin(pins, 1); + lf_unpin(pins, 2); + if (blocker) + *blocker= cursor.blocker; + return res; +} + +/* + NOTE + it uses pins[0..3], on return all pins are removed. + + One _must_ have the lock (or request) to call this +*/ +static int lockdelete(LOCK * volatile *head, LOCK *node, LF_PINS *pins) +{ + CURSOR cursor; + int res; + + do + { + res= lockfind(head, node, &cursor, pins); + DBUG_ASSERT(res & ALREADY_HAVE); + + if (cursor.upgrade_from) + cursor.upgrade_from->flags&= ~IGNORE_ME; + + /* + XXX this does not work with savepoints, as old lock is left ignored. + It cannot be unignored, as would basically mean moving the lock back + in the lock chain (from upgraded). And the latter is not allowed - + because it breaks list scanning. So old ignored lock must be deleted, + new - same - lock must be installed right after the lock we're deleting, + then we can delete. Good news is - this is only required when rolling + back a savepoint. 
+ */ + if (my_atomic_casptr((void **)(char*)&(cursor.curr->link), + (void **)(char*)&cursor.next, 1+(char *)cursor.next)) + { + if (my_atomic_casptr((void **)cursor.prev, + (void **)(char*)&cursor.curr, cursor.next)) + lf_alloc_free(pins, cursor.curr); + else + lockfind(head, node, &cursor, pins); + } + else + { + res= REPEAT_ONCE_MORE; + if (cursor.upgrade_from) + cursor.upgrade_from->flags|= IGNORE_ME; + } + } while (res == REPEAT_ONCE_MORE); + lf_unpin(pins, 0); + lf_unpin(pins, 1); + lf_unpin(pins, 2); + lf_unpin(pins, 3); + return res; +} + +void lockman_init(LOCKMAN *lm, loid_to_lo_func *func, uint timeout) +{ + lf_alloc_init(&lm->alloc, sizeof(LOCK), offsetof(LOCK, lonext)); + lf_dynarray_init(&lm->array, sizeof(LOCK **)); + lm->size= 1; + lm->count= 0; + lm->loid_to_lo= func; + lm->lock_timeout= timeout; +} + +void lockman_destroy(LOCKMAN *lm) +{ + LOCK *el= *(LOCK **)lf_dynarray_lvalue(&lm->array, 0); + while (el) + { + intptr next= el->link; + if (el->hashnr & 1) + lf_alloc_direct_free(&lm->alloc, el); + else + my_free((void *)el); + el= (LOCK *)next; + } + lf_alloc_destroy(&lm->alloc); + lf_dynarray_destroy(&lm->array); +} + +/* TODO: optimize it */ +#define MAX_LOAD 1 + +static void initialize_bucket(LOCKMAN *lm, LOCK * volatile *node, + uint bucket, LF_PINS *pins) +{ + int res; + uint parent= my_clear_highest_bit(bucket); + LOCK *dummy= (LOCK *)my_malloc(PSI_INSTRUMENT_ME, sizeof(LOCK), MYF(MY_WME)); + LOCK **tmp= 0, *cur; + LOCK * volatile *el= lf_dynarray_lvalue(&lm->array, parent); + + if (*el == NULL && bucket) + initialize_bucket(lm, el, parent, pins); + dummy->hashnr= my_reverse_bits(bucket); + dummy->loid= 0; + dummy->lock= X; /* doesn't matter, in fact */ + dummy->resource= 0; + dummy->flags= 0; + res= lockinsert(el, dummy, pins, &cur); + DBUG_ASSERT(res & (ALREADY_HAVE_THE_LOCK | RESOURCE_WAS_UNLOCKED)); + if (res & ALREADY_HAVE_THE_LOCK) + { + my_free((void *)dummy); + dummy= cur; + } + my_atomic_casptr((void **)node, (void **)(char*) &tmp, 
dummy); +} + +static inline uint calc_hash(uint64 resource) +{ + const uchar *pos= (uchar *)&resource; + ulong nr1= 1, nr2= 4, i; + for (i= 0; i < sizeof(resource) ; i++, pos++) + { + nr1^= (ulong) ((((uint) nr1 & 63)+nr2) * ((uint)*pos)) + (nr1 << 8); + nr2+= 3; + } + return nr1 & INT_MAX32; +} + +/* + RETURN + see enum lockman_getlock_result + NOTE + uses pins[0..3], they're removed on return +*/ +enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo, + uint64 resource, + enum lockman_lock_type lock) +{ + int res; + uint csize, bucket, hashnr; + LOCK *node, * volatile *el, *blocker; + LF_PINS *pins= lo->pins; + enum lockman_lock_type old_lock; + + DBUG_ASSERT(lo->loid); + node= (LOCK *)lf_alloc_new(pins); + node->flags= 0; + node->lock= lock; + node->loid= lo->loid; + node->resource= resource; + hashnr= calc_hash(resource); + bucket= hashnr % lm->size; + el= lf_dynarray_lvalue(&lm->array, bucket); + if (*el == NULL) + initialize_bucket(lm, el, bucket, pins); + node->hashnr= my_reverse_bits(hashnr) | 1; + res= lockinsert(el, node, pins, &blocker); + if (res & ALREADY_HAVE) + { + int r; + old_lock= blocker->lock; + lf_alloc_free(pins, node); + r= getlock_result[old_lock][lock]; + DBUG_ASSERT(r); + return r; + } + /* a new value was added to the hash */ + csize= lm->size; + if ((my_atomic_add32(&lm->count, 1)+1.0) / csize > MAX_LOAD) + my_atomic_cas32(&lm->size, (int*) &csize, csize*2); + node->lonext= lo->all_locks; + lo->all_locks= node; + for ( ; res & NEED_TO_WAIT; res= lockpeek(el, node, pins, &blocker)) + { + LOCK_OWNER *wait_for_lo; + ulonglong deadline; + struct timespec timeout; + + lf_assert_pin(pins, 3); /* blocker must be pinned here */ + wait_for_lo= lm->loid_to_lo(blocker->loid); + + /* + now, this is tricky. blocker is not necessarily a LOCK + we're waiting for. 
If it's compatible with what we want, + then we're waiting for a lock that blocker is waiting for + (see two places where blocker is set in lockfind) + In the latter case, let's "dereference" it + */ + if (lock_compatibility_matrix[blocker->lock][lock]) + { + blocker= wait_for_lo->all_locks; + lf_pin(pins, 3, blocker); + if (blocker != wait_for_lo->all_locks) + continue; + wait_for_lo= wait_for_lo->waiting_for; + } + + /* + note that the blocker transaction may have ended by now, + its LOCK_OWNER and short id were reused, so 'wait_for_lo' may point + to an unrelated - albeit valid - LOCK_OWNER + */ + if (!wait_for_lo) + continue; + + lo->waiting_for= wait_for_lo; + + /* + We lock a mutex - it may belong to a wrong LOCK_OWNER, but it must + belong to _some_ LOCK_OWNER. It means, we can never free() a LOCK_OWNER, + if there're other active LOCK_OWNERs. + */ + /* QQ: race condition here */ + pthread_mutex_lock(wait_for_lo->mutex); + if (DELETED(blocker->link)) + { + /* + blocker transaction was ended, or a savepoint that owned + the lock was rolled back. Either way - the lock was removed + */ + pthread_mutex_unlock(wait_for_lo->mutex); + continue; + } + + /* yuck. waiting */ + deadline= my_hrtime().val*1000 + lm->lock_timeout * 1000000; + set_timespec_time_nsec(timeout, deadline); + do + { + pthread_cond_timedwait(wait_for_lo->cond, wait_for_lo->mutex, &timeout); + } while (!DELETED(blocker->link) && my_hrtime().val < deadline/1000); + pthread_mutex_unlock(wait_for_lo->mutex); + if (!DELETED(blocker->link)) + { + /* + timeout. + note that we _don't_ release the lock request here. 
+ Instead we're relying on the caller to abort the transaction, + and release all locks at once - see lockman_release_locks() + */ + lf_unpin(pins, 3); + return DIDNT_GET_THE_LOCK; + } + } + lo->waiting_for= 0; + lf_assert_unpin(pins, 3); /* unpin should not be needed */ + return getlock_result[lock][lock]; +} + +/* + RETURN + 0 - deleted + 1 - didn't (not found) + NOTE + see lockdelete() for pin usage notes +*/ +int lockman_release_locks(LOCKMAN *lm, LOCK_OWNER *lo) +{ + LOCK * volatile *el, *node, *next; + uint bucket; + LF_PINS *pins= lo->pins; + + pthread_mutex_lock(lo->mutex); + for (node= lo->all_locks; node; node= next) + { + next= node->lonext; + bucket= calc_hash(node->resource) % lm->size; + el= lf_dynarray_lvalue(&lm->array, bucket); + if (*el == NULL) + initialize_bucket(lm, el, bucket, pins); + lockdelete(el, node, pins); + my_atomic_add32(&lm->count, -1); + } + lo->all_locks= 0; + /* now signal all waiters */ + pthread_cond_broadcast(lo->cond); + pthread_mutex_unlock(lo->mutex); + return 0; +} + +#ifdef MY_LF_EXTRA_DEBUG +static const char *lock2str[]= +{ "N", "S", "X", "IS", "IX", "SIX", "LS", "LX", "SLX", "LSIX" }; +/* + NOTE + the function below is NOT thread-safe !!! 
+*/ +void print_lockhash(LOCKMAN *lm) +{ + LOCK *el= *(LOCK **)lf_dynarray_lvalue(&lm->array, 0); + printf("hash: size %u count %u\n", lm->size, lm->count); + while (el) + { + intptr next= el->link; + if (el->hashnr & 1) + { + printf("0x%08lx { resource %lu, loid %u, lock %s", + (long) el->hashnr, (ulong) el->resource, el->loid, + lock2str[el->lock]); + if (el->flags & IGNORE_ME) printf(" IGNORE_ME"); + if (el->flags & UPGRADED) printf(" UPGRADED"); + if (el->flags & ACTIVE) printf(" ACTIVE"); + if (DELETED(next)) printf(" ***DELETED***"); + printf("}\n"); + } + else + { + /*printf("0x%08x { dummy }\n", el->hashnr);*/ + DBUG_ASSERT(el->resource == 0 && el->loid == 0 && el->lock == X); + } + el= PTR(next); + } +} +#endif diff --git a/storage/maria/lockman.h b/storage/maria/lockman.h new file mode 100644 index 00000000..35d0d8f5 --- /dev/null +++ b/storage/maria/lockman.h @@ -0,0 +1,76 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef _lockman_h +#define _lockman_h + +/* + Lock levels: + ^^^^^^^^^^^ + + N - "no lock", not a lock, used sometimes internally to simplify the code + S - Shared + X - eXclusive + IS - Intention Shared + IX - Intention eXclusive + SIX - Shared + Intention eXclusive + LS - Loose Shared + LX - Loose eXclusive + SLX - Shared + Loose eXclusive + LSIX - Loose Shared + Intention eXclusive +*/ +enum lockman_lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST }; + +struct lockman_lock; + +typedef struct st_lock_owner LOCK_OWNER; +struct st_lock_owner { + LF_PINS *pins; /* must be allocated from lockman's pinbox */ + struct lockman_lock *all_locks; /* a LIFO */ + LOCK_OWNER *waiting_for; + pthread_cond_t *cond; /* transactions waiting for this, wait on 'cond' */ + pthread_mutex_t *mutex; /* mutex is required to use 'cond' */ + uint16 loid; +}; + +typedef LOCK_OWNER *loid_to_lo_func(uint16); +typedef struct { + LF_DYNARRAY array; /* hash itself */ + LF_ALLOCATOR alloc; /* allocator for elements */ + int32 volatile size; /* size of array */ + int32 volatile count; /* number of elements in the hash */ + uint lock_timeout; + loid_to_lo_func *loid_to_lo; +} LOCKMAN; +#define DIDNT_GET_THE_LOCK 0 +enum lockman_getlock_result { + NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT, + GOT_THE_LOCK, + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE, + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +}; + +void lockman_init(LOCKMAN *, loid_to_lo_func *, uint); +void lockman_destroy(LOCKMAN *); +enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo, + uint64 resource, + enum lockman_lock_type lock); +int lockman_release_locks(LOCKMAN *, LOCK_OWNER *); + +#ifdef EXTRA_DEBUG +void print_lockhash(LOCKMAN *lm); +#endif + +#endif diff --git 
a/storage/maria/ma_backup.c b/storage/maria/ma_backup.c new file mode 100644 index 00000000..0384dfb4 --- /dev/null +++ b/storage/maria/ma_backup.c @@ -0,0 +1,278 @@ +/* Copyright (C) 2018, 2020 MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ + +/* Code for doing backups of Aria tables */ + +#include "maria_def.h" +#include "ma_blockrec.h" /* PAGE_SUFFIX_SIZE */ +#include "ma_checkpoint.h" +#include <aria_backup.h> + +/** + @brief Get capabilities for an Aria table + + @param kfile key file (.MAI) + @param cap Capabilities are stored here + + @return 0 ok + @return X errno +*/ + +int aria_get_capabilities(File kfile, ARIA_TABLE_CAPABILITIES *cap)__attribute__((visibility("default"))) ; +int aria_get_capabilities(File kfile, ARIA_TABLE_CAPABILITIES *cap) +{ + MARIA_SHARE share; + int error= 0; + uint head_length= sizeof(share.state.header), base_pos; + uint aligned_bit_blocks; + size_t info_length; + uchar *disc_cache; + DBUG_ENTER("aria_get_capabilities"); + + bzero(cap, sizeof(*cap)); + if (my_pread(kfile,share.state.header.file_version, head_length, 0, + MYF(MY_NABP))) + DBUG_RETURN(HA_ERR_NOT_A_TABLE); + + if (memcmp(share.state.header.file_version, maria_file_magic, 4)) + DBUG_RETURN(HA_ERR_NOT_A_TABLE); + + share.options= mi_uint2korr(share.state.header.options); + + info_length= mi_uint2korr(share.state.header.header_length); + base_pos= 
mi_uint2korr(share.state.header.base_pos); + + /* + Allocate space for header information and for data that is too + big to keep on stack + */ + if (!(disc_cache= my_malloc(PSI_NOT_INSTRUMENTED, info_length, MYF(MY_WME)))) + DBUG_RETURN(ENOMEM); + + if (my_pread(kfile, disc_cache, info_length, 0L, MYF(MY_NABP))) + { + error= my_errno; + goto err; + } + _ma_base_info_read(disc_cache + base_pos, &share.base); + cap->transactional= share.base.born_transactional; + cap->checksum= MY_TEST(share.options & HA_OPTION_PAGE_CHECKSUM); + cap->online_backup_safe= cap->transactional && cap->checksum; + cap->header_size= share.base.keystart; + cap->keypage_header= ((share.base.born_transactional ? + LSN_STORE_SIZE + TRANSID_SIZE : + 0) + KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE + + KEYPAGE_USED_SIZE); + cap->block_size= share.base.block_size; + cap->data_file_type= share.state.header.data_file_type; + cap->s3_block_size= share.base.s3_block_size; + cap->compression= share.base.compression_algorithm; + cap->encrypted= MY_TEST(share.base.extra_options & + MA_EXTRA_OPTIONS_ENCRYPTED); + + if (share.state.header.data_file_type == BLOCK_RECORD) + { + /* Calulate how man pages the row bitmap covers. 
From _ma_bitmap_init() */ + aligned_bit_blocks= (cap->block_size - PAGE_SUFFIX_SIZE) / 6; + /* + In each 6 bytes, we have 6*8/3 = 16 pages covered + The +1 is to add the bitmap page, as this doesn't have to be covered + */ + cap->bitmap_pages_covered= aligned_bit_blocks * 16 + 1; + } + + /* Do a check that that we got things right */ + if (share.state.header.data_file_type != BLOCK_RECORD && + cap->online_backup_safe) + error= HA_ERR_NOT_A_TABLE; + +err: + my_free(disc_cache); + DBUG_RETURN(error); +} /* aria_get_capabilities */ + +/**************************************************************************** +** store MARIA_BASE_INFO +****************************************************************************/ + +uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base) +{ + bmove(base->uuid, ptr, MY_UUID_SIZE); ptr+= MY_UUID_SIZE; + base->keystart= mi_sizekorr(ptr); ptr+= 8; + base->max_data_file_length= mi_sizekorr(ptr); ptr+= 8; + base->max_key_file_length= mi_sizekorr(ptr); ptr+= 8; + base->records= (ha_rows) mi_sizekorr(ptr); ptr+= 8; + base->reloc= (ha_rows) mi_sizekorr(ptr); ptr+= 8; + base->mean_row_length= mi_uint4korr(ptr); ptr+= 4; + base->reclength= mi_uint4korr(ptr); ptr+= 4; + base->pack_reclength= mi_uint4korr(ptr); ptr+= 4; + base->min_pack_length= mi_uint4korr(ptr); ptr+= 4; + base->max_pack_length= mi_uint4korr(ptr); ptr+= 4; + base->min_block_length= mi_uint4korr(ptr); ptr+= 4; + base->fields= mi_uint2korr(ptr); ptr+= 2; + base->fixed_not_null_fields= mi_uint2korr(ptr); ptr+= 2; + base->fixed_not_null_fields_length= mi_uint2korr(ptr);ptr+= 2; + base->max_field_lengths= mi_uint2korr(ptr); ptr+= 2; + base->pack_fields= mi_uint2korr(ptr); ptr+= 2; + base->extra_options= mi_uint2korr(ptr); ptr+= 2; + base->null_bytes= mi_uint2korr(ptr); ptr+= 2; + base->original_null_bytes= mi_uint2korr(ptr); ptr+= 2; + base->field_offsets= mi_uint2korr(ptr); ptr+= 2; + base->language= mi_uint2korr(ptr); ptr+= 2; + base->block_size= mi_uint2korr(ptr); ptr+= 2; + 
+ base->rec_reflength= *ptr++; + base->key_reflength= *ptr++; + base->keys= *ptr++; + base->auto_key= *ptr++; + base->born_transactional= *ptr++; + base->compression_algorithm= *ptr++; + base->pack_bytes= mi_uint2korr(ptr); ptr+= 2; + base->blobs= mi_uint2korr(ptr); ptr+= 2; + base->max_key_block_length= mi_uint2korr(ptr); ptr+= 2; + base->max_key_length= mi_uint2korr(ptr); ptr+= 2; + base->extra_alloc_bytes= mi_uint2korr(ptr); ptr+= 2; + base->extra_alloc_procent= *ptr++; + base->s3_block_size= mi_uint3korr(ptr); ptr+= 3; + ptr+= 13; + return ptr; +} + + +/** + @brief Copy an index block with re-read if checksum doesn't match + + @param dfile data file (.MAD) + @param cap aria capabilities from aria_get_capabilities + @param block block number to read (0, 1, 2, 3...) + @param buffer read data to this buffer + @param bytes_read number of bytes actually read (in case of end of file) + + @return 0 ok + @return HA_ERR_END_OF_FILE ; End of file + @return # error number +*/ + +#define MAX_RETRY 10 + +int aria_read_index(File kfile, ARIA_TABLE_CAPABILITIES *cap, ulonglong block, + uchar *buffer) +{ + MARIA_SHARE share; + int retry= 0; + DBUG_ENTER("aria_read_index"); + + share.keypage_header= cap->keypage_header; + share.block_size= cap->block_size; + do + { + int error; + size_t length; + if ((length= my_pread(kfile, buffer, cap->block_size, + block * cap->block_size, MYF(0))) != cap->block_size) + { + if (length == 0) + DBUG_RETURN(HA_ERR_END_OF_FILE); + if (length == (size_t) -1) + DBUG_RETURN(my_errno ? my_errno : -1); + /* Assume we got a half read; Do a re-read */ + } + /* If not transactional or key file header, there are no checksums */ + if (!cap->online_backup_safe || + block < cap->header_size/ cap->block_size) + DBUG_RETURN(length == cap->block_size ? 
0 : HA_ERR_CRASHED); + + if (length == cap->block_size) + { + length= _ma_get_page_used(&share, buffer); + if (length > cap->block_size - CRC_SIZE) + DBUG_RETURN(HA_ERR_CRASHED); + error= maria_page_crc_check(buffer, block, &share, + MARIA_NO_CRC_NORMAL_PAGE, + (int) length); + if (error != HA_ERR_WRONG_CRC) + DBUG_RETURN(error); + } + my_sleep(100000); /* Sleep 0.1 seconds */ + } while (retry < MAX_RETRY); + DBUG_RETURN(HA_ERR_WRONG_CRC); +} + + +/** + @brief Copy a data block with re-read if checksum doesn't match + + @param dfile data file (.MAD) + @param cap aria capabilities from aria_get_capabilities + @param block block number to read (0, 1, 2, 3...) + @param buffer read data to this buffer + @param bytes_read number of bytes actually read (in case of end of file) + + @return 0 ok + @return HA_ERR_END_OF_FILE ; End of file + @return # error number +*/ + +int aria_read_data(File dfile, ARIA_TABLE_CAPABILITIES *cap, ulonglong block, + uchar *buffer, size_t *bytes_read) +{ + MARIA_SHARE share; + int retry= 0; + DBUG_ENTER("aria_read_data"); + + share.keypage_header= cap->keypage_header; + share.block_size= cap->block_size; + + if (!cap->online_backup_safe) + { + *bytes_read= my_pread(dfile, buffer, cap->block_size, + block * cap->block_size, MY_WME); + if (*bytes_read == 0) + DBUG_RETURN(HA_ERR_END_OF_FILE); + DBUG_RETURN(*bytes_read > 0 ? 0 : (my_errno ? my_errno : -1)); + } + + *bytes_read= cap->block_size; + do + { + int error; + size_t length; + if ((length= my_pread(dfile, buffer, cap->block_size, + block * cap->block_size, MYF(0))) != cap->block_size) + { + if (length == 0) + DBUG_RETURN(HA_ERR_END_OF_FILE); + if (length == (size_t) -1) + DBUG_RETURN(my_errno ? my_errno : -1); + } + + /* If not transactional or key file header, there are no checksums */ + if (!cap->online_backup_safe) + DBUG_RETURN(length == cap->block_size ? 
0 : HA_ERR_CRASHED); + + if (length == cap->block_size) + { + error= maria_page_crc_check(buffer, block, &share, + ((block % cap->bitmap_pages_covered) == 0 ? + MARIA_NO_CRC_BITMAP_PAGE : + MARIA_NO_CRC_NORMAL_PAGE), + share.block_size - CRC_SIZE); + if (error != HA_ERR_WRONG_CRC) + DBUG_RETURN(error); + } + my_sleep(100000); /* Sleep 0.1 seconds */ + } while (retry < MAX_RETRY); + DBUG_RETURN(HA_ERR_WRONG_CRC); +} diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c new file mode 100644 index 00000000..61fe4f9d --- /dev/null +++ b/storage/maria/ma_bitmap.c @@ -0,0 +1,3386 @@ +/* Copyright (C) 2007 Michael Widenius + Copyright (c) 2010, 2013, Monty Program Ab. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Bitmap handling (for records in block) + + The data file starts with a bitmap page, followed by as many data + pages as the bitmap can cover. After this there is a new bitmap page + and more data pages etc. 
+ + The bitmap code assumes there is always an active bitmap page and thus + that there is at least one bitmap page in the file + + Structure of bitmap page: + + Fixed size records (to be implemented later): + + 2 bits are used to indicate: + + 0 Empty + 1 0-75 % full (at least room for 2 records) + 2 75-100 % full (at least room for one record) + 3 100 % full (no more room for records) + + Assuming 8K pages, this will allow us to map: + 8192 (bytes per page) * 4 (pages mapped per byte) * 8192 (page size)= 256M + + (For Maria this will be 7*4 * 8192 = 224K smaller because of LSN) + + Note that for fixed size rows, we can't add more columns without doing + a full reorganization of the table. The user can always force a dynamic + size row format by specifying ROW_FORMAT=dynamic. + + + Dynamic size records: + + 3 bits are used to indicate Bytes free in 8K page + + 0 Empty page 8176 (head or tail) + 1 0-30 % full (at least room for 3 records) 5724 + 2 30-60 % full (at least room for 2 records) 3271 + 3 60-90 % full (at least room for one record) 818 + 4 100 % full (no more room for records) 0 + 5 Tail page, 0-40 % full 4906 + 6 Tail page, 40-80 % full 1636 + 7 Full tail page or full blob page 0 + + Assuming 8K pages, this will allow us to map: + 8192 (bytes per page) * 8 bits/byte / 3 bits/page * 8192 (page size)= 170.7M + + Note that values 1-3 may be adjusted for each individual table based on + 'min record length'. Tail pages are for overflow data which can be of + any size and thus don't have to be adjusted for different tables. + If we add more columns to the table, some of the originally calculated + 'cut off' points may not be optimal, but they shouldn't be 'drastically + wrong'. + + When allocating data from the bitmap, we are trying to do it in a + 'best fit' manner. Blobs and varchar blocks are given out in large + continuous extents to allow fast access to these.
Before allowing a + row to 'flow over' to other blocks, we will compact the page and use + all space on it. If there are many rows in the page, we will ensure + there are *LEFT_TO_GROW_ON_SPLIT* bytes left on the page to allow other + rows to grow. + + The bitmap format allows us to extend the row file in big chunks, if needed. + + When calculating the size for a packed row, we will calculate the following + things separately: + - Row header + null_bits + empty_bits fixed size segments etc. + - Size of all char/varchar fields + - Size of each blob field + + The bitmap handler will get all the above information and return + either one page or a set of pages to put the different parts. + + Bitmaps are read on demand in response to insert/delete/update operations. + The following bitmap pointers will be cached and stored on disk on close: + - Current insert_bitmap; When inserting new data we will first try to + fill this one. + - First bitmap which is not completely full. This is updated when we + free data with an update or delete. + + While flushing out bitmaps, we will cache the status of the bitmap in memory + to avoid having to read a bitmap for insert of new data that will not + be of any use + - Total empty space + - Largest number of continuous pages + + Bitmap ONLY goes to disk in the following scenarios + - The file is closed (and we flush all changes to disk) + - On checkpoint + (Ie: When we do a checkpoint, we have to ensure that all bitmaps are + put on disk even if they are not in the page cache).
+ - When explicitly requested (for example on backup or after recovery, + to simplify things) + + The flow of writing a row is that: + - Mark the bitmap not flushable (_ma_bitmap_flushable(X, 1)) + - Lock the bitmap + - Decide which data pages we will write to + - Mark them full in the bitmap page so that other threads do not try to + use the same data pages as us + - We unlock the bitmap + - Write the data pages + - Lock the bitmap + - Correct the bitmap page with the true final occupation of the data + pages (that is, we marked pages full but when we are done we realize + we didn't fill them) + - Unlock the bitmap. + - Mark the bitmap flushable (_ma_bitmap_flushable(X, -1)) +*/ + +#include "maria_def.h" +#include "ma_blockrec.h" + +#define FULL_HEAD_PAGE 4 +#define FULL_TAIL_PAGE 7 + +const char *bits_to_txt[]= +{ + "empty", "00-30% full", "30-60% full", "60-90% full", "full", + "tail 00-40 % full", "tail 40-80 % full", "tail/blob full" +}; + +#define WRONG_BITMAP_FLUSH 0 /*define to 1 only for provoking bugs*/ + +static my_bool _ma_read_bitmap_page(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page); +static my_bool _ma_bitmap_create_missing(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page); +static void _ma_bitmap_unpin_all(MARIA_SHARE *share); +#ifndef DBUG_OFF +static void _ma_check_bitmap(MARIA_FILE_BITMAP *bitmap); +#else +#define _ma_check_bitmap(A) do { } while(0) +#endif + + +/* Write bitmap page to key cache */ + +static inline my_bool write_changed_bitmap(MARIA_SHARE *share, + MARIA_FILE_BITMAP *bitmap) +{ + my_bool res; + DBUG_ENTER("write_changed_bitmap"); + DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size); + DBUG_ASSERT(bitmap->file.pre_write_hook != 0); + DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable)); + + /* + Mark that a bitmap page has been written to page cache and we have + to flush it during checkpoint. 
+ */ + bitmap->changed_not_flushed= 1; + + if ((bitmap->non_flushable == 0) || WRONG_BITMAP_FLUSH) + { + res= pagecache_write(share->pagecache, + &bitmap->file, bitmap->page, 0, + bitmap->map, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE); + DBUG_ASSERT(!res); + DBUG_RETURN(res); + } + else + { + /* + bitmap->non_flushable means that someone has changed the bitmap, + but it's not yet complete so it can't yet be written to disk. + In this case we write the changed bitmap to the disk cache, + but keep it pinned until the change is completed. The page will + be unpinned later by _ma_bitmap_unpin_all() as soon as non_flushable + is set back to 0. + */ + MARIA_PINNED_PAGE page_link; + DBUG_PRINT("info", ("Writing pinned bitmap page")); + res= pagecache_write(share->pagecache, + &bitmap->file, bitmap->page, 0, + bitmap->map, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE); + page_link.unlock= PAGECACHE_LOCK_LEFT_UNLOCKED; + page_link.changed= 1; + push_dynamic(&bitmap->pinned_pages, (const uchar*) (void*) &page_link); + DBUG_ASSERT(!res); + DBUG_RETURN(res); + } +} + +/* + Initialize bitmap variables in share + + SYNOPSIS + _ma_bitmap_init() + share Share handler + file Data file handler + last_page Pointer to last page (max_file_size) that needs to be + mapped by the bitmap. This is adjusted to bitmap + alignment. + + NOTES + This is called the first time a file is opened. + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_init(MARIA_SHARE *share, File file, + pgcache_page_no_t *last_page) +{ + uint aligned_bit_blocks; + uint max_page_size; + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + uint size= share->block_size; + myf flag= MY_WME | (share->temporary ? 
MY_THREAD_SPECIFIC : 0); + pgcache_page_no_t first_bitmap_with_space; +#ifndef DBUG_OFF + /* We want to have a copy of the bitmap to be able to print differences */ + size*= 2; +#endif + + if (!((bitmap->map= (uchar*) my_malloc(PSI_INSTRUMENT_ME, size, flag))) || + my_init_dynamic_array(PSI_INSTRUMENT_ME, &bitmap->pinned_pages, + sizeof(MARIA_PINNED_PAGE), 1, 1, flag)) + return 1; + + bitmap->share= share; + bitmap->block_size= share->block_size; + bitmap->file.file= file; + _ma_bitmap_set_pagecache_callbacks(&bitmap->file, share); + + /* Size needs to be aligned on 6 */ + aligned_bit_blocks= (share->block_size - PAGE_SUFFIX_SIZE) / 6; + bitmap->max_total_size= bitmap->total_size= aligned_bit_blocks * 6; + /* + In each 6 bytes, we have 6*8/3 = 16 pages covered + The +1 is to add the bitmap page, as this doesn't have to be covered + */ + bitmap->pages_covered= aligned_bit_blocks * 16 + 1; + bitmap->flush_all_requested= bitmap->waiting_for_flush_all_requested= + bitmap->waiting_for_non_flushable= 0; + bitmap->non_flushable= 0; + + /* Update size for bits */ + /* TODO; Make this dependent of the row size */ + max_page_size= share->block_size - PAGE_OVERHEAD_SIZE(share) + DIR_ENTRY_SIZE; + bitmap->sizes[0]= max_page_size; /* Empty page */ + bitmap->sizes[1]= max_page_size - max_page_size * 30 / 100; + bitmap->sizes[2]= max_page_size - max_page_size * 60 / 100; + bitmap->sizes[3]= max_page_size - max_page_size * 90 / 100; + bitmap->sizes[4]= 0; /* Full page */ + bitmap->sizes[5]= max_page_size - max_page_size * 40 / 100; + bitmap->sizes[6]= max_page_size - max_page_size * 80 / 100; + bitmap->sizes[7]= 0; + + /* + If a record size will fit into the smallest empty page, return first + found page in find_head() + */ + if (bitmap->sizes[3] >= share->base.max_pack_length) + bitmap->return_first_match= 1; + + mysql_mutex_init(key_SHARE_BITMAP_lock, + &share->bitmap.bitmap_lock, MY_MUTEX_INIT_SLOW); + mysql_cond_init(key_SHARE_BITMAP_cond, + &share->bitmap.bitmap_cond, 0); + + 
first_bitmap_with_space= share->state.first_bitmap_with_space; + _ma_bitmap_reset_cache(share); + + /* + The bitmap used to map the file are aligned on 6 bytes. We now + calculate the max file size that can be used by the bitmap. This + is needed to get ma_info() give a true file size so that the user can + estimate if there is still space free for records in the file. + */ + { + pgcache_page_no_t last_bitmap_page; + ulong blocks, bytes; + + last_bitmap_page= *last_page - *last_page % bitmap->pages_covered; + blocks= (ulong) (*last_page - last_bitmap_page); + bytes= (blocks * 3) / 8; /* 3 bit per page / 8 bits per byte */ + /* Size needs to be aligned on 6 */ + bytes/= 6; + bytes*= 6; + bitmap->last_bitmap_page= last_bitmap_page; + bitmap->last_total_size= (uint)bytes; + *last_page= ((last_bitmap_page + bytes*8/3)); + } + + /* Restore first_bitmap_with_space if it's resonable */ + if (first_bitmap_with_space <= (share->state.state.data_file_length / + share->block_size)) + share->state.first_bitmap_with_space= first_bitmap_with_space; + + return 0; +} + + +/* + Free data allocated by _ma_bitmap_init + + SYNOPSIS + _ma_bitmap_end() + share Share handler +*/ + +my_bool _ma_bitmap_end(MARIA_SHARE *share) +{ + my_bool res; + +#ifndef DBUG_OFF + if (! share->internal_table) + mysql_mutex_assert_owner(&share->close_lock); +#endif + DBUG_ASSERT(share->bitmap.non_flushable == 0); + DBUG_ASSERT(share->bitmap.flush_all_requested == 0); + DBUG_ASSERT(share->bitmap.waiting_for_non_flushable == 0 && + share->bitmap.waiting_for_flush_all_requested == 0); + DBUG_ASSERT(share->bitmap.pinned_pages.elements == 0); + + res= _ma_bitmap_flush(share); + mysql_mutex_destroy(&share->bitmap.bitmap_lock); + mysql_cond_destroy(&share->bitmap.bitmap_cond); + delete_dynamic(&share->bitmap.pinned_pages); + my_free(share->bitmap.map); + share->bitmap.map= 0; + /* + This is to not get an assert in checkpoint. 
The bitmap will be flushed + at once by _ma_once_end_block_record() as part of the normal flush + of the kfile. + */ + share->bitmap.changed_not_flushed= 0; + return res; +} + +/* + Ensure that we have incremented open count before we try to read/write + a page while we have the bitmap lock. + This is needed to ensure that we don't call _ma_mark_file_changed() as + part of flushing a page to disk, as this locks share->internal_lock + and then mutex lock would happen in the wrong order. +*/ + +static inline void _ma_bitmap_mark_file_changed(MARIA_SHARE *share, + my_bool flush_translog) +{ + /* + It's extremely unlikely that the following test is true as it + only happens once if the table has changed. + */ + if (unlikely(!share->global_changed && + (share->state.changed & STATE_CHANGED))) + { + /* purecov: begin inspected */ + /* unlock mutex as it can't be hold during _ma_mark_file_changed() */ + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + + /* + We have to flush the translog to ensure we have registered that the + table is open. + */ + if (flush_translog && share->now_transactional) + (void) translog_flush(share->state.logrec_file_id); + + _ma_mark_file_changed_now(share); + mysql_mutex_lock(&share->bitmap.bitmap_lock); + /* purecov: end */ + } +} + +/* + Send updated bitmap to the page cache + + SYNOPSIS + _ma_bitmap_flush() + share Share handler + + NOTES + In the future, _ma_bitmap_flush() will be called to flush changes don't + by this thread (ie, checking the changed flag is ok). The reason we + check it again in the mutex is that if someone else did a flush at the + same time, we don't have to do the write. 
+ This is also ok for _ma_scan_init_block_record() which does not want to + miss rows: it cares only for committed rows, that is, rows for which there + was a commit before our transaction started; as commit and transaction's + start are protected by the same LOCK_trn_list mutex, we see memory at + least as new as at other transaction's commit time, so if the committed + rows caused bitmap->changed to be true, we see it; if we see 0 it really + means a flush happened since then. So, it's ok to read without bitmap's + mutex. + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_flush(MARIA_SHARE *share) +{ + my_bool res= 0; + DBUG_ENTER("_ma_bitmap_flush"); + if (share->bitmap.changed) + { + mysql_mutex_lock(&share->bitmap.bitmap_lock); + if (share->bitmap.changed) + { + /* + We have to mark the file changed here, as otherwise the following + write to pagecache may force a page out from this file, which would + cause _ma_mark_file_changed() to be called with bitmaplock hold! + */ + _ma_bitmap_mark_file_changed(share, 1); + res= write_changed_bitmap(share, &share->bitmap); + share->bitmap.changed= 0; + } + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + } + DBUG_RETURN(res); +} + + +/** + Dirty-page filtering criteria for bitmap pages + + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg pages_covered of bitmap +*/ + +static enum pagecache_flush_filter_result +filter_flush_bitmap_pages(enum pagecache_page_type type + __attribute__ ((unused)), + pgcache_page_no_t pageno, + LSN rec_lsn __attribute__ ((unused)), + void *arg) +{ + return ((pageno % (*(ulong*)arg)) == 0); +} + + +/** + Flushes current bitmap page to the pagecache, and then all bitmap pages + from pagecache to the file. Used by Checkpoint. 
+ + @param share Table's share +*/ + +my_bool _ma_bitmap_flush_all(MARIA_SHARE *share) +{ + my_bool res= 0; + uint send_signal= 0; + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + DBUG_ENTER("_ma_bitmap_flush_all"); + +#ifdef EXTRA_DEBUG_BITMAP + { + char buff[160]; + uint len= my_sprintf(buff, + (buff, "bitmap_flush: fd: %d id: %u " + "changed: %d changed_not_flushed: %d " + "flush_all_requested: %d", + share->bitmap.file.file, + share->id, + bitmap->changed, + bitmap->changed_not_flushed, + bitmap->flush_all_requested)); + (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY, + (uchar*) buff, len); + } +#endif + + mysql_mutex_lock(&bitmap->bitmap_lock); + if (!bitmap->changed && !bitmap->changed_not_flushed) + { + mysql_mutex_unlock(&bitmap->bitmap_lock); + DBUG_RETURN(0); + } + + _ma_bitmap_mark_file_changed(share, 0); + + /* + The following should be true as it was tested above. We have to test + this again as _ma_bitmap_mark_file_changed() did temporarly release + the bitmap mutex. + */ + if (bitmap->changed || bitmap->changed_not_flushed) + { + bitmap->flush_all_requested++; + bitmap->waiting_for_non_flushable++; +#if !WRONG_BITMAP_FLUSH + while (bitmap->non_flushable > 0) + { + DBUG_PRINT("info", ("waiting for bitmap to be flushable")); + mysql_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock); + } +#endif + bitmap->waiting_for_non_flushable--; +#ifdef EXTRA_DEBUG_BITMAP + { + char tmp[MAX_BITMAP_INFO_LENGTH]; + size_t len; + len= _ma_get_bitmap_description(bitmap, bitmap->map, bitmap->page, tmp); + (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY, + (uchar*) tmp, len); + } +#endif + + DBUG_ASSERT(bitmap->flush_all_requested == 1); + /* + Bitmap is in a flushable state: its contents in memory are reflected by + log records (complete REDO-UNDO groups) and all bitmap pages are + unpinned. We keep the mutex to preserve this situation, and flush to the + file. 
+ */ + if (bitmap->changed) + { + bitmap->changed= FALSE; + res= write_changed_bitmap(share, bitmap); + } + /* + We do NOT use FLUSH_KEEP_LAZY because we must be sure that bitmap + pages have been flushed. That's a condition of correctness of + Recovery: data pages may have been all flushed, if we write the + checkpoint record Recovery will start from after their REDOs. If + bitmap page was not flushed, as the REDOs about it will be skipped, it + will wrongly not be recovered. If bitmap pages had a rec_lsn it would + be different. + There should be no pinned pages as bitmap->non_flushable==0. + */ + if (flush_pagecache_blocks_with_filter(share->pagecache, + &bitmap->file, FLUSH_KEEP, + filter_flush_bitmap_pages, + &bitmap->pages_covered) & + PCFLUSH_PINNED_AND_ERROR) + res= TRUE; + bitmap->changed_not_flushed= FALSE; + bitmap->flush_all_requested--; + /* + Some well-behaved threads may be waiting for flush_all_requested to + become false, wake them up. + */ + DBUG_PRINT("info", ("bitmap flusher waking up others")); + send_signal= (bitmap->waiting_for_flush_all_requested | + bitmap->waiting_for_non_flushable); + } + mysql_mutex_unlock(&bitmap->bitmap_lock); + if (send_signal) + mysql_cond_broadcast(&bitmap->bitmap_cond); + DBUG_RETURN(res); +} + + +/** + @brief Lock bitmap from being used by another thread + + @fn _ma_bitmap_lock() + @param share Table's share + + @notes + This is a temporary solution for allowing someone to delete an inserted + duplicate-key row while someone else is doing concurrent inserts. + This is ok for now as duplicate key errors are not that common. + + In the future we will add locks for row-pages to ensure two threads doesn't + work at the same time on the same page. 
+*/ + +void _ma_bitmap_lock(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + DBUG_ENTER("_ma_bitmap_lock"); + + if (!share->now_transactional) + DBUG_VOID_RETURN; + + mysql_mutex_lock(&bitmap->bitmap_lock); + bitmap->flush_all_requested++; + bitmap->waiting_for_non_flushable++; + while (bitmap->non_flushable) + { + DBUG_PRINT("info", ("waiting for bitmap to be flushable")); + mysql_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock); + } + bitmap->waiting_for_non_flushable--; + /* + Ensure that _ma_bitmap_flush_all() and _ma_bitmap_lock() are blocked. + ma_bitmap_flushable() is blocked thanks to 'flush_all_requested'. + */ + bitmap->non_flushable= 1; + mysql_mutex_unlock(&bitmap->bitmap_lock); + DBUG_VOID_RETURN; +} + +/** + @brief Unlock bitmap after _ma_bitmap_lock() + + @fn _ma_bitmap_unlock() + @param share Table's share +*/ + +void _ma_bitmap_unlock(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + uint send_signal; + DBUG_ENTER("_ma_bitmap_unlock"); + + if (!share->now_transactional) + DBUG_VOID_RETURN; + DBUG_ASSERT(bitmap->flush_all_requested > 0 && bitmap->non_flushable == 1); + + mysql_mutex_lock(&bitmap->bitmap_lock); + bitmap->non_flushable= 0; + _ma_bitmap_unpin_all(share); + send_signal= bitmap->waiting_for_non_flushable; + if (!--bitmap->flush_all_requested) + send_signal|= bitmap->waiting_for_flush_all_requested; + mysql_mutex_unlock(&bitmap->bitmap_lock); + if (send_signal) + mysql_cond_broadcast(&bitmap->bitmap_cond); + DBUG_VOID_RETURN; +} + + +/** + @brief Unpin all pinned bitmap pages + + @param share Table's share + + @return Operation status + @retval 0 ok + + @note This unpins pages pinned by other threads. 
+*/ + +static void _ma_bitmap_unpin_all(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*) + dynamic_array_ptr(&bitmap->pinned_pages, 0)); + MARIA_PINNED_PAGE *pinned_page= page_link + bitmap->pinned_pages.elements; + DBUG_ENTER("_ma_bitmap_unpin_all"); + DBUG_PRINT("info", ("pinned: %zu", bitmap->pinned_pages.elements)); + while (pinned_page-- != page_link) + pagecache_unlock_by_link(share->pagecache, pinned_page->link, + pinned_page->unlock, PAGECACHE_UNPIN, + LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, FALSE, TRUE); + bitmap->pinned_pages.elements= 0; + DBUG_VOID_RETURN; +} + + +/* + Intialize bitmap in memory to a zero bitmap + + SYNOPSIS + _ma_bitmap_delete_all() + share Share handler + + NOTES + This is called on maria_delete_all_rows (truncate data file). +*/ + +void _ma_bitmap_delete_all(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + DBUG_ENTER("_ma_bitmap_delete_all"); + if (bitmap->map) /* Not in create */ + { + bzero(bitmap->map, bitmap->block_size); + bitmap->changed= 1; + bitmap->page= 0; + bitmap->used_size= bitmap->full_tail_size= bitmap->full_head_size= 0; + bitmap->total_size= bitmap->max_total_size; + } + DBUG_VOID_RETURN; +} + + +/** + @brief Reset bitmap caches + + @fn _ma_bitmap_reset_cache() + @param share Maria share + + @notes + This is called after we have swapped file descriptors and we want + bitmap to forget all cached information. + It's also called directly after we have opened a file. +*/ + +void _ma_bitmap_reset_cache(MARIA_SHARE *share) +{ + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + + if (bitmap->map) /* If using bitmap */ + { + /* Forget changes in current bitmap page */ + bitmap->changed= 0; + + /* + We can't read a page yet, as in some case we don't have an active + page cache yet. + Pretend we have a dummy, full and not changed bitmap page in memory. 
+ + We set bitmap->page to a value so that if we use it in + move_to_next_bitmap() it will point to page 0. + (This can only happen if writing to a bitmap page fails) + */ + bitmap->page= ((pgcache_page_no_t) 0) - bitmap->pages_covered; + bitmap->used_size= bitmap->total_size= bitmap->max_total_size; + bitmap->full_head_size= bitmap->full_tail_size= bitmap->max_total_size; + bfill(bitmap->map, share->block_size, 255); +#ifndef DBUG_OFF + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); +#endif + + /* Start scanning for free space from start of file */ + share->state.first_bitmap_with_space = 0; + } +} + + +/* + Return bitmap pattern for the smallest head block that can hold 'size' + + SYNOPSIS + size_to_head_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0-3 For a description of the bitmap sizes, see the header +*/ + +static uint size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size <= bitmap->sizes[3]) + return 3; + if (size <= bitmap->sizes[2]) + return 2; + if (size <= bitmap->sizes[1]) + return 1; + DBUG_ASSERT(size <= bitmap->sizes[0]); + return 0; +} + + +/* + Return bitmap pattern for head block where there is size bytes free + + SYNOPSIS + _ma_free_size_to_head_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0-4 (Possible bitmap patterns for head block) +*/ + +uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size < bitmap->sizes[3]) + return 4; + if (size < bitmap->sizes[2]) + return 3; + if (size < bitmap->sizes[1]) + return 2; + return (size < bitmap->sizes[0]) ? 
1 : 0; +} + + +/* + Return bitmap pattern for the smallest tail block that can hold 'size' + + SYNOPSIS + size_to_tail_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0, 5 or 6 For a description of the bitmap sizes, see the header +*/ + +static uint size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size <= bitmap->sizes[6]) + return 6; + if (size <= bitmap->sizes[5]) + return 5; + DBUG_ASSERT(size <= bitmap->sizes[0]); + return 0; +} + + +/* + Return bitmap pattern for tail block where there is size bytes free + + SYNOPSIS + free_size_to_tail_pattern() + bitmap Bitmap + size Requested size + + RETURN + 0, 5, 6, 7 For a description of the bitmap sizes, see the header +*/ + +static uint free_size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size) +{ + if (size >= bitmap->sizes[0]) + return 0; /* Revert to empty page */ + if (size < bitmap->sizes[6]) + return 7; + if (size < bitmap->sizes[5]) + return 6; + return 5; +} + + +/* + Return size guranteed to be available on a page + + SYNOPSIS + pattern_to_head_size() + bitmap Bitmap + pattern Pattern (0-7) + + RETURN + 0 - block_size +*/ + +static inline uint pattern_to_size(MARIA_FILE_BITMAP *bitmap, uint pattern) +{ + DBUG_ASSERT(pattern <= 7); + return bitmap->sizes[pattern]; +} + + +/* + Print bitmap for debugging + + SYNOPSIS + _ma_print_bitmap_changes() + bitmap Bitmap to print + + IMPLEMENTATION + Prints all changed bits since last call to _ma_print_bitmap(). + This is done by having a copy of the last bitmap in + bitmap->map+bitmap->block_size. 
+*/ + +#ifndef DBUG_OFF + +static void _ma_print_bitmap_changes(MARIA_FILE_BITMAP *bitmap) +{ + uchar *pos, *end, *org_pos; + ulong page; + DBUG_ENTER("_ma_print_bitmap_changes"); + + end= bitmap->map + bitmap->used_size; + DBUG_LOCK_FILE; + fprintf(DBUG_FILE,"\nBitmap page changes at page: %lu bitmap: %p\n", + (ulong) bitmap->page, bitmap->map); + + page= (ulong) bitmap->page+1; + for (pos= bitmap->map, org_pos= bitmap->map + bitmap->block_size ; + pos < end ; + pos+= 6, org_pos+= 6) + { + ulonglong bits= uint6korr(pos); /* 6 bytes = 6*8/3= 16 patterns */ + ulonglong org_bits= uint6korr(org_pos); + uint i; + + /* + Test if there is any changes in the next 16 bitmaps (to not have to + loop through all bits if we know they are the same) + */ + if (bits != org_bits) + { + for (i= 0; i < 16 ; i++, bits>>= 3, org_bits>>= 3) + { + if ((bits & 7) != (org_bits & 7)) + fprintf(DBUG_FILE, "Page: %8lu %s -> %s\n", page+i, + bits_to_txt[org_bits & 7], bits_to_txt[bits & 7]); + } + } + page+= 16; + } + fputc('\n', DBUG_FILE); + DBUG_UNLOCK_FILE; + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); + DBUG_VOID_RETURN; +} + + +/* Print content of bitmap for debugging */ + +void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data, + pgcache_page_no_t page) +{ + uchar *pos, *end; + char llbuff[22]; + + DBUG_LOCK_FILE; + fprintf(DBUG_FILE,"\nDump of bitmap page at %s\n", llstr(page, llbuff)); + + page++; /* Skip bitmap page */ + for (pos= data, end= pos + bitmap->max_total_size; + pos < end ; + pos+= 6) + { + ulonglong bits= uint6korr(pos); /* 6 bytes = 6*8/3= 16 patterns */ + + /* + Test if there is any changes in the next 16 bitmaps (to not have to + loop through all bits if we know they are the same) + */ + if (bits) + { + uint i; + for (i= 0; i < 16 ; i++, bits>>= 3) + { + if (bits & 7) + fprintf(DBUG_FILE, "Page: %8s %s\n", llstr(page+i, llbuff), + bits_to_txt[bits & 7]); + } + } + page+= 16; + } + fputc('\n', DBUG_FILE); + DBUG_UNLOCK_FILE; +} + 
+#endif /* DBUG_OFF */ + + +/* + Return content of bitmap as a printable string +*/ + +size_t _ma_get_bitmap_description(MARIA_FILE_BITMAP *bitmap, + uchar *bitmap_data, + pgcache_page_no_t page, + char *out) +{ + uchar *pos, *end; + size_t count=0, dot_printed= 0, len; + char buff[80], last[80]; + + page++; + last[0]=0; + for (pos= bitmap_data, end= pos+ bitmap->used_size ; pos < end ; pos+= 6) + { + ulonglong bits= uint6korr(pos); /* 6 bytes = 6*8/3= 16 patterns */ + uint i; + + for (i= 0; i < 16 ; i++, bits>>= 3) + { + if (count > 60) + { + if (memcmp(buff, last, count)) + { + memcpy(last, buff, count); + len= sprintf(out, "%8lu: ", (ulong) (page - count)); + memcpy(out+len, buff, count); + out+= len + count + 1; + out[-1]= '\n'; + dot_printed= 0; + } + else if (!(dot_printed++)) + { + out= strmov(out, "...\n"); + } + count= 0; + } + buff[count++]= '0' + (uint) (bits & 7); + page++; + } + } + len= sprintf(out, "%8lu: ", (ulong) (page - count)); + memcpy(out+len, buff, count); + out[len + count]= '\n'; + out[len + count + 1]= 0; + return len + count + 1; +} + + +/* + Adjust bitmap->total_size to not go over max_data_file_size +*/ + +static void adjust_total_size(MARIA_HA *info, pgcache_page_no_t page) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + + if (page < bitmap->last_bitmap_page) + bitmap->total_size= bitmap->max_total_size; /* Use all bits in bitmap */ + else + bitmap->total_size= bitmap->last_total_size; +} + +/*************************************************************************** + Reading & writing bitmap pages +***************************************************************************/ + +/* + Read a given bitmap page + + SYNOPSIS + _ma_read_bitmap_page() + info Maria handler + bitmap Bitmap handler + page Page to read + + NOTE + We don't always have share->bitmap.bitmap_lock here + (when called from_ma_check_bitmap_data() for example). 
+ + RETURN + 0 ok + 1 error (Error writing old bitmap or reading bitmap page) +*/ + +static my_bool _ma_read_bitmap_page(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page) +{ + MARIA_SHARE *share= info->s; + my_bool res; + DBUG_ENTER("_ma_read_bitmap_page"); + DBUG_PRINT("enter", ("page: %lld data_file_length: %lld", + (longlong) page, + (longlong) share->state.state.data_file_length)); + DBUG_ASSERT(page % bitmap->pages_covered == 0); + DBUG_ASSERT(!bitmap->changed); + + bitmap->page= page; + if ((page + 1) * bitmap->block_size > share->state.state.data_file_length) + { + /* Inexistent or half-created page */ + res= _ma_bitmap_create_missing(info, bitmap, page); + if (!res) + adjust_total_size(info, page); + DBUG_RETURN(res); + } + + adjust_total_size(info, page); + bitmap->full_head_size= bitmap->full_tail_size= 0; + DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size); + res= pagecache_read(share->pagecache, + &bitmap->file, page, 0, + bitmap->map, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == NULL; + + if (!res) + { + /* Calculate used_size */ + const uchar *data, *end= bitmap->map; + for (data= bitmap->map + bitmap->total_size; --data >= end && *data == 0; ) + {} + bitmap->used_size= (uint) ((data + 1) - end); + DBUG_ASSERT(bitmap->used_size <= bitmap->total_size); + } + else + { + _ma_set_fatal_error(info, my_errno); + } + /* + We can't check maria_bitmap_marker here as if the bitmap page + previously had a true checksum and the user switched mode to not checksum + this may have any value, except maria_normal_page_marker. + + Using maria_normal_page_marker gives us a protection against bugs + when running without any checksums. 
+ */ + +#ifndef DBUG_OFF + if (!res) + { + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); + _ma_check_bitmap(bitmap); + } +#endif + DBUG_RETURN(res); +} + + +/* + Change to another bitmap page + + SYNOPSIS + _ma_change_bitmap_page() + info Maria handler + bitmap Bitmap handler + page Bitmap page to read + + NOTES + If old bitmap was changed, write it out before reading new one + We return empty bitmap if page is outside of file size + + RETURN + 0 ok + 1 error (Error writing old bitmap or reading bitmap page) +*/ + +static my_bool _ma_change_bitmap_page(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page) +{ + DBUG_ENTER("_ma_change_bitmap_page"); + + _ma_check_bitmap(bitmap); + + /* + We have to mark the file changed here, as otherwise the following + read/write to pagecache may force a page out from this file, which would + cause _ma_mark_file_changed() to be called with bitmaplock hold! + */ + _ma_bitmap_mark_file_changed(info->s, 1); + + if (bitmap->changed) + { + if (write_changed_bitmap(info->s, bitmap)) + DBUG_RETURN(1); + bitmap->changed= 0; + } + DBUG_RETURN(_ma_read_bitmap_page(info, bitmap, page)); +} + + +/* + Read next suitable bitmap + + SYNOPSIS + move_to_next_bitmap() + bitmap Bitmap handle + + NOTES + The found bitmap may be full, so calling function may need to call this + repeatedly until it finds enough space. 
+ + TODO + Add cache of bitmaps to not read something that is not usable + + RETURN + 0 ok + 1 error (either couldn't save old bitmap or read new one) +*/ + +static my_bool move_to_next_bitmap(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap) +{ + pgcache_page_no_t page= bitmap->page; + MARIA_STATE_INFO *state= &info->s->state; + DBUG_ENTER("move_to_next_bitmap"); + + if (state->first_bitmap_with_space != ~(pgcache_page_no_t) 0 && + state->first_bitmap_with_space != page) + { + page= state->first_bitmap_with_space; + state->first_bitmap_with_space= ~(pgcache_page_no_t) 0; + DBUG_ASSERT(page % bitmap->pages_covered == 0); + } + else + { + page+= bitmap->pages_covered; + DBUG_ASSERT(page % bitmap->pages_covered == 0); + } + DBUG_RETURN(_ma_change_bitmap_page(info, bitmap, page)); +} + + +/**************************************************************************** + Allocate data in bitmaps +****************************************************************************/ + +/* + Store data in 'block' and mark the place used in the bitmap + + SYNOPSIS + fill_block() + bitmap Bitmap handle + block Store data about what we found + best_data Pointer to best 6 uchar aligned area in bitmap->map + best_pos Which bit in *best_data the area starts + 0 = first bit pattern, 1 second bit pattern etc + best_bits The original value of the bits at best_pos + fill_pattern Bitmap pattern to store in best_data[best_pos] + + NOTES + We mark all pages to be 'TAIL's, which means that + block->page_count is really a row position inside the page. 
+*/ + +static void fill_block(MARIA_FILE_BITMAP *bitmap, + MARIA_BITMAP_BLOCK *block, + uchar *best_data, uint best_pos, uint best_bits, + uint fill_pattern) +{ + uint page, offset, tmp; + uchar *data; + DBUG_ENTER("fill_block"); + + /* For each 6 bytes we have 6*8/3= 16 patterns */ + page= ((uint) (best_data - bitmap->map)) / 6 * 16 + best_pos; + DBUG_ASSERT(page + 1 < bitmap->pages_covered); + block->page= bitmap->page + 1 + page; + block->page_count= TAIL_PAGE_COUNT_MARKER; + block->empty_space= pattern_to_size(bitmap, best_bits); + block->sub_blocks= 0; + block->org_bitmap_value= best_bits; + block->used= BLOCKUSED_TAIL; /* See _ma_bitmap_release_unused() */ + + /* + Mark place used by reading/writing 2 bytes at a time to handle + bitmaps in overlapping bytes + */ + best_pos*= 3; + data= best_data+ best_pos / 8; + offset= best_pos & 7; + tmp= uint2korr(data); + + /* we turn off the 3 bits and replace them with fill_pattern */ + tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset); + int2store(data, tmp); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_VOID_RETURN; +} + + +/* + Allocate data for head block + + SYNOPSIS + allocate_head() + bitmap bitmap + size Size of data region we need to store + block Store found information here + + IMPLEMENTATION + Find the best-fit page to put a region of 'size' + This is defined as the first page of the set of pages + with the smallest free space that can hold 'size'. 
+ + NOTES + Updates bitmap->full_head_size while scanning data + + RETURN + 0 ok (block is updated) + 1 error (no space in bitmap; block is not touched) +*/ + + +static my_bool allocate_head(MARIA_FILE_BITMAP *bitmap, uint size, + MARIA_BITMAP_BLOCK *block) +{ + uint min_bits= size_to_head_pattern(bitmap, size); + uchar *data, *end; + uchar *best_data= 0; + uint best_bits= (uint) -1, UNINIT_VAR(best_pos); + my_bool first_pattern= 0; /* if doing insert_order */ + my_bool first_found= 1; + MARIA_SHARE *share= bitmap->share; + my_bool insert_order= + MY_TEST(share->base.extra_options & MA_EXTRA_OPTIONS_INSERT_ORDER); + DBUG_ENTER("allocate_head"); + + DBUG_ASSERT(size <= FULL_PAGE_SIZE(share)); + + end= bitmap->map + bitmap->used_size; + if (insert_order && bitmap->page == share->last_insert_bitmap) + { + uint last_insert_page= share->last_insert_page; + uint byte= 6 * (last_insert_page / 16); + first_pattern= last_insert_page % 16; + data= bitmap->map+byte; + first_found= 0; /* Don't update full_head_size */ + DBUG_ASSERT(data <= end); + } + else + data= bitmap->map + (bitmap->full_head_size/6)*6; + + for (; data < end; data+= 6, first_pattern= 0) + { + ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */ + uint i; + + /* + Skip common patterns + We can skip empty pages (if we already found a match) or + anything matching the following pattern as this will be either + a full page or a tail page + */ + if ((!bits && best_data) || + ((bits & 04444444444444444LL) == 04444444444444444LL)) + continue; + + for (i= first_pattern, bits >>= (3 * first_pattern); i < 16 ; + i++, bits >>= 3) + { + uint pattern= (uint) (bits & 7); + + if (pattern <= 3) /* Room for more data */ + { + if (first_found) + { + first_found= 0; + bitmap->full_head_size= (uint)(data - bitmap->map); + } + } + if (pattern <= min_bits) + { + /* There is enough space here, check if we have found better */ + if ((int) pattern > (int) best_bits) + { + /* + There is more than enough space here 
and it's better than what + we have found so far. Remember it, as we will choose it if we + don't find anything in this bitmap page. + */ + best_bits= pattern; + best_data= data; + best_pos= i; + if (pattern == min_bits || bitmap->return_first_match) + goto found; /* Best possible match */ + } + } + } + } + if (!best_data) /* Found no place */ + { + if (data >= bitmap->map + bitmap->total_size) + DBUG_RETURN(1); /* No space in bitmap */ + DBUG_ASSERT(uint6korr(data) == 0); + /* Allocate data at end of bitmap */ + bitmap->used_size= (uint) (data - bitmap->map) + 6; + best_data= data; + best_pos= best_bits= 0; + } + else + { + /* + This is not stricly needed as used_size should be alligned on 6, + but for easier debugging lets try to keep it more accurate + */ + uint position= (uint) (best_data - bitmap->map) + 6; + set_if_bigger(bitmap->used_size, position); + } + DBUG_ASSERT(bitmap->used_size <= bitmap->total_size); + +found: + if (insert_order) + { + share->last_insert_page= + ((uint) (best_data - bitmap->map)) / 6 * 16 + best_pos; + share->last_insert_bitmap= bitmap->page; + } + fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_HEAD_PAGE); + DBUG_RETURN(0); +} + + +/* + Allocate data for tail block + + SYNOPSIS + allocate_tail() + bitmap bitmap + size Size of block we need to find + block Store found information here + + RETURN + 0 ok (block is updated) + 1 error (no space in bitmap; block is not touched) +*/ + + +static my_bool allocate_tail(MARIA_FILE_BITMAP *bitmap, uint size, + MARIA_BITMAP_BLOCK *block) +{ + uint min_bits= size_to_tail_pattern(bitmap, size); + uchar *data, *end, *best_data= 0; + my_bool first_found= 1; + uint best_bits= (uint) -1, UNINIT_VAR(best_pos); + DBUG_ENTER("allocate_tail"); + DBUG_PRINT("enter", ("size: %u", size)); + + data= bitmap->map + (bitmap->full_tail_size/6)*6; + end= bitmap->map + bitmap->used_size; + + /* + We have to add DIR_ENTRY_SIZE here as this is not part of the data size + See call to allocate_tail() in 
find_tail(). + */ + DBUG_ASSERT(size <= MAX_TAIL_SIZE(bitmap->block_size) + DIR_ENTRY_SIZE); + + for (; data < end; data += 6) + { + ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */ + uint i; + + /* + Skip common patterns + We can skip empty pages (if we already found a match) or + the following patterns: 1-4 (head pages, not suitable for tail) or + 7 (full tail page). See 'Dynamic size records' comment at start of file. + + At the moment we only skip full head and tail pages (ie, all bits are + set) as this is easy to detect with one simple test and is a + quite common case if we have blobs. + */ + + if ((!bits && best_data) || bits == 0xffffffffffffLL || + bits == 04444444444444444LL) + continue; + for (i= 0; i < 16; i++, bits >>= 3) + { + uint pattern= (uint) (bits & 7); + + if (pattern == 0 || + (pattern > FULL_HEAD_PAGE && pattern < FULL_TAIL_PAGE)) + { + /* There is room for tail data */ + if (first_found) + { + first_found= 0; + bitmap->full_tail_size= (uint)(data - bitmap->map); + } + } + + if (pattern <= min_bits && (!pattern || pattern > FULL_HEAD_PAGE)) + { + if ((int) pattern > (int) best_bits) + { + best_bits= pattern; + best_data= data; + best_pos= i; + if (pattern == min_bits) + goto found; /* Can't be better */ + } + } + } + } + if (!best_data) + { + if (data >= bitmap->map + bitmap->total_size) + DBUG_RETURN(1); + DBUG_ASSERT(uint6korr(data) == 0); + /* Allocate data at end of bitmap */ + best_data= data; + bitmap->used_size= (uint) (data - bitmap->map) + 6; + DBUG_ASSERT(bitmap->used_size <= bitmap->total_size); + best_pos= best_bits= 0; + } + +found: + fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_TAIL_PAGE); + DBUG_RETURN(0); +} + + +/* + Allocate data for full blocks + + SYNOPSIS + allocate_full_pages() + bitmap bitmap + pages_needed Total size in pages (bitmap->total_size) we would like to have + block Store found information here + full_page 1 if we are not allowed to split extent + + IMPLEMENTATION + We 
will return the smallest area >= size. If there is no such + block, we will return the biggest area that satisfies + area_size >= MY_MIN(BLOB_SEGMENT_MIN_SIZE*full_page_size, size) + + To speed up searches, we will only consider areas that has at least 16 free + pages starting on an even boundary. When finding such an area, we will + extend it with all previous and following free pages. This will ensure + we don't get holes between areas + + RETURN + # Blocks used + 0 error (no space in bitmap; block is not touched) +*/ + +static ulong allocate_full_pages(MARIA_FILE_BITMAP *bitmap, + ulong pages_needed, + MARIA_BITMAP_BLOCK *block, my_bool full_page) +{ + uchar *data, *data_end, *page_end; + uchar *best_data= 0; + uint min_size; + uint best_area_size, UNINIT_VAR(best_prefix_area_size); + uint page, size; + ulonglong UNINIT_VAR(best_prefix_bits); + DBUG_ENTER("allocate_full_pages"); + DBUG_PRINT("enter", ("pages_needed: %lu", pages_needed)); + + min_size= pages_needed; + if (!full_page && min_size > BLOB_SEGMENT_MIN_SIZE) + min_size= BLOB_SEGMENT_MIN_SIZE; + best_area_size= ~(uint) 0; + + data= bitmap->map + (bitmap->full_head_size/6)*6; + data_end= bitmap->map + bitmap->used_size; + page_end= bitmap->map + bitmap->total_size; + + for (; data < page_end; data+= 6) + { + ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */ + uchar *data_start; + ulonglong prefix_bits= 0; + uint area_size, prefix_area_size, suffix_area_size; + + /* Find area with at least 16 free pages */ + if (bits) + continue; + data_start= data; + /* Find size of area */ + for (data+=6 ; data < data_end ; data+= 6) + { + if ((bits= uint6korr(data))) + break; + } + /* + Check if we are end of bitmap. 
In this case we know that + the rest of the bitmap is usable + */ + if (data >= data_end) + data= page_end; + area_size= (uint) (data - data_start) / 6 * 16; + if (area_size >= best_area_size) + continue; + prefix_area_size= suffix_area_size= 0; + if (!bits) + { + /* + End of page; All the rest of the bits on page are part of area + This is needed because bitmap->used_size only covers the set bits + in the bitmap. + */ + area_size+= (uint) (page_end - data) / 6 * 16; + if (area_size >= best_area_size) + break; + data= page_end; + } + else + { + /* Add bits at end of page */ + for (; !(bits & 7); bits >>= 3) + suffix_area_size++; + area_size+= suffix_area_size; + } + if (data_start != bitmap->map) + { + /* Add bits before page */ + bits= prefix_bits= uint6korr(data_start - 6); + DBUG_ASSERT(bits != 0); + /* 111 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 */ + if (!(bits & 07000000000000000LL)) + { + data_start-= 6; + do + { + prefix_area_size++; + bits<<= 3; + } while (!(bits & 07000000000000000LL)); + area_size+= prefix_area_size; + /* Calculate offset to page from data_start */ + prefix_area_size= 16 - prefix_area_size; + } + } + if (area_size >= min_size && area_size <= best_area_size) + { + best_data= data_start; + best_area_size= area_size; + best_prefix_bits= prefix_bits; + best_prefix_area_size= prefix_area_size; + + /* Prefer to put data in biggest possible area */ + if (area_size <= pages_needed) + min_size= area_size; + else + min_size= pages_needed; + } + } + if (!best_data) + DBUG_RETURN(0); /* No room on page */ + + /* + Now allocate MY_MIN(pages_needed, area_size), starting from + best_start + best_prefix_area_size + */ + if (best_area_size > pages_needed) + best_area_size= pages_needed; + + /* For each 6 bytes we have 6*8/3= 16 patterns */ + page= ((uint) (best_data - bitmap->map) * 8) / 3 + best_prefix_area_size; + block->page= bitmap->page + 1 + page; + block->page_count= best_area_size; + block->empty_space= 0; + block->sub_blocks= 
0; + block->org_bitmap_value= 0; + block->used= 0; + DBUG_ASSERT(page + best_area_size < bitmap->pages_covered); + DBUG_PRINT("info", ("page: %lu page_count: %u", + (ulong) block->page, block->page_count)); + + if (best_prefix_area_size) + { + ulonglong tmp; + /* Convert offset back to bits */ + best_prefix_area_size= 16 - best_prefix_area_size; + if (best_area_size < best_prefix_area_size) + { + tmp= (1LL << best_area_size*3) - 1; + best_area_size= best_prefix_area_size; /* for easy end test */ + } + else + tmp= (1LL << best_prefix_area_size*3) - 1; + tmp<<= (16 - best_prefix_area_size) * 3; + DBUG_ASSERT((best_prefix_bits & tmp) == 0); + best_prefix_bits|= tmp; + int6store(best_data, best_prefix_bits); + if (!(best_area_size-= best_prefix_area_size)) + goto end; + best_data+= 6; + } + best_area_size*= 3; /* Bits to set */ + size= best_area_size/8; /* Bytes to set */ + bfill(best_data, size, 255); + best_data+= size; + if ((best_area_size-= size * 8)) + { + /* fill last uchar */ + *best_data|= (uchar) ((1 << best_area_size) -1); + best_data++; + } + if (data_end < best_data) + { + bitmap->used_size= (uint) (best_data - bitmap->map); + DBUG_ASSERT(bitmap->used_size <= bitmap->total_size); + } +end: + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_RETURN(block->page_count); +} + + +/**************************************************************************** + Find right bitmaps where to store data +****************************************************************************/ + +/* + Find right bitmap and position for head block + + SYNOPSIS + find_head() + info Maria handler + length Size of data region we need store + position Position in bitmap_blocks where to store the + information for the head block. 
+ + RETURN + 0 ok + 1 error +*/ + +static my_bool find_head(MARIA_HA *info, uint length, uint position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + /* + There is always place for the head block in bitmap_blocks as these are + preallocated at _ma_init_block_record(). + */ + block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *); + + if (info->s->base.extra_options & MA_EXTRA_OPTIONS_INSERT_ORDER) + { + if (bitmap->page != info->s->last_insert_bitmap && + _ma_change_bitmap_page(info, bitmap, + info->s->last_insert_bitmap)) + return 1; + /* Don't allocate any blocks from earlier pages */ + info->s->state.first_bitmap_with_space= info->s->last_insert_bitmap; + } + + /* + We need to have DIRENTRY_SIZE here to take into account that we may + need an extra directory entry for the row + */ + while (allocate_head(bitmap, length + DIR_ENTRY_SIZE, block)) + if (move_to_next_bitmap(info, bitmap)) + return 1; + return 0; +} + + +/* + Find right bitmap and position for tail + + SYNOPSIS + find_tail() + info Maria handler + length Size of data region we need store + position Position in bitmap_blocks where to store the + information for the head block. 
+ + RETURN + 0 ok + 1 error +*/ + +static my_bool find_tail(MARIA_HA *info, uint length, size_t position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + DBUG_ENTER("find_tail"); + DBUG_ASSERT(length <= info->s->block_size - PAGE_OVERHEAD_SIZE(info->s)); + + /* Needed, as there is no error checking in dynamic_element */ + if (allocate_dynamic(&info->bitmap_blocks, position)) + DBUG_RETURN(1); + block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *); + + /* + We have to add DIR_ENTRY_SIZE to ensure we have space for the tail and + it's directroy entry on the page + */ + while (allocate_tail(bitmap, length + DIR_ENTRY_SIZE, block)) + if (move_to_next_bitmap(info, bitmap)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/* + Find right bitmap and position for full blocks in one extent + + SYNOPSIS + find_mid() + info Maria handler. + pages How many pages to allocate. + position Position in bitmap_blocks where to store the + information for the head block. + NOTES + This is used to allocate the main extent after the 'head' block + (Ie, the middle part of the head-middle-tail entry) + + RETURN + 0 ok + 1 error +*/ + +static my_bool find_mid(MARIA_HA *info, ulong pages, uint position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + block= dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *); + + while (!allocate_full_pages(bitmap, pages, block, 1)) + { + if (move_to_next_bitmap(info, bitmap)) + return 1; + } + return 0; +} + + +/* + Find right bitmap and position for putting a blob + + SYNOPSIS + find_blob() + info Maria handler. 
+ length Length of the blob + + NOTES + The extents are stored last in info->bitmap_blocks + + IMPLEMENTATION + Allocate all full pages for the block + optionally one tail + + RETURN + 0 ok + 1 error +*/ + +static my_bool find_blob(MARIA_HA *info, ulong length) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + uint full_page_size= FULL_PAGE_SIZE(info->s); + ulong pages; + uint rest_length, used; + size_t UNINIT_VAR(first_block_pos); + MARIA_BITMAP_BLOCK *first_block= 0; + DBUG_ENTER("find_blob"); + DBUG_PRINT("enter", ("length: %lu", length)); + + pages= length / full_page_size; + rest_length= (uint) (length - pages * full_page_size); + if (rest_length >= MAX_TAIL_SIZE(info->s->block_size)) + { + pages++; + rest_length= 0; + } + + first_block_pos= info->bitmap_blocks.elements; + if (pages) + { + MARIA_BITMAP_BLOCK *block; + if (allocate_dynamic(&info->bitmap_blocks, + info->bitmap_blocks.elements + + pages / BLOB_SEGMENT_MIN_SIZE + 2)) + DBUG_RETURN(1); + block= dynamic_element(&info->bitmap_blocks, info->bitmap_blocks.elements, + MARIA_BITMAP_BLOCK*); + do + { + /* + We use 0x3fff here as the two upmost bits are reserved for + TAIL_BIT and START_EXTENT_BIT + */ + used= allocate_full_pages(bitmap, + (pages >= 0x3fff ? 
0x3fff : (uint) pages), + block, 0); + if (!used) + { + if (move_to_next_bitmap(info, bitmap)) + DBUG_RETURN(1); + } + else + { + pages-= used; + info->bitmap_blocks.elements++; + block++; + } + } while (pages != 0); + } + if (rest_length && find_tail(info, rest_length, + info->bitmap_blocks.elements++)) + DBUG_RETURN(1); + first_block= dynamic_element(&info->bitmap_blocks, first_block_pos, + MARIA_BITMAP_BLOCK*); + first_block->sub_blocks= (uint)(info->bitmap_blocks.elements + - first_block_pos); + DBUG_RETURN(0); +} + + +/* + Find pages to put ALL blobs + + SYNOPSIS + allocate_blobs() + info Maria handler + row Information of what is in the row (from calc_record_size()) + + RETURN + 0 ok + 1 error +*/ + +static my_bool allocate_blobs(MARIA_HA *info, MARIA_ROW *row) +{ + ulong *length, *end; + size_t elements; + /* + Reserve size for: + head block + one extent + tail block + */ + elements= info->bitmap_blocks.elements; + for (length= row->blob_lengths, end= length + info->s->base.blobs; + length < end; length++) + { + if (*length && find_blob(info, *length)) + return 1; + } + row->extents_count= (uint)(info->bitmap_blocks.elements - elements); + return 0; +} + + +/* + Reserve the current head page + + SYNOPSIS + use_head() + info Maria handler + page Page number to update + (Note that caller guarantees this is in the active + bitmap) + size How much free space is left on the page + block_position In which info->bitmap_block we have the + information about the head block. 
+ + NOTES + This is used on update where we are updating an existing head page +*/ + +static void use_head(MARIA_HA *info, pgcache_page_no_t page, uint size, + uint block_position) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + MARIA_BITMAP_BLOCK *block; + uchar *data; + uint offset, tmp, offset_page; + DBUG_ENTER("use_head"); + + DBUG_ASSERT(page % bitmap->pages_covered); + + block= dynamic_element(&info->bitmap_blocks, block_position, + MARIA_BITMAP_BLOCK*); + block->page= page; + block->page_count= 1 + TAIL_BIT; + block->empty_space= size; + block->used= BLOCKUSED_TAIL; + + /* + Mark place used by reading/writing 2 bytes at a time to handle + bitmaps in overlapping bytes + */ + offset_page= (uint) (page - bitmap->page - 1) * 3; + offset= offset_page & 7; + data= bitmap->map + offset_page / 8; + tmp= uint2korr(data); + block->org_bitmap_value= (tmp >> offset) & 7; + tmp= (tmp & ~(7 << offset)) | (FULL_HEAD_PAGE << offset); + int2store(data, tmp); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_VOID_RETURN; +} + + +/* + Find out where to split the row (ie, what goes in head, middle, tail etc) + + SYNOPSIS + find_where_to_split_row() + share Maria share + row Information of what is in the row (from calc_record_size()) + extents Max number of extents we have to store in header + split_size Free size on the page (The head length must be less + than this) + + RETURN + row_length for the head block. 
+*/ + +static uint find_where_to_split_row(MARIA_SHARE *share, MARIA_ROW *row, + uint extents, uint split_size) +{ + uint *lengths, *lengths_end; + /* + Ensure we have the minimum required space on head page: + - Header + length of field lengths (row->min_length) + - Number of extents + - One extent + */ + uint row_length= (row->min_length + + size_to_store_key_length(extents) + + ROW_EXTENT_SIZE); + DBUG_ASSERT(row_length <= split_size); + + /* + Store first in all_field_lengths the different parts that are written + to the row. This needs to be in same order as in + ma_block_rec.c::write_block_record() + */ + row->null_field_lengths[-3]= extents * ROW_EXTENT_SIZE; + row->null_field_lengths[-2]= share->base.fixed_not_null_fields_length; + row->null_field_lengths[-1]= row->field_lengths_length; + for (lengths= row->null_field_lengths - EXTRA_LENGTH_FIELDS, + lengths_end= (lengths + share->base.fields - share->base.blobs + + EXTRA_LENGTH_FIELDS); lengths < lengths_end; lengths++) + { + if (row_length + *lengths > split_size) + break; + row_length+= *lengths; + } + return row_length; +} + + +/* + Find where to write the middle parts of the row and the tail + + SYNOPSIS + write_rest_of_head() + info Maria handler + position Position in bitmap_blocks. Is 0 for rows that needs + full blocks (ie, has a head, middle part and optional tail) + rest_length How much left of the head block to write. 
+ + RETURN + 0 ok + 1 error +*/ + +static my_bool write_rest_of_head(MARIA_HA *info, uint position, + ulong rest_length) +{ + MARIA_SHARE *share= info->s; + uint full_page_size= FULL_PAGE_SIZE(share); + MARIA_BITMAP_BLOCK *block; + DBUG_ENTER("write_rest_of_head"); + DBUG_PRINT("enter", ("position: %u rest_length: %lu", position, + rest_length)); + + if (position == 0) + { + /* Write out full pages */ + uint pages= rest_length / full_page_size; + + rest_length%= full_page_size; + if (rest_length >= MAX_TAIL_SIZE(share->block_size)) + { + /* Put tail on a full page */ + pages++; + rest_length= 0; + } + if (find_mid(info, pages, 1)) + DBUG_RETURN(1); + /* + Insert empty block after full pages, to allow write_block_record() to + split segment into used + free page + */ + block= dynamic_element(&info->bitmap_blocks, 2, MARIA_BITMAP_BLOCK*); + block->page_count= 0; + block->used= 0; + } + if (rest_length) + { + if (find_tail(info, rest_length, ELEMENTS_RESERVED_FOR_MAIN_PART - 1)) + DBUG_RETURN(1); + } + else + { + /* Empty tail block */ + block= dynamic_element(&info->bitmap_blocks, + ELEMENTS_RESERVED_FOR_MAIN_PART - 1, + MARIA_BITMAP_BLOCK *); + block->page_count= 0; + block->used= 0; + } + DBUG_RETURN(0); +} + + +/* + Find where to store one row + + SYNPOSIS + _ma_bitmap_find_place() + info Maria handler + row Information about row to write + blocks Store data about allocated places here + + RETURN + 0 ok + row->space_on_head_page contains minimum number of bytes we + expect to put on the head page. 
+ 1 error + my_errno is set to error +*/ + +my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row, + MARIA_BITMAP_BLOCKS *blocks) +{ + MARIA_SHARE *share= info->s; + my_bool res= 1; + uint full_page_size, position, max_page_size; + uint head_length, row_length, rest_length, extents_length; + DBUG_ENTER("_ma_bitmap_find_place"); + + blocks->count= 0; + blocks->tail_page_skipped= blocks->page_skipped= 0; + row->extents_count= 0; + + /* + Reserve place for the following blocks: + - Head block + - Full page block + - Marker block to allow write_block_record() to split full page blocks + into full and free part + - Tail block + */ + + info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART; + max_page_size= (share->block_size - PAGE_OVERHEAD_SIZE(share)); + + mysql_mutex_lock(&share->bitmap.bitmap_lock); + + if (row->total_length <= max_page_size) + { + /* Row fits in one page */ + position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1; + if (find_head(info, (uint) row->total_length, position)) + goto abort; + row->space_on_head_page= row->total_length; + goto end; + } + + /* + First allocate all blobs so that we can find out the needed size for + the main block. + */ + if (row->blob_length && allocate_blobs(info, row)) + goto abort; + + extents_length= row->extents_count * ROW_EXTENT_SIZE; + /* + The + 3 is reserved for storing the number of segments in the row header. 
+ */ + if ((head_length= (row->head_length + extents_length + 3)) <= + max_page_size) + { + /* Main row part fits into one page */ + position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1; + if (find_head(info, head_length, position)) + goto abort; + row->space_on_head_page= head_length; + goto end; + } + + /* Allocate enough space */ + head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE; + + /* The first segment size is stored in 'row_length' */ + row_length= find_where_to_split_row(share, row, row->extents_count + + ELEMENTS_RESERVED_FOR_MAIN_PART-1, + max_page_size); + + /* Note: despite its name, full_page_size holds the maximum tail size here */ + full_page_size= MAX_TAIL_SIZE(share->block_size); + position= 0; + rest_length= head_length - row_length; + if (rest_length <= full_page_size) + position= ELEMENTS_RESERVED_FOR_MAIN_PART -2; /* Only head and tail */ + if (find_head(info, row_length, position)) + goto abort; + row->space_on_head_page= row_length; + + if (write_rest_of_head(info, position, rest_length)) + goto abort; + +end: + blocks->block= dynamic_element(&info->bitmap_blocks, position, + MARIA_BITMAP_BLOCK*); + blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position; + /* First block's page_count is for all blocks */ + blocks->count= (uint)(info->bitmap_blocks.elements - position); + res= 0; + +abort: + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/* + Find where to put row on update (when head page is already defined) + + SYNOPSIS + _ma_bitmap_find_new_place() + info Maria handler + row Information about row to write + page On which page original row was stored + free_size Free size on head page + blocks Store data about allocated places here + + NOTES + This function is only called when the new row can't fit in the space of + the old row in the head page. + + This is essentially the same as _ma_bitmap_find_place() except that + we don't call find_head() to search in bitmaps where to put the page. 
+ + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *row, + pgcache_page_no_t page, uint free_size, + MARIA_BITMAP_BLOCKS *blocks) +{ + MARIA_SHARE *share= info->s; + my_bool res= 1; + uint position; + uint head_length, row_length, rest_length, extents_length; + ulonglong bitmap_page; + DBUG_ENTER("_ma_bitmap_find_new_place"); + + blocks->count= 0; + blocks->tail_page_skipped= blocks->page_skipped= 0; + row->extents_count= 0; + info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART; + + mysql_mutex_lock(&share->bitmap.bitmap_lock); + + /* + First allocate all blobs so that we can find out the needed size for + the main block. + */ + if (row->blob_length && allocate_blobs(info, row)) + goto abort; + + /* Switch to the bitmap page that covers the current head page */ + bitmap_page= page - page % share->bitmap.pages_covered; + + if (share->bitmap.page != bitmap_page && + _ma_change_bitmap_page(info, &share->bitmap, bitmap_page)) + goto abort; + + extents_length= row->extents_count * ROW_EXTENT_SIZE; + if ((head_length= (row->head_length + extents_length + 3)) <= free_size) + { + /* Main row part fits into one page */ + position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1; + use_head(info, page, head_length, position); + row->space_on_head_page= head_length; + goto end; + } + + /* Allocate enough space */ + head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE; + + /* + The first segment size is stored in 'row_length' + We have to add ELEMENTS_RESERVED_FOR_MAIN_PART here as the extent + information may be up to this size when the header splits. 
+ */ + row_length= find_where_to_split_row(share, row, row->extents_count + + ELEMENTS_RESERVED_FOR_MAIN_PART-1, + free_size); + + position= 0; + rest_length= head_length - row_length; + if (rest_length <= MAX_TAIL_SIZE(share->block_size)) + position= ELEMENTS_RESERVED_FOR_MAIN_PART -2; /* Only head and tail */ + use_head(info, page, row_length, position); + row->space_on_head_page= row_length; + + if (write_rest_of_head(info, position, rest_length)) + goto abort; + +end: + blocks->block= dynamic_element(&info->bitmap_blocks, position, + MARIA_BITMAP_BLOCK*); + blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position; + /* First block's page_count is for all blocks */ + blocks->count= (uint)(info->bitmap_blocks.elements - position); + res= 0; + +abort: + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/**************************************************************************** + Clear and reset bits +****************************************************************************/ + +/* + Set fill pattern for a page + + set_page_bits() + info Maria handler + bitmap Bitmap handler + page Page number + fill_pattern Pattern (not size) for page; must be in the range 0-7 + + NOTES + Page may not be part of active bitmap + + RETURN + 0 ok + 1 error +*/ + +static my_bool set_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, uint fill_pattern) +{ + pgcache_page_no_t bitmap_page; + uint offset_page, offset, tmp, org_tmp, used_offset; + uchar *data; + DBUG_ENTER("set_page_bits"); + DBUG_ASSERT(fill_pattern <= 7); + + bitmap_page= page - page % bitmap->pages_covered; + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(1); + + /* Find page number from start of bitmap */ + offset_page= (uint) (page - bitmap->page - 1); + + /* + Mark place used by reading/writing 2 bytes at a time to handle + bitmaps in overlapping bytes + */ + offset_page*= 3; + offset= offset_page & 7; + data= 
bitmap->map + offset_page / 8; + org_tmp= tmp= uint2korr(data); + tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset); + if (tmp == org_tmp) + DBUG_RETURN(0); /* No changes */ + + /* + Take care to not write bytes outside of bitmap. + fill_pattern is 3 bits, so we need to write two bytes + if bit position we write to is > (8-3) + */ + if (offset > 5) + int2store(data, tmp); + else + data[0]= tmp; + + /* + Reset full_head_size or full_tail_size if we are releasing data before + it. Increase used_size if we are allocating data. + */ + used_offset= (uint) (data - bitmap->map); + if (fill_pattern < 4) + set_if_smaller(bitmap->full_head_size, used_offset); + if (fill_pattern == 0 || (fill_pattern > 4 && fill_pattern < 7)) + set_if_smaller(bitmap->full_tail_size, used_offset); + if (fill_pattern != 0) + { + /* Calculate which was the last changed byte */ + used_offset+= offset > 5 ? 2 : 1; + set_if_bigger(bitmap->used_size, used_offset); + } + + _ma_check_bitmap(bitmap); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + if (fill_pattern != FULL_HEAD_PAGE && fill_pattern != FULL_TAIL_PAGE) + set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page); + /* + Note that if the condition above is false (page is full), and all pages of + this bitmap are now full, and that bitmap page was + first_bitmap_with_space, we don't modify first_bitmap_with_space, indeed + its value still tells us where to start our search for a bitmap with space + (which is for sure after this full one). + That does mean that first_bitmap_with_space is only a lower bound. 
+ */ + DBUG_RETURN(0); +} + + +/* + Get bitmap pattern for a given page + + SYNOPSIS + bitmap_get_page_bits() + info Maria handler + bitmap Bitmap handler + page Page number + + NOTES + The DBUG trace name used below is "_ma_bitmap_get_page_bits", which is + the name of the locking wrapper, not of this function. + + RETURN + 0-7 Bitmap pattern + ~0 Error (couldn't read page) +*/ + +static uint bitmap_get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page) +{ + pgcache_page_no_t bitmap_page; + uint offset_page, offset, tmp; + uchar *data; + DBUG_ENTER("_ma_bitmap_get_page_bits"); + + bitmap_page= page - page % bitmap->pages_covered; + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(~ (uint) 0); + + /* Find page number from start of bitmap */ + offset_page= (uint) (page - bitmap->page - 1); + /* + Each page uses 3 bits. Read 2 bytes at a time so that a pattern that + straddles a byte boundary is handled. + */ + offset_page*= 3; + offset= offset_page & 7; + data= bitmap->map + offset_page / 8; + tmp= uint2korr(data); + DBUG_RETURN((tmp >> offset) & 7); +} + + +/* As above, but take a lock while getting the data */ + +uint _ma_bitmap_get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page) +{ + uint tmp; + mysql_mutex_lock(&bitmap->bitmap_lock); + tmp= bitmap_get_page_bits(info, bitmap, page); + mysql_mutex_unlock(&bitmap->bitmap_lock); + return tmp; +} + + +/* + Mark all pages in a region as free + + SYNOPSIS + _ma_bitmap_reset_full_page_bits() + info Maria handler + bitmap Bitmap handler + page Start page + page_count Number of pages + + NOTES + We assume that all pages in the region are covered by the same bitmap + One must have a lock on info->s->bitmap.bitmap_lock + + RETURN + 0 ok + 1 Error (when reading bitmap) +*/ + +my_bool _ma_bitmap_reset_full_page_bits(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, + uint page_count) +{ + ulonglong bitmap_page; + uint offset, bit_start, bit_count, tmp, byte_offset; + uchar *data; + DBUG_ENTER("_ma_bitmap_reset_full_page_bits"); + DBUG_PRINT("enter", 
("page: %lu page_count: %u", (ulong) page, page_count)); + mysql_mutex_assert_owner(&info->s->bitmap.bitmap_lock); + + bitmap_page= page - page % bitmap->pages_covered; + /* The region may not start on the bitmap page itself (debug check only) */ + DBUG_ASSERT(page != bitmap_page); + + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(1); + + /* Find page number from start of bitmap */ + offset= (uint) (page - bitmap->page - 1); + + /* Clear bits from 'page * 3' -> '(page + page_count) * 3' */ + bit_start= offset * 3; + bit_count= page_count * 3; + + byte_offset= bit_start/8; + data= bitmap->map + byte_offset; + offset= bit_start & 7; + + tmp= (255 << offset); /* Bits to clear in the first byte */ + if (bit_count + offset < 8) + { + /* Only clear bits between 'offset' and 'offset+bit_count-1' */ + tmp^= (255 << (offset + bit_count)); + } + *data&= ~tmp; + + set_if_smaller(bitmap->full_head_size, byte_offset); + set_if_smaller(bitmap->full_tail_size, byte_offset); + + if ((int) (bit_count-= (8 - offset)) > 0) + { + uint fill; + data++; + /* + -1 is here to avoid one 'if' statement and to let the following code + handle the last byte + */ + if ((fill= (bit_count - 1) / 8)) + { + bzero(data, fill); + data+= fill; + } + bit_count-= fill * 8; /* Bits left to clear */ + tmp= (1 << bit_count) - 1; + *data&= ~tmp; + } + set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_RETURN(0); +} + + +/* + Set all pages in a region as used + + SYNOPSIS + _ma_bitmap_set_full_page_bits() + info Maria handler + bitmap Bitmap handler + page Start page + page_count Number of pages + + NOTES + We assume that all pages in the region are covered by the same bitmap + One must have a lock on info->s->bitmap.bitmap_lock + + RETURN + 0 ok + 1 Error (invalid page range or error when reading bitmap) +*/ + +my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, uint page_count) +{ + ulonglong bitmap_page; + uint offset, 
bit_start, bit_count, tmp; + uchar *data; + DBUG_ENTER("_ma_bitmap_set_full_page_bits"); + DBUG_PRINT("enter", ("page: %lu page_count: %u", (ulong) page, page_count)); + mysql_mutex_assert_owner(&info->s->bitmap.bitmap_lock); + + bitmap_page= page - page % bitmap->pages_covered; + if (page == bitmap_page || + page + page_count > bitmap_page + bitmap->pages_covered) + { + /* Region starts on the bitmap page or extends past the pages it covers */ + DBUG_ASSERT(0); /* Wrong in data */ + DBUG_RETURN(1); + } + + if (bitmap_page != bitmap->page && + _ma_change_bitmap_page(info, bitmap, bitmap_page)) + DBUG_RETURN(1); + + /* Find page number from start of bitmap */ + offset= (uint) (page - bitmap->page - 1); + + /* Set bits from 'page * 3' -> '(page + page_count) * 3' */ + bit_start= offset * 3; + bit_count= page_count * 3; + + data= bitmap->map + bit_start / 8; + offset= bit_start & 7; + + tmp= (255 << offset); /* Bits to set in the first byte */ + if (bit_count + offset < 8) + { + /* Only set bits between 'offset' and 'offset+bit_count-1' */ + tmp^= (255 << (offset + bit_count)); + } + *data|= tmp; + + if ((int) (bit_count-= (8 - offset)) > 0) + { + uint fill; + data++; + /* + -1 is here to avoid one 'if' statement and to let the following code + handle the last byte + */ + if ((fill= (bit_count - 1) / 8)) + { + bfill(data, fill, 255); + data+= fill; + } + bit_count-= fill * 8; /* Bits left to set */ + tmp= (1 << bit_count) - 1; + *data|= tmp; + } + set_if_bigger(bitmap->used_size, (uint) (data - bitmap->map) + 1); + _ma_check_bitmap(bitmap); + bitmap->changed= 1; + DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap);); + DBUG_RETURN(0); +} + + +/** + @brief + Make a transition of MARIA_FILE_BITMAP::non_flushable. + If the bitmap becomes flushable, which requires that REDO-UNDO has been + logged and all bitmap pages touched by the thread have a correct + allocation, it unpins all bitmap pages, and if _ma_bitmap_flush_all() is + waiting (in practice it is a checkpoint), it wakes it up. 
+ If the bitmap becomes or stays unflushable, the function merely records it + unless a concurrent _ma_bitmap_flush_all() is happening, in which case the + function first waits for the flush to be done. + + @note + this sets info->non_flushable_state to 1 if we have incremented + bitmap->non_flushable and not yet decremented it. + + @param info Maria handler (the table's share is info->s) + @param non_flushable_inc Increment of MARIA_FILE_BITMAP::non_flushable + (-1 or +1). +*/ + +void _ma_bitmap_flushable(MARIA_HA *info, int non_flushable_inc) +{ + MARIA_SHARE *share= info->s; + MARIA_FILE_BITMAP *bitmap; + DBUG_ENTER("_ma_bitmap_flushable"); + + /* + Non-transactional tables are never automatically flushed and need no + protection + */ + if (!share->now_transactional) + DBUG_VOID_RETURN; + + bitmap= &share->bitmap; + mysql_mutex_lock(&bitmap->bitmap_lock); + + if (non_flushable_inc == -1) + { + DBUG_ASSERT((int) bitmap->non_flushable > 0); + DBUG_ASSERT(info->non_flushable_state == 1); + if (--bitmap->non_flushable == 0) + { + /* + We unlock and unpin pages locked and pinned by other threads. It does + not seem to be an issue as all bitmap changes are serialized with + the bitmap's mutex. + */ + _ma_bitmap_unpin_all(share); + if (unlikely(bitmap->waiting_for_non_flushable)) + { + DBUG_PRINT("info", ("bitmap flushable waking up flusher")); + mysql_cond_broadcast(&bitmap->bitmap_cond); + } + } + DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable)); + mysql_mutex_unlock(&bitmap->bitmap_lock); + info->non_flushable_state= 0; + DBUG_VOID_RETURN; + } + DBUG_ASSERT(non_flushable_inc == 1); + DBUG_ASSERT(info->non_flushable_state == 0); + + bitmap->waiting_for_flush_all_requested++; + while (unlikely(bitmap->flush_all_requested)) + { + /* + Some other thread is waiting for the bitmap to become + flushable. Not the moment to make the bitmap unflushable or more + unflushable; let's rather back off and wait. 
If we didn't do this, with + multiple writers, there may always be one thread causing the bitmap to + be unflushable and _ma_bitmap_flush_all() would wait for long. + There should not be a deadlock because if our thread increased + non_flushable (and thus _ma_bitmap_flush_all() is waiting for at least + our thread), it is not going to increase it more so is not going to come + here. + */ + DBUG_PRINT("info", ("waiting for bitmap flusher")); + mysql_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock); + } + bitmap->waiting_for_flush_all_requested--; + /* No flush is requested any more; count this handler as a non-flushable user */ + bitmap->non_flushable++; + DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable)); + mysql_mutex_unlock(&bitmap->bitmap_lock); + info->non_flushable_state= 1; + DBUG_VOID_RETURN; +} + + +/* + Correct bitmap pages to reflect the true allocation + + SYNOPSIS + _ma_bitmap_release_unused() + info Maria handle + blocks Bitmap blocks + + IMPLEMENTATION + If block->used & BLOCKUSED_TAIL is set: + If block->used & BLOCKUSED_USED is set, then the bits for the + corresponding page is set according to block->empty_space + If block->used & BLOCKUSED_USED is not set, then the bits for + the corresponding page is set to org_bitmap_value; + + If block->used & BLOCKUSED_TAIL is not set: + if block->used is not set, the bits for the corresponding page are + cleared + + For the first block (head block) the logic is same as for a tail block + + Note that we may have 'filler blocks' that are used to split a block + in half; These can be recognized by that they have page_count == 0. 
+ + This code also reverse the effect of ma_bitmap_flushable(.., 1); + + RETURN + 0 ok + 1 error (Couldn't write or read bitmap page) +*/ + +my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks) +{ + MARIA_BITMAP_BLOCK *block= blocks->block, *end= block + blocks->count; + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + uint bits, current_bitmap_value; + DBUG_ENTER("_ma_bitmap_release_unused"); + + /* + We can skip FULL_HEAD_PAGE (4) as the page was marked as 'full' + when we allocated space in the page + */ + current_bitmap_value= FULL_HEAD_PAGE; + + mysql_mutex_lock(&bitmap->bitmap_lock); + + /* First handle head block */ + if (block->used & BLOCKUSED_USED) + { + DBUG_PRINT("info", ("head page: %lu empty_space: %u", + (ulong) block->page, block->empty_space)); + bits= _ma_free_size_to_head_pattern(bitmap, block->empty_space); + if (block->used & BLOCKUSED_USE_ORG_BITMAP) + current_bitmap_value= block->org_bitmap_value; + } + else + bits= block->org_bitmap_value; + if (bits != current_bitmap_value) + { + if (set_page_bits(info, bitmap, block->page, bits)) + goto err; + } + else + { + DBUG_ASSERT(current_bitmap_value == + bitmap_get_page_bits(info, bitmap, block->page)); + } + + /* Handle all full pages and tail pages (for head page and blob) */ + for (block++; block < end; block++) + { + uint page_count; + if (!block->page_count) + continue; /* Skip 'filler blocks' */ + + page_count= block->page_count; + if (block->used & BLOCKUSED_TAIL) + { + current_bitmap_value= FULL_TAIL_PAGE; + /* A tail block always refers to exactly one page */ + page_count= 1; + if (block->used & BLOCKUSED_USED) + { + DBUG_PRINT("info", ("tail page: %lu empty_space: %u", + (ulong) block->page, block->empty_space)); + bits= free_size_to_tail_pattern(bitmap, block->empty_space); + if (block->used & BLOCKUSED_USE_ORG_BITMAP) + current_bitmap_value= block->org_bitmap_value; + } + else + bits= block->org_bitmap_value; + + /* + The page has all bits set; The following test is an 
optimization + to not set the bits to the same value as before. + */ + DBUG_ASSERT(current_bitmap_value == + bitmap_get_page_bits(info, bitmap, block->page)); + + if (bits != current_bitmap_value) + { + if (set_page_bits(info, bitmap, block->page, bits)) + goto err; + } + } + else if (!(block->used & BLOCKUSED_USED) && + _ma_bitmap_reset_full_page_bits(info, bitmap, + block->page, page_count)) + goto err; + } + + /* + This duplicates _ma_bitmap_flushable(info, -1) except that we already + hold bitmap_lock + */ + if (info->non_flushable_state) + { + DBUG_ASSERT(((int) (bitmap->non_flushable)) > 0); + info->non_flushable_state= 0; + if (--bitmap->non_flushable == 0) + { + _ma_bitmap_unpin_all(info->s); + if (unlikely(bitmap->waiting_for_non_flushable)) + { + DBUG_PRINT("info", ("bitmap flushable waking up flusher")); + mysql_cond_broadcast(&bitmap->bitmap_cond); + } + } + } + DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable)); + + mysql_mutex_unlock(&bitmap->bitmap_lock); + DBUG_RETURN(0); + +err: + /* On error we only release the mutex; non_flushable_state is not changed */ + mysql_mutex_unlock(&bitmap->bitmap_lock); + DBUG_RETURN(1); +} + + +/* + Free full pages from bitmap and pagecache + + SYNOPSIS + _ma_bitmap_free_full_pages() + info Maria handle + extents Extents (as stored on disk) + count Number of extents + + IMPLEMENTATION + Mark all full pages (not tails) from extents as free, both in bitmap + and page cache. 
+ + RETURN + 0 ok + 1 error (Couldn't write or read bitmap page) +*/ + +my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents, + uint count) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + my_bool res; + DBUG_ENTER("_ma_bitmap_free_full_pages"); + + for (; count--; extents+= ROW_EXTENT_SIZE) + { + /* Extent: page number, then page count at offset ROW_EXTENT_PAGE_SIZE */ + pgcache_page_no_t page= uint5korr(extents); + uint page_count= (uint2korr(extents + ROW_EXTENT_PAGE_SIZE) & + ~START_EXTENT_BIT); + /* Extents with TAIL_BIT set are not freed here */ + if (!(page_count & TAIL_BIT)) + { + if (page == 0 && page_count == 0) + continue; /* Not used extent */ + if (pagecache_delete_pages(info->s->pagecache, &info->dfile, page, + page_count, PAGECACHE_LOCK_WRITE, 1)) + DBUG_RETURN(1); + /* The bitmap lock is taken and released for each extent */ + mysql_mutex_lock(&bitmap->bitmap_lock); + res= _ma_bitmap_reset_full_page_bits(info, bitmap, page, page_count); + mysql_mutex_unlock(&bitmap->bitmap_lock); + if (res) + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +/* + Mark in the bitmap how much free space there is on a page + + SYNOPSIS + _ma_bitmap_set() + info Maria handler + page Page number + head 1 if page is a head page, 0 if tail page + empty_space How much empty space there is on page + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_bitmap_set(MARIA_HA *info, pgcache_page_no_t page, my_bool head, + uint empty_space) +{ + MARIA_FILE_BITMAP *bitmap= &info->s->bitmap; + uint bits; + my_bool res; + DBUG_ENTER("_ma_bitmap_set"); + DBUG_PRINT("enter", ("page: %lu head: %d empty_space: %u", + (ulong) page, head, empty_space)); + + mysql_mutex_lock(&info->s->bitmap.bitmap_lock); + bits= (head ? 
+ _ma_free_size_to_head_pattern(bitmap, empty_space) : + free_size_to_tail_pattern(bitmap, empty_space)); + res= set_page_bits(info, bitmap, page, bits); + mysql_mutex_unlock(&info->s->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/* + Check that bitmap pattern is correct for a page + + NOTES + Used in maria_chk + + SYNOPSIS + _ma_check_bitmap_data() + info Maria handler + page_type What kind of page this is + empty_space Empty space on page + bitmap_pattern Bitmap pattern for page (from bitmap) + + RETURN + 0 ok (bitmap_pattern is the pattern expected for the page) + 1 error (bitmap_pattern differs from the expected pattern) +*/ + +my_bool _ma_check_bitmap_data(MARIA_HA *info, enum en_page_type page_type, + uint empty_space, uint bitmap_pattern) +{ + uint bits; + switch (page_type) { + case UNALLOCATED_PAGE: + case MAX_PAGE_TYPE: + bits= 0; + break; + case HEAD_PAGE: + bits= _ma_free_size_to_head_pattern(&info->s->bitmap, empty_space); + break; + case TAIL_PAGE: + bits= free_size_to_tail_pattern(&info->s->bitmap, empty_space); + break; + case BLOB_PAGE: + bits= FULL_TAIL_PAGE; + break; + default: + bits= 0; /* to satisfy compiler */ + DBUG_ASSERT(0); + } + return (bitmap_pattern != bits); +} + +/** + Check that bitmap looks correct + + - All data before full_head_size and full_tail_size are allocated + - There is no allocated data after used_size + All of the above need to be correct only according to 6 byte + alignment as all loops reads 6 bytes at a time and we check both + start and end position according to the current 6 byte position. 
+*/ + +#ifndef DBUG_OFF +static void _ma_check_bitmap(MARIA_FILE_BITMAP *bitmap) +{ + uchar *data= bitmap->map; + uchar *end= bitmap->map + bitmap->total_size; + uchar *full_head_end=0, *full_tail_end=0, *first_empty= bitmap->map; + + for (; data < end; data+= 6) + { + ulonglong bits= uint6korr(data); /* 6 bytes = 6*8/3= 16 patterns */ + uint i; + + /* All 16 patterns are 4 (octal constant) or all 16 patterns are 7 */ + if (bits == 04444444444444444LL || bits == 0xffffffffffffLL) + { + first_empty= data + 6; + continue; /* block fully used */ + } + if (bits == 0) + { + if (!full_head_end) + full_head_end= data; + if (!full_tail_end) + full_tail_end= data; + continue; + } + + first_empty= data + 6; + if (!full_head_end || !full_tail_end) + { + for (i= 0, bits >>= 0; i < 16 ; i++, bits >>= 3) + { + uint pattern= (uint) (bits & 7); + if (pattern == FULL_HEAD_PAGE || pattern == FULL_TAIL_PAGE) + continue; + + if (pattern < 4 && !full_head_end) + full_head_end= data; + if ((pattern == 0 || (pattern > 4 && pattern < 7)) && !full_tail_end) + full_tail_end= data; + } + } + } + if (!full_head_end) + full_head_end= data; + if (!full_tail_end) + full_tail_end= data; + + /* used_size must point after the last byte that had some data */ + DBUG_ASSERT(bitmap->used_size <= bitmap->total_size); + DBUG_ASSERT((bitmap->map + (bitmap->used_size+5)/6*6) >= first_empty); + /* full_xxxx_size can't point after the first block that has free data */ + DBUG_ASSERT((bitmap->map + (bitmap->full_head_size/6*6)) <= full_head_end); + DBUG_ASSERT((bitmap->map + (bitmap->full_tail_size/6*6)) <= full_tail_end); +} +#endif + + +/* + Check if the page type matches the one that we have in the bitmap + + SYNOPSIS + _ma_check_if_right_bitmap_type() + info Maria handler + page_type What kind of page this is + page Page number + bitmap_pattern Store here the pattern that was in the bitmap for the + page. This is always updated. 
+ + NOTES + Used in maria_chk + + RETURN + 0 ok + 1 error +*/ + +my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info, + enum en_page_type page_type, + pgcache_page_no_t page, + uint *bitmap_pattern) +{ + if ((*bitmap_pattern= _ma_bitmap_get_page_bits(info, &info->s->bitmap, + page)) > 7) + return 1; /* Couldn't read page */ + switch (page_type) { + case HEAD_PAGE: + /* Head pages must have a pattern in the range 1-4 */ + return *bitmap_pattern < 1 || *bitmap_pattern > 4; + case TAIL_PAGE: + /* Tail pages must have a pattern in the range 5-7 */ + return *bitmap_pattern < 5; + case BLOB_PAGE: + /* Blob pages must have pattern 7 */ + return *bitmap_pattern != 7; + default: + break; + } + /* Any other page type is not expected here and is reported as an error */ + DBUG_ASSERT(0); + return 1; +} + + +/** + @brief create the first bitmap page of a freshly created data file + + @param share table's share + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int _ma_bitmap_create_first(MARIA_SHARE *share) +{ + uint block_size= share->bitmap.block_size; + File file= share->bitmap.file.file; + uchar marker[CRC_SIZE]; + + /* + Next write operation of the page will write correct CRC + if it is needed + */ + int4store(marker, MARIA_NO_CRC_BITMAP_PAGE); + + /* + Size the file to one block minus the marker, then write the marker as + the last bytes of the block + */ + if (mysql_file_chsize(file, block_size - sizeof(marker), + 0, MYF(MY_WME)) || + my_pwrite(file, marker, sizeof(marker), + block_size - sizeof(marker), + MYF(MY_NABP | MY_WME))) + return 1; + share->state.state.data_file_length= block_size; + _ma_bitmap_delete_all(share); + return 0; +} + + +/** + @brief Pagecache callback that flushes the transaction log before a + bitmap page is flushed. + + @param args Pagecache hook arguments; args->data is the MARIA_SHARE + (only read when DBUG_ASSERT_EXISTS is defined) + + @return Result of translog_flush() up to the current log horizon. +*/ + +static my_bool +flush_log_for_bitmap(PAGECACHE_IO_HOOK_ARGS *args __attribute__ ((unused))) +{ +#ifdef DBUG_ASSERT_EXISTS + const MARIA_SHARE *share= (MARIA_SHARE*)args->data; +#endif + DBUG_ENTER("flush_log_for_bitmap"); + DBUG_ASSERT(share->now_transactional); + /* + WAL imposes that UNDOs reach disk before bitmap is flushed. 
We don't know + the LSN of the last UNDO about this bitmap page, so we flush whole log. + */ + DBUG_RETURN(translog_flush(translog_get_horizon())); +} + + +/** + @brief Set callbacks for bitmap pages + + @param file Pagecache file whose hooks are set + @param share Table's share; stored as the callback data of 'file' + + @note + We don't use pagecache_file_init here, as we want to keep the + code readable +*/ + +void _ma_bitmap_set_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share) +{ + /* Start from null hooks, then override the ones bitmap pages need */ + pagecache_file_set_null_hooks(file); + file->callback_data= (uchar*) share; + file->flush_log_callback= maria_flush_log_for_page_none; + file->post_write_hook= maria_page_write_failure; + + if (share->temporary) + { + file->post_read_hook= &maria_page_crc_check_none; + file->pre_write_hook= &maria_page_filler_set_none; + } + else + { + file->post_read_hook= &maria_page_crc_check_bitmap; + if (share->options & HA_OPTION_PAGE_CHECKSUM) + file->pre_write_hook= &maria_page_crc_set_normal; + else + file->pre_write_hook= &maria_page_filler_set_bitmap; + /* Only transactional tables get the log flush callback */ + if (share->now_transactional) + file->flush_log_callback= flush_log_for_bitmap; + } +} + + +/** + Extends data file with zeroes and creates new bitmap pages into page cache. + + Writes all bitmap pages in [from, to]. + + Non-bitmap pages of zeroes are correct as they are marked empty in + bitmaps. Bitmap pages will not be zeroes: they will get their CRC fixed when + flushed. And if there is a crash before flush (so they are zeroes at + restart), a REDO will re-create them in page cache. 
+*/ + +static my_bool +_ma_bitmap_create_missing_into_pagecache(MARIA_SHARE *share, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t from, + pgcache_page_no_t to, + uchar *zeroes) +{ + pgcache_page_no_t i; + /* + We do not use my_chsize() because there can be a race between when it + reads the physical size and when it writes (assume data_file_length is 10, + physical length is 8 and two data pages are in cache, and here we do a + my_chsize: my_chsize sees physical length is 8, then the two data pages go + to disk then my_chsize writes from page 8 and so overwrites the two data + pages, wrongly). + We instead rely on the filesystem filling gaps with zeroes. + */ + for (i= from; i <= to; i+= bitmap->pages_covered) + { + /** + No need to keep them pinned, they are new so flushable. + @todo but we may want to keep them pinned, as an optimization: if they + are not pinned they may go to disk before the data pages go (so, the + physical pages would be in non-ascending "sparse" order on disk), or the + filesystem may fill gaps with zeroes physically which is a waste of + time. + */ + if (pagecache_write(share->pagecache, + &bitmap->file, i, 0, + zeroes, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE)) + goto err; + } + /* + Data pages after data_file_length are full of zeroes but that is allowed + as they are marked empty in the bitmap. + */ + return FALSE; +err: + _ma_set_fatal_error_with_share(share, my_errno); + return TRUE; +} + + +/** + Creates missing bitmaps when we extend the data file. + + At run-time, when we need a new bitmap page we come here; and only one bitmap + page at a time is created. + + In some recovery cases we insert at a large offset in the data file, way + beyond state.data_file_length, so can need to create more than one bitmap + page in one go. 
Known case is: + Start a transaction in Maria; + delete last row of very large table (with delete_row) + do a bulk insert + crash + Then UNDO_BULK_INSERT will truncate table files, and + UNDO_ROW_DELETE will want to put the row back to its original position, + extending the data file a lot: bitmap page*s* in the hole must be created, + or the table would look corrupted. + + We need to log REDOs for bitmap creation, consider: we apply a REDO for a + data page, which creates the first data page covered by a new bitmap + not yet created. If the data page is flushed but the bitmap page is not and + there is a crash, re-execution of the REDO will complain about the zeroed + bitmap page (see it as corruption). Thus a REDO is needed to re-create the + bitmap. + + @param info Maria handler + @param bitmap Bitmap handler + @param page Last bitmap page to create + + @note When this function is called this must be true: + ((page + 1) * bitmap->block_size > info->s->state.state.data_file_length) + + @return FALSE on success, TRUE on error + +*/ + +static my_bool _ma_bitmap_create_missing(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page) +{ + MARIA_SHARE *share= info->s; + uint block_size= bitmap->block_size; + pgcache_page_no_t from, to; + my_off_t data_file_length= share->state.state.data_file_length; + DBUG_ENTER("_ma_bitmap_create_missing"); + DBUG_PRINT("enter", ("page: %lld", (longlong) page)); + + /* Sanity checks before anything is created */ + if (data_file_length < block_size) + goto err; /* corrupted, should have first bitmap page */ + if (page * block_size >= share->base.max_data_file_length) + { + my_errno= HA_ERR_RECORD_FILE_FULL; + goto err; + } + + /* First (in offset order) bitmap page to create */ + from= (data_file_length / block_size - 1) / bitmap->pages_covered + 1; + from*= bitmap->pages_covered; + /* + page>=from because: + (page + 1) * bs > dfl, and page == k * pc so: + (k * pc + 1) * bs > dfl; k * pc + 1 > dfl / bs; k * pc > dfl / bs - 1 + k > (dfl / bs - 1) / pc; k >= (dfl / bs - 1) / pc + 1 + k * pc >= ((dfl / bs - 1) / pc 
+ 1) * pc == from. + */ + DBUG_ASSERT(page >= from); + + if (share->now_transactional) + { + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + page_store(log_data + FILEID_STORE_SIZE, from); + page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + /* + We don't use info->trn so that this REDO is always executed even though + the UNDO does not reach disk due to crash. This is also consistent with + the fact that the new bitmap pages are not pinned. + */ + if (translog_write_record(&lsn, LOGREC_REDO_BITMAP_NEW_PAGE, + &dummy_transaction_object, info, + (translog_size_t)sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL)) + goto err; + /* + No need to flush the log: the bitmap pages we are going to create will + flush it when they go to disk. + */ + } + + /* + Last bitmap page. It has special creation: will go to the page cache + only later as we are going to modify it very soon. + */ + bzero(bitmap->map, bitmap->block_size); + bitmap->used_size= bitmap->full_head_size= bitmap->full_tail_size= 0; + bitmap->changed=1; +#ifndef DBUG_OFF + /* + Make a copy of the page to be able to print out bitmap changes during + debugging + */ + memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size); +#endif + + /* Last bitmap page to create before 'page' */ + DBUG_ASSERT(page >= bitmap->pages_covered); + to= page - bitmap->pages_covered; + /* + In run-time situations, from>=to is always false, i.e. we always create + one bitmap at a time ('page'). 
+ */ + if ((from <= to) && + _ma_bitmap_create_missing_into_pagecache(share, bitmap, from, to, + bitmap->map)) + goto err; + + share->state.state.data_file_length= (page + 1) * bitmap->block_size; + + DBUG_RETURN(FALSE); +err: + DBUG_RETURN(TRUE); +} + + +my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info, + LSN lsn __attribute__ ((unused)), + const uchar *header) +{ + MARIA_SHARE *share= info->s; + MARIA_FILE_BITMAP *bitmap= &share->bitmap; + my_bool error; + pgcache_page_no_t from, to, min_from; + DBUG_ENTER("_ma_apply_redo_bitmap_new_page"); + + from= page_korr(header); + to= page_korr(header + PAGE_STORE_SIZE); + DBUG_PRINT("info", ("from: %lu to: %lu", (ulong)from, (ulong)to)); + if ((from > to) || + (from % bitmap->pages_covered) != 0 || + (to % bitmap->pages_covered) != 0) + { + error= TRUE; /* corrupted log record */ + goto err; + } + + min_from= (share->state.state.data_file_length / bitmap->block_size - 1) / + bitmap->pages_covered + 1; + min_from*= bitmap->pages_covered; + if (from < min_from) + { + DBUG_PRINT("info", ("overwrite bitmap pages from %lu", (ulong)min_from)); + /* + We have to overwrite. It could be that there was a bitmap page in + memory, covering a data page which went to disk, then crash: the + bitmap page is now full of zeros and is ==min_from, we have to overwrite + it with correct checksum. 
+ */ + } + share->state.changed|= STATE_CHANGED; + bzero(info->buff, bitmap->block_size); + if (!(error= + _ma_bitmap_create_missing_into_pagecache(share, bitmap, from, to, + info->buff))) + share->state.state.data_file_length= (to + 1) * bitmap->block_size; + +err: + DBUG_RETURN(error); +} diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c new file mode 100644 index 00000000..543ddcca --- /dev/null +++ b/storage/maria/ma_blockrec.c @@ -0,0 +1,7620 @@ +/* Copyright (C) 2007-2008 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Storage of records in block + + Some clarifications about the abbrev used: + + NULL fields -> Fields that may have contain a NULL value. + Not null fields -> Fields that may not contain a NULL value. + Critical fields -> Fields that can't be null and can't be dropped without + causing a table reorganization. + + + Maria will have a LSN at start of each page (excluding the bitmap pages) + + The different page types that are in a data file are: + + Bitmap pages Map of free pages in the next extent (8192 page size + gives us 256M of mapped pages / bitmap) + Head page Start of rows are stored on this page. 
+ A rowid always points to a head page + Blob page This page is totally filled with data from one blob or by + a set of long VARCHAR/CHAR fields + Tail page This contains the last part from different rows, blobs + or varchar fields. + + The data file starts with a bitmap page, followed by as many data + pages as the bitmap can cover. After this there is a new bitmap page + and more data pages etc. + + For information about the bitmap page, see ma_bitmap.c + + Structure of data and tail page: + + The page has a row directory at end of page to allow us to do deletes + without having to reorganize the page. It also allows us to later store + some more bytes after each row to allow them to grow without having to move + around other rows. + + Page header: + + LSN 7 bytes Log position for last page change + PAGE_TYPE 1 uchar 0 unallocated / 1 for head / 2 for tail / 3 for blob + DIR_COUNT 1 uchar Number of row/tail entries on page + FREE_DIR_LINK 1 uchar Pointer to first free directory entry or 255 if no + empty space 2 bytes Bytes of empty space on page + + The most significant bit in PAGE_TYPE is set to 1 if the data on the page + can be compacted to get more space. (PAGE_CAN_BE_COMPACTED) + + Row data + + Row directory of NO entries, that consist of the following for each row + (in reverse order; i.e., first record is stored last): + + Position 2 bytes Position of row on page + Length 2 bytes Length of entry + + For Position and Length, the 1 most significant bit of the position and + the 1 most significant bit of the length could be used for some states of + the row (in other words, we should try to keep these reserved) + + Position is 0 if the entry is not used. In this case length[0] points + to a previous free entry (255 if no previous entry) and length[1] + to the next free entry (or 255 if last free entry). This works because + the directory entry 255 can never be marked free (if the first directory + entry is freed, the directory is shrunk). 
+ + checksum 4 bytes Reserved for full page read testing and live backup. + + ---------------- + + Structure of blob pages: + + LSN 7 bytes Log position for last page change + PAGE_TYPE 1 uchar 3 + + data + + ----------------- + + Row data structure: + + Flag 1 uchar Marker of which header field exists + TRANSID 6 bytes TRANSID of changing transaction + (optional, added on insert and first + update/delete) + VER_PTR 7 bytes Pointer to older version in log + (undo record) + (optional, added after first + update/delete) + DELETE_TRANSID 6 bytes (optional). TRANSID of original row. + Added on delete. + Nulls_extended 1 uchar To allow us to add new DEFAULT NULL + fields (optional, added after first + change of row after alter table) + Number of ROW_EXTENT's 1-3 uchar Length encoded, optional + This is the number of extents the + row is split into + First row_extent 7 uchar Pointer to first row extent (optional) + + Total length of length array 1-3 uchar Only used if we have + char/varchar/blob fields. + Row checksum 1 uchar Only if table created with checksums + Null_bits .. One bit for each NULL field (a field that may + have the value NULL) + Empty_bits .. One bit for each field that may be 'empty'. + (Both for null and not null fields). + This bit is 1 if the value for the field is + 0 or empty string. + + field_offsets 2 byte/offset + For each 32'th field, there is one offset + that points to where the field information + starts in the block. This is to provide + fast access to later field in the row + when we only need to return a small + set of fields. + TODO: Implement this. + + Things marked above as 'optional' will only be present if the + corresponding bit is set in 'Flag' field. Flag gives us a way to + get more space on a page when doing page compaction as we don't need + to store TRANSID that have committed before the smallest running + transaction we have in memory. 
+ + Data in the following order: + (Field order is precalculated when table is created) + + Critical fixed length, not null, fields. (Note, these can't be dropped) + Fixed length, null fields + + Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields. + Number of bytes used in length array per entry is depending on max length + for field. + + ROW_EXTENT's + CHAR data (space stripped) + VARCHAR data + BLOB data + + Fields marked in null_bits or empty_bits are not stored in data part or + length array. + + If row doesn't fit into the given block, then the first EXTENT will be + stored last on the row. This is done so that we don't break any field + data in the middle. + + We first try to store the full row into one block. If that's not possible + we move out each big blob into their own extents. If this is not enough we + move out a concatenation of all varchars to their own extent. + + Each blob and the concatenated char/varchar fields are stored the following + way: + - Store the parts in as many full-contiguous pages as possible. + - The last part, that doesn't fill a full page, is stored in tail page. + + When doing an insert of a new row, we don't have to have + VER_PTR in the row. This will make rows that are not changed stored + efficiently. On update and delete we would add TRANSID (if it was an old + committed row) and VER_PTR to + the row. On row page compaction we can easily detect rows where + TRANSID was committed before the longest running transaction + started and we can then delete TRANSID and VER_PTR from the row to + gain more space. + + If a row is deleted in Maria, we change TRANSID to the deleting + transaction's id, change VER_PTR to point to the undo record for the delete, + and add DELETE_TRANSID (the id of the transaction which last + inserted/updated the row before its deletion). 
DELETE_TRANSID allows an old + transaction to avoid reading the log to know if it can see the last version + before delete (in other words it reduces the probability of having to follow + VER_PTR). TODO: depending on a compilation option, evaluate the performance + impact of not storing DELETE_TRANSID (which would make the row smaller). + + Description of the different parts: + + Flag is coded as: + + Description bit + TRANS_ID_exists 0 + VER_PTR_exists 1 + Row is deleted 2 (Means that DELETE_TRANSID exists) + Nulls_extended_exists 3 + Row is split 7 This means that 'Number_of_row_extents' exists + + Nulls_extended is the number of new DEFAULT NULL fields in the row + compared to the number of DEFAULT NULL fields when the first version + of the table was created. If Nulls_extended doesn't exist in the row, + we know it's 0 as this must be one of the original rows from when the + table was first created. This coding allows us to add 255*8 = + 2040 new fields without requiring a full alter table. + + Empty_bits is used to allow us to store 0, 0.0, empty string, empty + varstring and empty blob efficiently. (This is very good for data + warehousing where NULL's are often regarded as evil). Having this + bitmap also allows us to drop information of a field during a future + delete if field was deleted with ALTER TABLE DROP COLUMN. To be able + to handle DROP COLUMN, we must store in the index header the fields + that have been dropped. When unpacking a row we will ignore dropped + fields. When storing a row, we will mark a dropped field either with a + null in the null bit map or in the empty_bits and not store any data + for it. + TODO: Add code for handling dropped fields. + + + A ROW EXTENT is a range of pages. One ROW_EXTENT is coded as: + + START_PAGE 5 bytes + PAGE_COUNT 2 bytes. Bit 16 is set if this is a tail page. + Bit 15 is set if this is start of a new + blob extent. + + With 8K pages, we can cover 256M in one extent. 
This coding gives us a + maximum file size of 2^40*8192 = 8192 tera + + As an example of ROW_EXTENT handling, assume a row with one integer + field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2 + big BLOB fields that we have updated. + + The record format for storing this into an empty file would be: + + Page 1: + + 00 00 00 00 00 00 00 LSN + 01 Only one row in page + FF No free dir entry + xx xx Empty space on page + + 10 Flag: row split, VER_PTR exists + 01 00 00 00 00 00 TRANSID 1 + 00 00 00 00 00 01 00 VER_PTR to first block in LOG file 1 + 5 Number of row extents + 02 00 00 00 00 03 00 VARCHAR's are stored in full pages 2,3,4 + 0 No null fields + 0 No empty fields + 05 00 00 00 00 00 80 Tail page for VARCHAR, rowid 0 + 06 00 00 00 00 80 00 First blob, stored at page 6-133 + 05 00 00 00 00 01 80 Tail of first blob (896 bytes) at page 5 + 86 00 00 00 00 80 00 Second blob, stored at page 134-262 + 05 00 00 00 00 02 80 Tail of second blob (896 bytes) at page 5 + 05 00 5 integer + FA Length of first varchar field (size 250) + 00 60 Length of second varchar field (size 8192*3) + 00 60 10 First medium BLOB, 1M + 01 00 10 00 Second BLOB, 1M + xx xx xx xx xx xx Varchars are stored here until end of page + + ..... until end of page + + 09 00 F4 1F Start position 9, length 8180 + xx xx xx xx Checksum + + A data page is allowed to have a wrong CRC and header as long as it is + marked empty in the bitmap and its directory's count is 0. +*/ + +#include "maria_def.h" +#include "ma_blockrec.h" +#include "trnman.h" +#include "ma_trnman.h" +#include "ma_key_recover.h" +#include "ma_recovery_util.h" +#include <lf.h> + +/* + Struct for having a cursor over a set of extents. + This is used to loop over all extents for a row when reading + the row data. It's also used to store the tail positions for + a read row to be used by a later update/delete command. +*/ + +typedef struct st_maria_extent_cursor +{ + /* + Pointer to packed uchar array of extents for the row. 
+ Format is described above in the header + */ + uchar *extent; + /* Where data starts on page; Only for debugging */ + uchar *data_start; + /* Positions of all tails in the row. Updated when reading a row */ + MARIA_RECORD_POS *tail_positions; + /* Current page */ + pgcache_page_no_t page; + /* How many pages in the page region */ + uint page_count; + /* What kind of lock to use for tail pages */ + enum pagecache_page_lock lock_for_tail_pages; + /* Total number of extents (i.e., entries in the 'extent' slot) */ + uint extent_count; + /* <> 0 if current extent is a tail page; Set while using cursor */ + uint tail; + /* Position for tail on tail page */ + uint tail_row_nr; + /* + == 1 if we are working on the first extent (i.e., the one that is stored in + the row header, not an extent that is stored as part of the row data). + */ + my_bool first_extent; +} MARIA_EXTENT_CURSOR; + + +/** + @brief Structure for passing down info to write_hook_for_clr_end(). + This hook needs to know the variation of the live checksum caused by the + current operation to update state.checksum under log's mutex, + needs to know the transaction's previous undo_lsn to set + trn->undo_lsn under log mutex, and needs to know the type of UNDO being + undone now to modify state.records under log mutex. 
+*/ + +/** S:share,D:checksum_delta,E:expression,P:pointer_into_record,L:length */ +#define store_checksum_in_rec(S,D,E,P,L) do \ + { \ + D= 0; \ + if ((S)->calc_checksum != NULL) \ + { \ + D= (E); \ + ha_checksum_store(P, D); \ + L+= HA_CHECKSUM_STORE_SIZE; \ + } \ + } while (0) + + +static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails); +static my_bool delete_head_or_tail(MARIA_HA *info, + pgcache_page_no_t page, uint record_number, + my_bool head, my_bool from_update); +#ifndef DBUG_OFF +static void _ma_print_directory(MARIA_SHARE *share, + FILE *file, uchar *buff, uint block_size); +#endif +static uchar *store_page_range(MARIA_SHARE *share, + uchar *to, MARIA_BITMAP_BLOCK *block, + ulong length, + uint *tot_ranges); +static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record, + LEX_CUSTRING *log_parts, + uint *log_parts_count); +static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec, + const uchar *newrec, + LEX_CUSTRING *log_parts, + uint *log_parts_count); + +/**************************************************************************** + Initialization +****************************************************************************/ + +/* + Initialize data needed for block structures +*/ + + +/* Size of the different header elements for a row */ + +static uchar header_sizes[]= +{ + TRANSID_SIZE, + VERPTR_SIZE, + TRANSID_SIZE, /* Delete transid */ + 1 /* Null extends */ +}; + +/* + Calculate array of all used headers + + Used to speed up: + + size= 1; + if (flag & 1) + size+= TRANSID_SIZE; + if (flag & 2) + size+= VERPTR_SIZE; + if (flag & 4) + size+= TRANSID_SIZE + if (flag & 8) + size+= 1; + + NOTES + This is called only once at startup of Maria +*/ + +static uchar total_header_size[1 << array_elements(header_sizes)]; +#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1) + +void _ma_init_block_record_data(void) +{ + uint i; + bzero(total_header_size, sizeof(total_header_size)); + 
total_header_size[0]= FLAG_SIZE; /* Flag uchar */ + for (i= 1; i < array_elements(total_header_size); i++) + { + uint size= FLAG_SIZE, j, bit; + for (j= 0; (bit= (1 << j)) <= i; j++) + { + if (i & bit) + size+= header_sizes[j]; + } + total_header_size[i]= size; + } +} + + +my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file) +{ + my_bool res; + pgcache_page_no_t last_page; + + /* + First calculate the max file length with can have with a pointer of size + rec_reflength. + + The 'rec_reflength - 1' is because one byte is used for row + position withing the page. + The /2 comes from _ma_transaction_recpos_to_keypos() where we use + the lowest bit to mark if there is a transid following the rownr. + */ + last_page= ((ulonglong) 1 << ((share->base.rec_reflength-1)*8))/2; + if (!last_page) /* Overflow; set max size */ + last_page= ~(pgcache_page_no_t) 0; + + res= _ma_bitmap_init(share, data_file, &last_page); + share->base.max_data_file_length= _ma_safe_mul(last_page + 1, + share->block_size); +#if SIZEOF_OFF_T == 4 + set_if_smaller(share->base.max_data_file_length, INT_MAX32); +#endif + return res; +} + + +my_bool _ma_once_end_block_record(MARIA_SHARE *share) +{ + int res= _ma_bitmap_end(share); + if (share->bitmap.file.file >= 0) + { + if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file, + share->deleting ? FLUSH_IGNORE_CHANGED : FLUSH_RELEASE)) + res= 1; + /* + File must be synced as it is going out of the maria_open_list and so + becoming unknown to Checkpoint. 
+ */ + if (!share->s3_path) + { + if (share->now_transactional && + mysql_file_sync(share->bitmap.file.file, MYF(MY_WME))) + res= 1; + if (mysql_file_close(share->bitmap.file.file, MYF(MY_WME))) + res= 1; + } + /* + Trivial assignment to guard against multiple invocations + (May happen if file are closed but we want to keep the maria object + around a bit longer) + */ + share->bitmap.file.file= -1; + } + if (share->id != 0) + { + /* + We de-assign the id even though index has not been flushed, this is ok + as close_lock serializes us with a Checkpoint looking at our share. + */ + translog_deassign_id_from_share(share); + } + return res; +} + + +/* Init info->cur_row structure */ + +my_bool _ma_init_block_record(MARIA_HA *info) +{ + MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row; + MARIA_SHARE *share= info->s; + myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0); + uint default_extents; + DBUG_ENTER("_ma_init_block_record"); + + if (!my_multi_malloc(PSI_INSTRUMENT_ME, flag, + &row->empty_bits, share->base.pack_bytes, + &row->field_lengths, + share->base.max_field_lengths + 2, + &row->blob_lengths, sizeof(ulong) * share->base.blobs, + &row->null_field_lengths, (sizeof(uint) * + (share->base.fields - + share->base.blobs + + EXTRA_LENGTH_FIELDS)), + &row->tail_positions, (sizeof(MARIA_RECORD_POS) * + (share->base.blobs + 2)), + &new_row->empty_bits, share->base.pack_bytes, + &new_row->field_lengths, + share->base.max_field_lengths + 2, + &new_row->blob_lengths, + sizeof(ulong) * share->base.blobs, + &new_row->null_field_lengths, (sizeof(uint) * + (share->base.fields - + share->base.blobs + + EXTRA_LENGTH_FIELDS)), + &info->log_row_parts, + sizeof(*info->log_row_parts) * + (TRANSLOG_INTERNAL_PARTS + 3 + + share->base.fields + 3), + &info->update_field_data, + (share->base.fields * 4 + + share->base.max_field_lengths + 1 + 4), + NullS, 0)) + DBUG_RETURN(1); + /* Skip over bytes used to store length of field length for logging */ + 
row->field_lengths+= 2; + new_row->field_lengths+= 2; + + /* Reserve some initial space to avoid mallocs during execution */ + default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 + + (AVERAGE_BLOB_SIZE / + FULL_PAGE_SIZE(share) / + BLOB_SEGMENT_MIN_SIZE)); + + if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &info->bitmap_blocks, + sizeof(MARIA_BITMAP_BLOCK), + default_extents, 64, flag)) + goto err; + info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE; + if (!(info->cur_row.extents= my_malloc(PSI_INSTRUMENT_ME, + info->cur_row.extents_buffer_length, + flag))) + goto err; + + info->row_base_length= share->base_length; + info->row_flag= share->base.default_row_flag; + + /* + We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in + null_field_lengths to allow splitting of rows in 'find_where_to_split_row' + */ + row->null_field_lengths+= EXTRA_LENGTH_FIELDS; + new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS; + + DBUG_RETURN(0); + +err: + _ma_end_block_record(info); + DBUG_RETURN(1); +} + + +void _ma_end_block_record(MARIA_HA *info) +{ + DBUG_ENTER("_ma_end_block_record"); + my_free(info->cur_row.empty_bits); + delete_dynamic(&info->bitmap_blocks); + my_free(info->cur_row.extents); + my_free(info->blob_buff); + /* + The data file is closed, when needed, in ma_once_end_block_record(). + The following protects us from doing an extra, not allowed, close + in maria_close() + */ + info->dfile.file= -1; + DBUG_VOID_RETURN; +} + + +/**************************************************************************** + Helper functions +****************************************************************************/ + +/* + Return the next unused postion on the page after a directory entry. + + SYNOPSIS + start_of_next_entry() + dir Directory entry to be used. This can not be the + the last entry on the page! + + RETURN + # Position in page where next entry starts. + Everything between the '*dir' and this are free to be used. 
+*/ + +static inline uint start_of_next_entry(uchar *dir) +{ + uchar *prev; + /* + Find previous used entry. (There is always a previous entry as + the directory never starts with a deleted entry) + */ + for (prev= dir - DIR_ENTRY_SIZE ; + prev[0] == 0 && prev[1] == 0 ; + prev-= DIR_ENTRY_SIZE) + {} + return (uint) uint2korr(prev); +} + + +/* + Return the offset where the previous entry ends (before on page) + + SYNOPSIS + end_of_previous_entry() + dir Address for current directory entry + end Address to last directory entry + + RETURN + # Position where previous entry ends (smallest address on page) + Everything between # and current entry are free to be used. +*/ + + +static inline uint end_of_previous_entry(MARIA_SHARE *share, + uchar *dir, uchar *end) +{ + uchar *pos; + for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE) + { + uint offset; + if ((offset= uint2korr(pos))) + return offset + uint2korr(pos+2); + } + return PAGE_HEADER_SIZE(share); +} + + +#ifndef DBUG_OFF + +static void _ma_print_directory(MARIA_SHARE *share, + FILE *file, uchar *buff, uint block_size) +{ + uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET], row= 0; + uint end_of_prev_row= PAGE_HEADER_SIZE(share); + uchar *dir, *end; + + dir= dir_entry_pos(buff, block_size, max_entry-1); + end= dir_entry_pos(buff, block_size, 0); + + DBUG_LOCK_FILE; /* If using DBUG_FILE */ + fprintf(file,"Directory dump (pos:length):\n"); + + for (row= 1; dir <= end ; end-= DIR_ENTRY_SIZE, row++) + { + uint offset= uint2korr(end); + uint length= uint2korr(end+2); + fprintf(file, " %4u:%4u", offset, offset ? 
length : 0); + if (!(row % (80/12))) + fputc('\n', file); + if (offset) + { + DBUG_ASSERT(offset >= end_of_prev_row); + end_of_prev_row= offset + length; + } + } + fputc('\n', file); + fflush(file); + DBUG_UNLOCK_FILE; +} + + +static void check_directory(MARIA_SHARE *share, + uchar *buff, uint block_size, uint min_row_length, + uint real_empty_size) +{ + uchar *dir, *end; + uint max_entry= (uint) buff[DIR_COUNT_OFFSET]; + uint start_of_dir, deleted; + uint end_of_prev_row= PAGE_HEADER_SIZE(share); + uint empty_size_on_page; + uint empty_size; + uchar free_entry, prev_free_entry; + + dir= dir_entry_pos(buff, block_size, max_entry-1); + start_of_dir= (uint) (dir - buff); + end= dir_entry_pos(buff, block_size, 0); + deleted= empty_size= 0; + + empty_size_on_page= (real_empty_size != (uint) -1 ? real_empty_size : + uint2korr(buff + EMPTY_SPACE_OFFSET)); + + /* Ensure that all rows are in increasing order and no overlaps */ + for (; dir <= end ; end-= DIR_ENTRY_SIZE) + { + uint offset= uint2korr(end); + uint length= uint2korr(end+2); + if (offset) + { + DBUG_ASSERT(offset >= end_of_prev_row); + DBUG_ASSERT(!length || length >= min_row_length); + empty_size+= offset - end_of_prev_row; + end_of_prev_row= offset + length; + } + else + deleted++; + } + empty_size+= start_of_dir - end_of_prev_row; + DBUG_ASSERT(end_of_prev_row <= start_of_dir); + DBUG_ASSERT(empty_size == empty_size_on_page); + + /* check free links */ + free_entry= buff[DIR_FREE_OFFSET]; + prev_free_entry= END_OF_DIR_FREE_LIST; + while (free_entry != END_OF_DIR_FREE_LIST) + { + uchar *dir= dir_entry_pos(buff, block_size, free_entry); + DBUG_ASSERT(dir[0] == 0 && dir[1] == 0); + DBUG_ASSERT(dir[2] == prev_free_entry); + prev_free_entry= free_entry; + free_entry= dir[3]; + deleted--; + } + DBUG_ASSERT(deleted == 0); +} +#else +#define check_directory(A,B,C,D,E) +#endif /* DBUG_OFF */ + + +/** + @brief Calculate if there is enough entries on the page +*/ + +static my_bool enough_free_entries(uchar *buff, uint 
block_size, + uint wanted_entries) +{ + uint entries= (uint) buff[DIR_COUNT_OFFSET]; + uint needed_free_entries, free_entry; + + if (entries + wanted_entries <= MAX_ROWS_PER_PAGE) + return 1; + + /* Check if enough free entries in free list */ + needed_free_entries= entries + wanted_entries - MAX_ROWS_PER_PAGE; + + free_entry= (uint) buff[DIR_FREE_OFFSET]; + while (free_entry != END_OF_DIR_FREE_LIST) + { + uchar *dir; + if (!--needed_free_entries) + return 1; + dir= dir_entry_pos(buff, block_size, free_entry); + free_entry= dir[3]; + } + return 0; /* Not enough entries */ +} + + +/** + @brief Check if there is room for more rows on page + + @fn enough_free_entries_on_page + + @return 0 Directory is full + @return 1 There is room for more entries on the page +*/ + +my_bool enough_free_entries_on_page(MARIA_SHARE *share, + uchar *page_buff) +{ + enum en_page_type page_type; + page_type= (enum en_page_type) (page_buff[PAGE_TYPE_OFFSET] & + ~(uchar) PAGE_CAN_BE_COMPACTED); + + if (page_type == HEAD_PAGE) + { + uint row_count= (uint) page_buff[DIR_COUNT_OFFSET]; + return !(row_count == MAX_ROWS_PER_PAGE && + page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST); + } + return enough_free_entries(page_buff, share->block_size, + 1 + share->base.blobs); +} + + +/** + @brief Extend a record area to fit a given size block + + @fn extend_area_on_page() + @param info Handler + @param buff Page buffer + @param dir Pointer to dir entry in buffer + @param rownr Row number we working on + @param block_size Block size of buffer + @param request_length How much data we want to put at [dir] + @param empty_space Total empty space in buffer + This is updated with length after dir + is allocated and current block freed + @param head_page 1 if head page, 0 for tail page + + @implementation + The logic is as follows (same as in _ma_update_block_record()) + - If new data fits in old block, use old block. + - Extend block with empty space before block. If enough, use it. 
+ - Extend block with empty space after block. If enough, use it. + - Use _ma_compact_block_page() to get all empty space at dir. + + @note + The given directory entry is set to rec length. + empty_space doesn't include the new directory entry + + + @return + @retval 0 ok + @retval ret_offset Pointer to store offset to found area + @retval ret_length Pointer to store length of found area + @retval [dir] rec_offset is store here too + + @retval 1 error (wrong info in block) +*/ + +static my_bool extend_area_on_page(MARIA_HA *info, + uchar *buff, uchar *dir, + uint rownr, + uint request_length, + uint *empty_space, uint *ret_offset, + uint *ret_length, + my_bool head_page) +{ + uint rec_offset, length, org_rec_length; + uint max_entry= (uint) buff[DIR_COUNT_OFFSET]; + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; + DBUG_ENTER("extend_area_on_page"); + + /* + We can't check for min length here as we may have called + extend_directory() to create a new (empty) entry just before + */ + check_directory(share, buff, block_size, 0, *empty_space); + + rec_offset= uint2korr(dir); + if (rec_offset) + { + /* Extending old row; Mark current space as 'free' */ + length= org_rec_length= uint2korr(dir + 2); + DBUG_PRINT("info", ("rec_offset: %u length: %u request_length: %u " + "empty_space: %u", + rec_offset, org_rec_length, request_length, + *empty_space)); + + *empty_space+= org_rec_length; + } + else + { + /* Reusing free directory entry; Free it from the directory list */ + if (dir[2] == END_OF_DIR_FREE_LIST) + buff[DIR_FREE_OFFSET]= dir[3]; + else + { + uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]); + DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr); + prev_dir[3]= dir[3]; + } + if (dir[3] != END_OF_DIR_FREE_LIST) + { + uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]); + DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr); + next_dir[2]= dir[2]; + } + rec_offset= 
start_of_next_entry(dir); + length= 0; + } + if (length < request_length) + { + uint old_rec_offset; + /* + New data did not fit in old position. + Find first possible position where to put new data. + */ + old_rec_offset= rec_offset; + rec_offset= end_of_previous_entry(share, + dir, buff + block_size - + PAGE_SUFFIX_SIZE); + length+= (uint) (old_rec_offset - rec_offset); + DBUG_ASSERT(old_rec_offset); + /* + 'length' is 0 if we are doing an insert into a not allocated block. + This can only happen during "REDO of INSERT" or "UNDO of DELETE." + */ + if (length < request_length) + { + /* + Did not fit in current block + empty space. Extend with + empty space after block. + */ + if (rownr == max_entry - 1) + { + /* Last entry; Everything is free between this and directory */ + length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) - + rec_offset); + } + else + length= start_of_next_entry(dir) - rec_offset; + DBUG_ASSERT((int) length >= 0); + if (length < request_length) + { + /* Not enough continuous space, compact page to get more */ + int2store(dir, rec_offset); + /* Reset length, as this may be a deleted block */ + int2store(dir+2, 0); + _ma_compact_block_page(share, + buff, rownr, 1, + head_page ? info->trn->min_read_from: 0, + head_page ? share->base.min_block_length : 0); + rec_offset= uint2korr(dir); + length= uint2korr(dir+2); + if (length < request_length) + { + DBUG_PRINT("error", ("Not enough space: " + "length: %u request_length: %u", + length, request_length)); + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + DBUG_RETURN(1); /* Error in block */ + } + *empty_space= length; /* All space is here */ + } + } + } + int2store(dir, rec_offset); + int2store(dir + 2, length); + *ret_offset= rec_offset; + *ret_length= length; + + check_directory(share, + buff, block_size, + head_page ? 
share->base.min_block_length : 0, + *empty_space - length); + DBUG_RETURN(0); +} + + +/** + @brief Copy not changed fields from 'from' to 'to' + + @notes + Assumption is that most fields are not changed! + (Which is why we don't test if all bits are set for some bytes in bitmap) +*/ + +void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields, + uchar *to, uchar *from) +{ + MARIA_COLUMNDEF *column, *end_column; + uchar *bitmap= (uchar*) changed_fields->bitmap; + MARIA_SHARE *share= info->s; + uint bit= 1; + + for (column= share->columndef, end_column= column+ share->base.fields; + column < end_column; column++) + { + if (!(*bitmap & bit)) + { + uint field_length= column->length; + if (column->type == FIELD_VARCHAR) + { + if (column->fill_length == 1) + field_length= (uint) from[column->offset] + 1; + else + field_length= uint2korr(from + column->offset) + 2; + } + memcpy(to + column->offset, from + column->offset, field_length); + } + if ((bit= (bit << 1)) == 256) + { + bitmap++; + bit= 1; + } + } +} + +#ifdef NOT_YET_NEEDED +/* Calculate empty space on a page */ + +static uint empty_space_on_page(uchar *buff, uint block_size) +{ + enum en_page_type; + page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] & + ~(uchar) PAGE_CAN_BE_COMPACTED); + if (page_type == UNALLOCATED_PAGE) + return block_size; + if ((uint) page_type <= TAIL_PAGE) + return uint2korr(buff+EMPTY_SPACE_OFFSET); + return 0; /* Blob page */ +} +#endif + + +/* + @brief Ensure we have space for new directory entries + + @fn make_space_for_directory() + @param info Handler + @param buff Page buffer + @param max_entry Number of current entries in directory + @param count Number of new entries to be added to directory + @param first_dir First directory entry on page + @param empty_space Total empty space in buffer. It's updated + to reflect the new empty space + @param first_pos Store position to last data byte on page here + @param head_page 1 if head page, 0 for tail page. 
+ + @note + This function is inline as the argument passing is the biggest + part of the function + + @return + @retval 0 ok + @retval 1 error (No data on page, fatal error) +*/ + +static inline my_bool +make_space_for_directory(MARIA_HA *info, + uchar *buff, uint max_entry, + uint count, uchar *first_dir, uint *empty_space, + uint *first_pos, + my_bool head_page) +{ + uint length_needed= DIR_ENTRY_SIZE * count; + MARIA_SHARE *share= info->s; + + /* + The following is not true only in the case where an UNDO is used to reinsert + a row on a previously unused page + */ + if (likely(max_entry)) + { + /* Check if there is place for the directory entry on the page */ + *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2); + + if ((uint) (first_dir - buff) < *first_pos + length_needed) + { + /* Create place for directory */ + _ma_compact_block_page(share, + buff, max_entry - 1, 0, + head_page ? info->trn->min_read_from : 0, + head_page ? share->base.min_block_length : 0); + *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2)); + *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + if (*empty_space < length_needed) + { + /* + We should always have space, as we only come here for + UNDO of DELETE (in which case we know the row was on the + page before) or if the bitmap told us there was space on page + */ + DBUG_ASSERT(!maria_assert_if_crashed_table); + return(1); + } + } + } + else + *first_pos= PAGE_HEADER_SIZE(share); + + /* Reduce directory entry size from free space size */ + (*empty_space)-= length_needed; + buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count); + return(0); +} + + +/* + Find free position in directory + + SYNOPSIS + find_free_position() + info Handler + buff Page + block_size Size of page + res_rownr Store index to free position here + res_length Store length of found segment here + empty_space Store length of empty space on disk here. This is + all empty space, including the found block. 
+ @param head_page 1 if head page, 0 for tail page. + + NOTES + If there is a free directory entry (entry with position == 0), + then use it and change it to be the size of the empty block + after the previous entry. This guarantees that all row entries + are stored on disk in inverse directory order, which makes life easier for + '_ma_compact_block_page()' and to know if there is free space after any + block. + + If there is no free entry (entry with position == 0), then we create + a new one. If there is not space for the directory entry (because + the last block overlaps with the directory), we compact the page. + + We will update the offset and the length of the found dir entry to + match the position and empty space found. + + buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller + + See start of file for description of how free directory entries are linked + + RETURN + 0 Error (directory full or last block goes over directory) + # Pointer to directory entry on page +*/ + +static uchar *find_free_position(MARIA_HA *info, + uchar *buff, uint block_size, uint *res_rownr, + uint *res_length, uint *empty_space, + my_bool head_page) +{ + uint max_entry, free_entry; + uint length, first_pos; + uchar *dir, *first_dir; + MARIA_SHARE *share= info->s; + DBUG_ENTER("find_free_position"); + + max_entry= (uint) buff[DIR_COUNT_OFFSET]; + free_entry= (uint) buff[DIR_FREE_OFFSET]; + *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + + DBUG_PRINT("info", ("max_entry: %u free_entry: %u", max_entry, free_entry)); + + first_dir= dir_entry_pos(buff, block_size, max_entry - 1); + + /* Search after first free position */ + if (free_entry != END_OF_DIR_FREE_LIST) + { + if (free_entry >= max_entry) + DBUG_RETURN(0); /* Consistency error */ + dir= dir_entry_pos(buff, block_size, free_entry); + DBUG_ASSERT(uint2korr(dir) == 0 && dir[2] == END_OF_DIR_FREE_LIST); + /* Relink free list */ + if ((buff[DIR_FREE_OFFSET]= dir[3]) != END_OF_DIR_FREE_LIST) + { + uchar 
*next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]); + DBUG_ASSERT((uint) next_entry[2] == free_entry && + uint2korr(next_entry) == 0); + next_entry[2]= END_OF_DIR_FREE_LIST; /* Backlink */ + } + + first_pos= end_of_previous_entry(share, + dir, buff + block_size - + PAGE_SUFFIX_SIZE); + length= start_of_next_entry(dir) - first_pos; + int2store(dir, first_pos); /* Update dir entry */ + int2store(dir + 2, 0); + *res_rownr= free_entry; + *res_length= length; + + check_directory(share, buff, block_size, + head_page ? share->base.min_block_length : 0, (uint) -1); + DBUG_RETURN(dir); + } + /* No free places in dir; create a new one */ + + /* Check if there is place for the directory entry */ + if (max_entry == MAX_ROWS_PER_PAGE) + DBUG_RETURN(0); + + if (make_space_for_directory(info, buff, max_entry, 1, + first_dir, empty_space, &first_pos, head_page)) + DBUG_RETURN(0); + + dir= first_dir - DIR_ENTRY_SIZE; + length= (uint) (dir - buff - first_pos); + DBUG_ASSERT(length <= *empty_space); + int2store(dir, first_pos); + int2store(dir + 2, 0); /* Max length of region */ + *res_rownr= max_entry; + *res_length= length; + + check_directory(share, + buff, block_size, + head_page ? share->base.min_block_length : 0, + *empty_space); + DBUG_RETURN(dir); +} + + +/** + @brief Enlarge page directory to hold more entries + + @fn extend_directory() + @param info Handler + @param buff Page buffer + @param block_size Block size + @param max_entry Number of directory entries on page + @param new_entry Position for new entry + @param empty_space Total empty space in buffer. It's updated + to reflect the new empty space + @param head_page 1 if head page, 0 for tail page. 
+ + @note + This is only called on UNDO when we want to expand the directory + to be able to re-insert row in a given position + + The new directory entry will be set to cover the maximum possible space + + @return + @retval 0 ok + @retval 1 error (No data on page, fatal error) +*/ + +static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size, + uint max_entry, uint new_entry, + uint *empty_space, my_bool head_page) +{ + uint length, first_pos; + uchar *dir, *first_dir; + DBUG_ENTER("extend_directory"); + + /* + Note that in if max_entry is 0, then first_dir will point to + an illegal directory entry. This is ok, as in this case we will + not access anything through first_dir. + */ + first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE; + + if (make_space_for_directory(info, buff, max_entry, + new_entry - max_entry + 1, + first_dir, empty_space, &first_pos, head_page)) + DBUG_RETURN(1); + + /* Set the new directory entry to cover the max possible length */ + dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1); + length= (uint) (dir - buff - first_pos); + int2store(dir, first_pos); + int2store(dir+2, length); + *empty_space-= length; + + if (new_entry-- > max_entry) + { + /* Link all row entries between new_entry and max_entry into free list */ + uint free_entry= (uint) buff[DIR_FREE_OFFSET]; + uint prev_entry= END_OF_DIR_FREE_LIST; + buff[DIR_FREE_OFFSET]= new_entry; + do + { + dir+= DIR_ENTRY_SIZE; + dir[0]= dir[1]= 0; + dir[2]= (uchar) prev_entry; + dir[3]= (uchar) new_entry-1; + prev_entry= new_entry; + } while (new_entry-- > max_entry); + if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST) + { + /* Relink next entry to point to newly freed entry */ + uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]); + DBUG_ASSERT(uint2korr(next_entry) == 0 && + next_entry[2] == END_OF_DIR_FREE_LIST); + next_entry[2]= max_entry; + } + } + + check_directory(info->s, + buff, block_size, + head_page ? 
MY_MIN(info->s->base.min_block_length, length) : + 0, *empty_space); + DBUG_RETURN(0); +} + + +/**************************************************************************** + Updating records +****************************************************************************/ + +/* + Calculate length of all the different field parts + + SYNOPSIS + calc_record_size() + info Maria handler + record Row to store + row Store statistics about row here + + NOTES + The statistics is used to find out how much space a row will need + and also where we can split a row when we need to split it into several + extents. +*/ + +static void calc_record_size(MARIA_HA *info, const uchar *record, + MARIA_ROW *row) +{ + MARIA_SHARE *share= info->s; + uchar *field_length_data; + MARIA_COLUMNDEF *column, *end_column; + uint *null_field_lengths= row->null_field_lengths; + ulong *blob_lengths= row->blob_lengths; + DBUG_ENTER("calc_record_size"); + + row->normal_length= row->char_length= row->varchar_length= + row->blob_length= row->extents_count= 0; + + /* Create empty bitmap and calculate length of each varlength/char field */ + bzero(row->empty_bits, share->base.pack_bytes); + field_length_data= row->field_lengths; + for (column= share->columndef + share->base.fixed_not_null_fields, + end_column= share->columndef + share->base.fields; + column < end_column; column++, null_field_lengths++) + { + if ((record[column->null_pos] & column->null_bit)) + { + if (column->type != FIELD_BLOB) + *null_field_lengths= 0; + else + *blob_lengths++= 0; + continue; + } + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + DBUG_ASSERT(column->empty_bit == 0); + /* fall through */ + case FIELD_SKIP_PRESPACE: /* Not packed */ + row->normal_length+= column->length; + *null_field_lengths= column->length; + break; + case FIELD_SKIP_ZERO: /* Fixed length field */ + if (memcmp(record+ column->offset, maria_zero_string, + column->length) == 0) + { + 
row->empty_bits[column->empty_pos] |= column->empty_bit; + *null_field_lengths= 0; + } + else + { + row->normal_length+= column->length; + *null_field_lengths= column->length; + } + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + const uchar *pos, *end; + for (pos= record + column->offset, end= pos + column->length; + end > pos && end[-1] == ' '; end--) + ; + if (pos == end) /* If empty string */ + { + row->empty_bits[column->empty_pos]|= column->empty_bit; + *null_field_lengths= 0; + } + else + { + uint length= (uint) (end - pos); + if (column->length <= 255) + *field_length_data++= (uchar) length; + else + { + int2store(field_length_data, length); + field_length_data+= 2; + } + row->char_length+= length; + *null_field_lengths= length; + } + break; + } + case FIELD_VARCHAR: + { + uint length, field_length_data_length; + const uchar *field_pos= record + column->offset; + + /* 256 is correct as this includes the length uchar */ + field_length_data[0]= field_pos[0]; + if (column->length <= 256) + { + length= (uint) (uchar) *field_pos; + field_length_data_length= 1; + } + else + { + length= uint2korr(field_pos); + field_length_data[1]= field_pos[1]; + field_length_data_length= 2; + } + *null_field_lengths= length; + if (!length) + { + row->empty_bits[column->empty_pos]|= column->empty_bit; + break; + } + row->varchar_length+= length; + *null_field_lengths= length; + field_length_data+= field_length_data_length; + break; + } + case FIELD_BLOB: + { + const uchar *field_pos= record + column->offset; + uint size_length= column->length - portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length, field_pos); + + *blob_lengths++= blob_length; + if (!blob_length) + row->empty_bits[column->empty_pos]|= column->empty_bit; + else + { + row->blob_length+= blob_length; + memcpy(field_length_data, field_pos, size_length); + field_length_data+= size_length; + } + break; + } + default: + DBUG_ASSERT(0); + } + } + row->field_lengths_length= (uint) 
(field_length_data - row->field_lengths); + /* + - info->row_base_length is base information we must have on a page in first + extent: + - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes + + table_checksum (0 | 1) + - row->min_length is minimum amount of data we must store on + a page. bitmap code will ensure we get at least this much + + total number of extents and one extent information + - fixed_not_null_fields_length is length of fixed length fields that can't + be compacted + - head_length is the amount of data for the head page + (ie, all fields except blobs) + */ + row->min_length= (info->row_base_length + + (share->base.max_field_lengths ? + size_to_store_key_length(row->field_lengths_length) : + 0)); + row->head_length= (row->min_length + + share->base.fixed_not_null_fields_length + + row->field_lengths_length + + row->normal_length + + row->char_length + row->varchar_length); + row->total_length= (row->head_length + row->blob_length); + if (row->total_length < share->base.min_block_length) + row->total_length= share->base.min_block_length; + DBUG_PRINT("exit", ("head_length: %lu total_length: %lu", + (ulong) row->head_length, (ulong) row->total_length)); + DBUG_VOID_RETURN; +} + + +/** + Compact page by removing all space between rows + + Moves up all rows to start of page. Moves blocks that are directly after + each other with one memmove. + + @note if rownr is the last row in the page, and extend_block is false, + caller has to make sure to update bitmap page afterwards to reflect freed + space. + + @param buff Page to compact + @param block_size Size of page + @param rownr Put empty data after this row + @param extend_block If 1, extend the block at 'rownr' to cover the + whole block. 
+ @param min_read_from If <> 0, remove all trid's that are less than this +*/ + +void _ma_compact_block_page(MARIA_SHARE *share, + uchar *buff, uint rownr, + my_bool extend_block, TrID min_read_from, + uint min_row_length) +{ + uint max_entry= (uint) buff[DIR_COUNT_OFFSET]; + uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block; + uint freed_size= 0; + uint block_size= share->block_size; + uchar *dir, *end; + DBUG_ENTER("_ma_compact_block_page"); + DBUG_PRINT("enter", ("rownr: %u min_read_from: %lu", rownr, + (ulong) min_read_from)); + DBUG_ASSERT(max_entry > 0 && + max_entry < (block_size - PAGE_HEADER_SIZE(share) - + PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE); + + /* Move all entries before and including rownr up to start of page */ + dir= dir_entry_pos(buff, block_size, rownr); + end= dir_entry_pos(buff, block_size, 0); + page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE(share); + diff= 0; + for (; dir <= end ; end-= DIR_ENTRY_SIZE) + { + uint offset= uint2korr(end); + + if (offset) + { + uint row_length= uint2korr(end + 2); + DBUG_ASSERT(offset >= page_pos); + DBUG_ASSERT(buff + offset + row_length <= dir); + DBUG_ASSERT(row_length >= min_row_length || row_length == 0); + + /* Row length can be zero if row is to be deleted */ + if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID)) + { + TrID transid= transid_korr(buff+offset+1); + if (transid < min_read_from) + { + /* Remove transid from row by moving the start point of the row up */ + buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID; + offset+= TRANSID_SIZE; + freed_size+= TRANSID_SIZE; + row_length-= TRANSID_SIZE; + int2store(end+2, row_length); + } + } + + if (offset != next_free_pos) + { + uint length= (next_free_pos - start_of_found_block); + /* + There was empty space before this and prev block + Check if we have to move previous block up to page start + */ + if (page_pos != start_of_found_block) + { + /* move up previous block */ + memmove(buff 
+ page_pos, buff + start_of_found_block, length); + } + page_pos+= length; + /* next continuous block starts here */ + start_of_found_block= offset; + diff= offset - page_pos; + } + int2store(end, offset - diff); /* correct current pos */ + next_free_pos= offset + row_length; + + if (unlikely(row_length < min_row_length) && row_length) + { + /* + This can only happen in the case we compacted transid and + the row become 'too short' + + Move the current row down to it's right place and extend it + with 0. + */ + uint row_diff= min_row_length - row_length; + uint length= (next_free_pos - start_of_found_block); + + DBUG_ASSERT(page_pos != start_of_found_block); + bmove(buff + page_pos, buff + start_of_found_block, length); + bzero(buff+ page_pos + length, row_diff); + page_pos+= min_row_length; + int2store(end+2, min_row_length); + freed_size-= row_diff; + next_free_pos= start_of_found_block= page_pos; + diff= 0; + } + } + } + if (page_pos != start_of_found_block) + { + uint length= (next_free_pos - start_of_found_block); + memmove(buff + page_pos, buff + start_of_found_block, length); + } + start_of_found_block= uint2korr(dir); + + if (rownr != max_entry - 1) + { + /* Move all entries after rownr to end of page */ + uint rownr_length; + + DBUG_ASSERT(extend_block); /* Should always be true */ + next_free_pos= end_of_found_block= page_pos= + block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE; + diff= 0; + /* End points to entry before 'rownr' */ + for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE) + { + uint offset= uint2korr(dir); + uint row_length; + uint row_end; + if (!offset) + continue; + row_length= uint2korr(dir + 2); + row_end= offset + row_length; + DBUG_ASSERT(offset >= start_of_found_block && + row_end <= next_free_pos && row_length >= min_row_length); + + if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID)) + { + TrID transid= transid_korr(buff + offset+1); + if (transid < min_read_from) + { + /* Remove transid from row */ 
+ buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID; + offset+= TRANSID_SIZE; + row_length-= TRANSID_SIZE; + int2store(dir+2, row_length); + } + if (unlikely(row_length < min_row_length)) + { + /* + This can only happen in the case we compacted transid and + the row become 'too short' + */ + uint row_diff= min_row_length - row_length; + if (next_free_pos < row_end + row_diff) + { + /* + Not enough space for extending next block with enough + end 0's. Move current data down to get place for them + */ + uint move_down= row_diff - (next_free_pos - row_end); + bmove(buff + offset - move_down, buff + offset, row_length); + offset-= move_down; + } + /* + Extend the next block with 0, which will be part of current + row when the blocks are joined together later + */ + bzero(buff + next_free_pos - row_diff, row_diff); + next_free_pos-= row_diff; + int2store(dir+2, min_row_length); + } + row_end= offset + row_length; + } + + if (row_end != next_free_pos) + { + uint length= (end_of_found_block - next_free_pos); + if (page_pos != end_of_found_block) + { + /* move next block down */ + memmove(buff + page_pos - length, buff + next_free_pos, length); + } + page_pos-= length; + /* next continuous block starts here */ + end_of_found_block= row_end; + diff= page_pos - row_end; + } + int2store(dir, offset + diff); /* correct current pos */ + next_free_pos= offset; + } + if (page_pos != end_of_found_block) + { + uint length= (end_of_found_block - next_free_pos); + memmove(buff + page_pos - length, buff + next_free_pos, length); + next_free_pos= page_pos- length; + } + + /* Extend rownr block to cover hole */ + rownr_length= next_free_pos - start_of_found_block; + int2store(dir+2, rownr_length); + DBUG_ASSERT(rownr_length >= min_row_length); + } + else + { + if (extend_block) + { + /* Extend last block to cover whole page */ + uint length= ((uint) (dir - buff) - start_of_found_block); + int2store(dir+2, length); + DBUG_ASSERT(length >= min_row_length); + } + else + { + /* 
Add length gained from freed transaction id's to this page */ + uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size; + int2store(buff + EMPTY_SPACE_OFFSET, length); + } + buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED; + } + check_directory(share, buff, block_size, min_row_length, + extend_block ? 0 : (uint) -1); + DBUG_EXECUTE("directory", _ma_print_directory(share, + DBUG_FILE, buff, block_size);); + DBUG_VOID_RETURN; +} + + +/* + Create an empty tail or head page + + SYNOPSIS + make_empty_page() + buff Page buffer + block_size Block size + page_type HEAD_PAGE or TAIL_PAGE + create_dir_entry TRUE if we should create a directory entry + + NOTES + EMPTY_SPACE is not updated +*/ + +static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type, + my_bool create_dir_entry) +{ + uint block_size= info->s->block_size; + DBUG_ENTER("make_empty_page"); + + bzero(buff, PAGE_HEADER_SIZE(info->s)); + +#if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_valgrind) + /* + We zero the rest of the block to avoid getting old memory information + to disk and to allow the file to be compressed better if archived. + The code does not assume the block is zeroed. 
+ */ + if (page_type != BLOB_PAGE) + bzero(buff+ PAGE_HEADER_SIZE(info->s), + block_size - PAGE_HEADER_SIZE(info->s)); +#endif + buff[PAGE_TYPE_OFFSET]= (uchar) page_type; + buff[DIR_COUNT_OFFSET]= (int) create_dir_entry; + buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST; + if (create_dir_entry) + { + /* Create directory entry to point to start of page with size 0 */ + buff+= block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE; + int2store(buff, PAGE_HEADER_SIZE(info->s)); + int2store(buff+2, 0); + } + DBUG_VOID_RETURN; +} + + +/* + Read or initialize new head or tail page + + SYNOPSIS + get_head_or_tail_page() + info Maria handler + block Block to read + buff Suggest this buffer to key cache + length Minimum space needed + page_type HEAD_PAGE || TAIL_PAGE + res Store result position here + + NOTES + We don't decremented buff[EMPTY_SPACE_OFFSET] with the allocated data + as we don't know how much data the caller will actually use. + + res->empty_space is set to length of empty space + + RETURN + 0 ok All slots in 'res' are updated + 1 error my_errno is set +*/ + +struct st_row_pos_info +{ + uchar *buff; /* page buffer */ + uchar *data; /* Place for data */ + uchar *dir; /* Directory */ + uint length; /* Length for data */ + uint rownr; /* Offset in directory */ + uint empty_space; /* Space left on page */ +}; + + +static my_bool get_head_or_tail_page(MARIA_HA *info, + const MARIA_BITMAP_BLOCK *block, + uchar *buff, uint length, uint page_type, + enum pagecache_page_lock lock, + struct st_row_pos_info *res) +{ + uint block_size; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + DBUG_ENTER("get_head_or_tail_page"); + DBUG_PRINT("enter", ("page_type: %u length: %u", page_type, length)); + + block_size= share->block_size; + if (block->org_bitmap_value == 0) /* Empty block */ + { + /* New page */ + make_empty_page(info, buff, page_type, 1); + res->buff= buff; + res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE(share)); + res->data= (buff + 
PAGE_HEADER_SIZE(share)); + res->dir= res->data + res->length; + res->rownr= 0; + DBUG_ASSERT(length <= res->length); + } + else + { + uchar *dir; + /* Read old page */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + res->buff= pagecache_read(share->pagecache, &info->dfile, + block->page, 0, 0, share->page_type, + lock, &page_link.link); + page_link.changed= res->buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!page_link.changed) + { + _ma_set_fatal_error(info, my_errno); + DBUG_RETURN(1); + } + + DBUG_ASSERT((uint) (res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == + page_type); + if (!(dir= find_free_position(info, res->buff, block_size, &res->rownr, + &res->length, &res->empty_space, + page_type == HEAD_PAGE))) + goto crashed; + + if (res->length < length) + { + if (res->empty_space + res->length >= length) + { + _ma_compact_block_page(share, + res->buff, res->rownr, 1, + (page_type == HEAD_PAGE ? + info->trn->min_read_from : 0), + (page_type == HEAD_PAGE ? 
+ share->base.min_block_length : + 0)); + /* All empty space is now after the current position */ + dir= dir_entry_pos(res->buff, block_size, res->rownr); + res->length= res->empty_space= uint2korr(dir+2); + } + if (res->length < length) + { + DBUG_PRINT("error", ("length: %u res->length: %u empty_space: %u", + length, res->length, res->empty_space)); + goto crashed; /* Wrong bitmap information */ + } + } + res->dir= dir; + res->data= res->buff + uint2korr(dir); + } + DBUG_RETURN(0); + +crashed: + DBUG_ASSERT(!maria_assert_if_crashed_table); + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); /* File crashed */ + DBUG_RETURN(1); +} + + +/* + @brief Create room for a head or tail row on a given page at given position + + @fn get_rowpos_in_head_or_tail_page() + @param info Maria handler + @param block Block to read + @param buff Suggest this buffer to key cache + @param length Minimum space needed + @param page_type HEAD_PAGE || TAIL_PAGE + @param rownr Rownr to use + @param res Store result position here + + @note + This is essentially the same as get_head_or_tail_page, with the difference + that the caller specifies at what position the row should be put. 
+ This is used when restoring a row to its original position as + part of UNDO DELETE or UNDO UPDATE + + @return + @retval 0 ok All slots in 'res' are updated + @retval 1 error my_errno is set +*/ + +static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info, + const MARIA_BITMAP_BLOCK *block, + uchar *buff, uint length, + uint page_type, + enum pagecache_page_lock lock, + uint rownr, + struct st_row_pos_info *res) +{ + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + uchar *dir; + uint block_size= share->block_size; + uint max_entry, max_length, rec_offset; + DBUG_ENTER("get_rowpos_in_head_or_tail_page"); + + if (block->org_bitmap_value == 0) /* Empty block */ + { + /* New page */ + make_empty_page(info, buff, page_type, 0); + res->empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE; + } + else + { + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + buff= pagecache_read(share->pagecache, &info->dfile, + block->page, 0, 0, share->page_type, + lock, &page_link.link); + page_link.changed= buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!page_link.changed) /* Read error */ + { + _ma_set_fatal_error(info, my_errno); + DBUG_RETURN(1); + } + DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == + (uchar) page_type); + if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type) + goto err; + res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + } + + max_entry= (uint) buff[DIR_COUNT_OFFSET]; + if (max_entry <= rownr) + { + if (extend_directory(info, buff, block_size, + max_entry, rownr, &res->empty_space, + page_type == HEAD_PAGE)) + goto err; + } + + /* + The following dir entry is unused in case of insert / update but + not in case of undo_update / undo_delete + */ + dir= dir_entry_pos(buff, block_size, rownr); + + if (extend_area_on_page(info, buff, dir, rownr, length, + &res->empty_space, &rec_offset, &max_length, + page_type == HEAD_PAGE)) + goto err; + + res->buff= buff; + res->rownr= rownr; + 
res->dir= dir; + res->data= buff + rec_offset; + res->length= length; + DBUG_RETURN(0); + +err: + DBUG_ASSERT(!maria_assert_if_crashed_table); + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); /* File crashed */ + DBUG_RETURN(1); +} + + +/* + Write tail for head data or blob + + SYNOPSIS + write_tail() + info Maria handler + block Block to tail page + row_part Data to write to page + length Length of data + + NOTES + block->page_count is updated to the directory offset for the tail + so that we can store the position in the row extent information + + RETURN + 0 ok + block->page_count is set to point (dir entry + TAIL_BIT) + + 1 error; In this case my_errno is set to the error +*/ + +static my_bool write_tail(MARIA_HA *info, + MARIA_BITMAP_BLOCK *block, + uchar *row_part, uint org_length) +{ + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE page_link; + uint block_size= share->block_size, empty_space, length= org_length; + struct st_row_pos_info row_pos; + my_off_t position; + my_bool res, block_is_read; + DBUG_ENTER("write_tail"); + DBUG_PRINT("enter", ("page: %lu length: %u", + (ulong) block->page, length)); + + info->keyread_buff_used= 1; + /* + Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows + some place to grow in the future) + */ + if (length < MIN_TAIL_SIZE) + length= MIN_TAIL_SIZE; + + if (block->page_count == TAIL_PAGE_COUNT_MARKER) + { + /* + Create new tail + page will be pinned & locked by get_head_or_tail_page + */ + if (get_head_or_tail_page(info, block, info->keyread_buff, length, + TAIL_PAGE, PAGECACHE_LOCK_WRITE, + &row_pos)) + DBUG_RETURN(1); + } + else + { + /* Write tail on predefined row position */ + if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff, + length, TAIL_PAGE, + PAGECACHE_LOCK_WRITE, + block->page_count & ~TAIL_BIT, + &row_pos)) + DBUG_RETURN(1); + } + DBUG_PRINT("info", ("tailid: %lu (%lu:%u)", + (ulong) ma_recordpos(block->page, row_pos.rownr), + (ulong) block->page, row_pos.rownr)); + + 
block_is_read= block->org_bitmap_value != 0; + + memcpy(row_pos.data, row_part, org_length); + + if (share->now_transactional) + { + /* Log changes in tail block */ + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + LSN lsn; + + /* + Log REDO changes of tail page + Note that we have to log length, not org_length, to be sure that + REDO, which doesn't use write_tail, also creates a block of at least + MIN_TAIL_SIZE + */ + page_store(log_data + FILEID_STORE_SIZE, block->page); + dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + row_pos.rownr); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos.data; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length; + if (translog_write_record(&lsn, + (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL : + LOGREC_REDO_NEW_ROW_TAIL), + info->trn, info, + (translog_size_t) (sizeof(log_data) + length), + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL)) + DBUG_RETURN(1); + } + + int2store(row_pos.dir + 2, length); + empty_space= row_pos.empty_space - length; + int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space); + block->page_count= row_pos.rownr + TAIL_BIT; + /* + If there is less directory entries free than number of possible tails + we can write for a row, we mark the page full to ensure that we don't + during _ma_bitmap_find_place() allocate more entries on the tail page + than it can hold + */ + block->empty_space= (enough_free_entries(row_pos.buff, share->block_size, + 1 + share->base.blobs) ? 
+ empty_space : 0); + /* Keep BLOCKUSED_USE_ORG_BITMAP */ + block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL; + + if (block_is_read) + { + /* Current page link is last element in pinned_pages */ + MARIA_PINNED_PAGE *page_link; + page_link= dynamic_element(&info->pinned_pages, + info->pinned_pages.elements-1, + MARIA_PINNED_PAGE*); + pagecache_unlock_by_link(share->pagecache, page_link->link, + PAGECACHE_LOCK_WRITE_TO_READ, + PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 1, FALSE); + DBUG_ASSERT(page_link->changed); + page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK; + res= 0; + } + else + { + if (!(res= pagecache_write(share->pagecache, + &info->dfile, block->page, 0, + row_pos.buff,share->page_type, + PAGECACHE_LOCK_READ, + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE))) + { + DBUG_ASSERT(page_link.link); + page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + } + + /* Increase data file size, if extended */ + position= (my_off_t) block->page * block_size; + if (share->state.state.data_file_length <= position) + { + /* + We are modifying a state member before writing the UNDO; this is a WAL + violation. But for data_file_length this is ok, as long as we change + data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see + collect_tables()). + */ + _ma_set_share_data_file_length(share, position + block_size); + } + } + DBUG_RETURN(res); +} + + +/* + Write full pages + + SYNOPSIS + write_full_pages() + info Maria handler + lsn LSN for the undo record + block Where to write data + data Data to write + length Length of data + + NOTES + Logging of the changes to the full pages are done in the caller + write_block_record(). 
+ + RETURN + 0 ok + 1 error on write +*/ + +static my_bool write_full_pages(MARIA_HA *info, + LSN lsn, + MARIA_BITMAP_BLOCK *block, + uchar *data, ulong length) +{ + pgcache_page_no_t page; + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; + uint data_size= FULL_PAGE_SIZE(share); + uchar *buff= info->keyread_buff; + uint page_count, sub_blocks; + my_off_t position, max_position; + DBUG_ENTER("write_full_pages"); + DBUG_PRINT("enter", ("length: %lu page: %lu page_count: %lu", + (ulong) length, (ulong) block->page, + (ulong) block->page_count)); + DBUG_ASSERT((block->page_count & TAIL_BIT) == 0); + + info->keyread_buff_used= 1; + page= block->page; + page_count= block->page_count; + sub_blocks= block->sub_blocks; + + max_position= (my_off_t) (page + page_count) * block_size; + + /* Increase data file size, if extended */ + + for (; length; data+= data_size) + { + uint copy_length; + if (!page_count--) + { + if (!--sub_blocks) + { + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + DBUG_RETURN(1); + } + + block++; + page= block->page; + page_count= block->page_count - 1; + DBUG_PRINT("info", ("page: %lu page_count: %lu", + (ulong) block->page, (ulong) block->page_count)); + + position= (page + page_count + 1) * block_size; + set_if_bigger(max_position, position); + } + lsn_store(buff, lsn); + buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE; + bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE, + FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE)); + copy_length= MY_MIN(data_size, length); + memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, copy_length); + length-= copy_length; + + /* + Zero out old information from the block. This removes possible + sensitive information from the block and also makes the file + easier to compress and easier to compare after recovery. 
+ */ + if (copy_length != data_size) + bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length), + (data_size - copy_length) + PAGE_SUFFIX_SIZE); + + if (pagecache_write(share->pagecache, + &info->dfile, page, 0, + buff, share->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, info->trn->rec_lsn)) + DBUG_RETURN(1); + page++; + DBUG_ASSERT(block->used & BLOCKUSED_USED); + } + if (share->state.state.data_file_length < max_position) + _ma_set_share_data_file_length(share, max_position); + DBUG_RETURN(0); +} + + +/* + Store ranges of full pages in compact format for logging + + SYNOPSIS + store_page_range() + to Store data here + block Where pages are to be written + length Length of data to be written + Normally this is full pages, except for the last + tail block that may only partly fit the last page. + tot_ranges Add here the number of ranges used + + NOTES + The format of one entry is: + + Ranges SUB_RANGE_SIZE + Empty bytes at end of last byte BLOCK_FILLER_SIZE + For each range + Page number PAGE_STORE_SIZE + Number of pages PAGERANGE_STORE_SIZE + + RETURN + # end position for 'to' +*/ + +static uchar *store_page_range(MARIA_SHARE *share, + uchar *to, MARIA_BITMAP_BLOCK *block, + ulong length, + uint *tot_ranges) +{ + uint data_size= FULL_PAGE_SIZE(share); + ulong pages_left= (length + data_size -1) / data_size; + uint page_count, ranges, empty_space; + uchar *to_start; + DBUG_ENTER("store_page_range"); + + to_start= to; + to+= SUB_RANGE_SIZE; + + /* Store number of unused bytes at last page */ + empty_space= (uint) (pages_left * data_size - length); + int2store(to, empty_space); + to+= BLOCK_FILLER_SIZE; + + ranges= 0; + do + { + pgcache_page_no_t page; + page= block->page; + page_count= block->page_count; + block++; + if (page_count > pages_left) + page_count= pages_left; + + page_store(to, page); + to+= PAGE_STORE_SIZE; + pagerange_store(to, page_count); + to+= PAGERANGE_STORE_SIZE; + 
ranges++; + } while ((pages_left-= page_count)); + /* Store number of ranges for this block */ + int2store(to_start, ranges); + (*tot_ranges)+= ranges; + + DBUG_RETURN(to); +} + + +/* + Store packed extent data + + SYNOPSIS + store_extent_info() + to Store first packed data here + row_extents_second_part Store rest here + first_block First block to store + count Number of blocks + + NOTES + We don't have to store the position for the head block + + We have to set the START_EXTENT_BIT for every extent where the + blob will be stored on a page of it's own. We need this in the + UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and + undo-update. +*/ + +static void store_extent_info(uchar *to, + uchar *row_extents_second_part, + MARIA_BITMAP_BLOCK *first_block, + uint count) +{ + MARIA_BITMAP_BLOCK *block, *end_block; + uint copy_length; + my_bool first_found= 0; + DBUG_ENTER("store_extent_info"); + DBUG_PRINT("enter", ("count: %u", count)); + + for (block= first_block, end_block= first_block+count ; + block < end_block; block++) + { + /* The following is only false for marker (unused) blocks */ + if (likely(block->used & BLOCKUSED_USED)) + { + uint page_count= block->page_count; + DBUG_ASSERT(page_count != 0); + page_store(to, block->page); + if (block->sub_blocks) + { + /* + Set a bit so that we later know that this was the first block + for a blob + */ + page_count|= START_EXTENT_BIT; + } + pagerange_store(to + PAGE_STORE_SIZE, page_count); + DBUG_DUMP("extent", to, ROW_EXTENT_SIZE); + to+= ROW_EXTENT_SIZE; + if (!first_found) + { + first_found= 1; + to= row_extents_second_part; + } + } + } + copy_length= (count - 1) * ROW_EXTENT_SIZE; + /* + In some unlikely cases we have allocated to many blocks. Clear this + data. 
+ */ + bzero(to, (size_t) (row_extents_second_part + copy_length - to)); + DBUG_VOID_RETURN; +} + + +/** + @brief + Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable + for write_block_record + + @note + In case of blobs, this function marks all the blob pages in the bitmap + as full pages. The bitmap bits for other pages will be marked + when write_block_record() calls _ma_bitmap_release_unused(). + + This function will be removed in Maria 2.0 when we instead of delete rows + mark them as deleted and only remove them after commit. + + @return + @retval 0 ok + @retval 1 Error (out of memory or disk error changing bitmap) or + wrong information in extent information +*/ + +static my_bool extent_to_bitmap_blocks(MARIA_HA *info, + MARIA_BITMAP_BLOCKS *blocks, + pgcache_page_no_t head_page, + uint extent_count, + const uchar *extent_info) +{ + MARIA_BITMAP_BLOCK *block, *start_block; + MARIA_SHARE *share= info->s; + uint i, tail_page; + DBUG_ENTER("extent_to_bitmap_blocks"); + + if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2)) + DBUG_RETURN(1); + block= blocks->block= dynamic_element(&info->bitmap_blocks, 0, + MARIA_BITMAP_BLOCK*); + blocks->count= extent_count + 1; + blocks->tail_page_skipped= blocks->page_skipped= 0; + block->page= head_page; + block->page_count= 1; + block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP; + /* Impossible value, will force storage of real value */ + block->org_bitmap_value= 255; + + start_block= block++; + for (i=0 ; + i++ < extent_count ; + block++, extent_info+= ROW_EXTENT_SIZE) + { + uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE); + if (page_count & START_EXTENT_BIT) + { + page_count&= ~START_EXTENT_BIT; + start_block->sub_blocks= (uint) (block - start_block); + start_block= block; + } + block->page= page_korr(extent_info); + block->page_count= page_count; + block->sub_blocks= 0; + if (block->page_count == 0) + { + /* Extend allocated but not used by write_block_record() */ + 
DBUG_ASSERT(block->page == 0); + /* This is the last block */ + blocks->count= i; + break; + } + if ((tail_page= page_count & TAIL_BIT)) + page_count= 1; + + /* Check if wrong data */ + if (block->page == 0 || page_count == 0 || + (block->page + page_count) * share->block_size > + share->state.state.data_file_length) + { + DBUG_PRINT("error", ("page: %lu page_count: %u tail: %u length: %ld data_length: %ld", + (ulong) block->page, + (block->page_count & ~TAIL_BIT), + (uint) MY_TEST(block->page_count & TAIL_BIT), + (ulong) ((block->page + (page_count & ~TAIL_BIT)) * + share->block_size), + (ulong) share->state.state.data_file_length)); + DBUG_RETURN(1); + } + if (tail_page) + { + block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap, + block->page); + block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED | + BLOCKUSED_USE_ORG_BITMAP); + } + else + { + my_bool res; + mysql_mutex_lock(&share->bitmap.bitmap_lock); + res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, + block->page, page_count); + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + if (res) + DBUG_RETURN(1); + block->used= BLOCKUSED_USED; + } + } + start_block->sub_blocks= (uint) (block - start_block); + DBUG_RETURN(0); +} + + +/* + Free regions of pages with logging + + NOTES + We are removing filler events and tail page events from + row->extents to get smaller log. 
+ + RETURN + 0 ok + 1 error +*/ + +static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row) +{ + uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + LSN lsn; + size_t extents_length; + uchar *extents= row->extents; + DBUG_ENTER("free_full_pages"); + + if (info->s->now_transactional) + { + /* Compact events by removing filler and tail events */ + uchar *new_block= 0; + uchar *end, *to, *compact_extent_info; + my_bool res, buff_alloced; + uint extents_count; + + alloc_on_stack(*info->stack_end_ptr, compact_extent_info, buff_alloced, + row->extents_count * ROW_EXTENT_SIZE); + if (!compact_extent_info) + DBUG_RETURN(1); + + to= compact_extent_info; + for (end= extents + row->extents_count * ROW_EXTENT_SIZE ; + extents < end ; + extents+= ROW_EXTENT_SIZE) + { + uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE); + page_count&= ~START_EXTENT_BIT; + if (! (page_count & TAIL_BIT) && page_count != 0) + { + /* Found correct extent */ + if (!new_block) + new_block= extents; /* First extent in range */ + continue; + } + /* Found extent to remove, copy everything found so far */ + if (new_block) + { + size_t length= (size_t) (extents - new_block); + memcpy(to, new_block, length); + to+= length; + new_block= 0; + } + } + if (new_block) + { + size_t length= (size_t) (extents - new_block); + memcpy(to, new_block, length); + to+= length; + } + + if (!unlikely(extents_length= (uint) (to - compact_extent_info))) + { + /* + No ranges. This happens in the rear case when we have a allocated + place for a blob on a tail page but it did fit into the main page. 
+ */ + stack_alloc_free(compact_extent_info, buff_alloced); + DBUG_RETURN(0); + } + extents_count= (uint) (extents_length / ROW_EXTENT_SIZE); + pagerange_store(log_data + FILEID_STORE_SIZE, extents_count); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= compact_extent_info; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length; + res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn, + info, + (translog_size_t) (sizeof(log_data) + + extents_length), + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL); + stack_alloc_free(compact_extent_info, buff_alloced); + if (res) + DBUG_RETURN(1); + } + + DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents, + row->extents_count)); +} + + +/* + Free one page range + + NOTES + This is very similar to free_full_pages() + + RETURN + 0 ok + 1 error +*/ + +static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page, + uint count) +{ + my_bool res= 0; + uint delete_count; + MARIA_SHARE *share= info->s; + DBUG_ENTER("free_full_page_range"); + + delete_count= count; + if (share->state.state.data_file_length == + (page + count) * share->block_size) + { + /* + Don't delete last page from pagecache as this will make the file + shorter than expected if the last operation extended the file + */ + delete_count--; + } + if (delete_count && + pagecache_delete_pages(share->pagecache, &info->dfile, + page, delete_count, PAGECACHE_LOCK_WRITE, 1)) + res= 1; + + if (share->now_transactional) + { + LSN lsn; + /** @todo unify log_data's shape with delete_head_or_tail() */ + uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE + + ROW_EXTENT_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + DBUG_ASSERT(info->trn->rec_lsn); + pagerange_store(log_data + FILEID_STORE_SIZE, 1); + page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, + page); + 
int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE + + PAGE_STORE_SIZE, count); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, + info->trn, info, + (translog_size_t) sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL)) + res= 1; + } + mysql_mutex_lock(&share->bitmap.bitmap_lock); + if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count)) + res= 1; + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + DBUG_RETURN(res); +} + + +/** + @brief Write a record to a (set of) pages + + @fn write_block_record() + @param info Maria handler + @param old_record Original record in case of update; NULL in case of + insert + @param record Record we should write + @param row Statistics about record (calculated by + calc_record_size()) + @param bitmap_blocks On which pages the record should be stored + @param head_block_is_read 1 if head block existed. 0 if new block. + @param row_pos Position on head page where to put head part of + record + @param undo_lsn <> LSN_ERROR if we are executing an UNDO + @param old_record_checksum Checksum of old_record: ignored if table does + not have live checksum; otherwise if + old_record==NULL it must be 0. + + @note + On return all pinned pages are released. 
+ + [page_buff + EMPTY_SPACE_OFFSET] is set to + row_pos->empty_space - head_length + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool write_block_record(MARIA_HA *info, + const uchar *old_record, + const uchar *record, + MARIA_ROW *row, + MARIA_BITMAP_BLOCKS *bitmap_blocks, + my_bool head_block_is_read, + struct st_row_pos_info *row_pos, + LSN undo_lsn, + ha_checksum old_record_checksum) +{ + uchar *data, *end_of_data, *tmp_data_used, *tmp_data; + uchar *UNINIT_VAR(row_extents_first_part), *UNINIT_VAR(row_extents_second_part); + uchar *field_length_data; + uchar *page_buff; + MARIA_BITMAP_BLOCK *block, *head_block; + MARIA_SHARE *share= info->s; + MARIA_COLUMNDEF *column, *end_column; + MARIA_PINNED_PAGE page_link; + uint block_size, flag, head_length; + ulong *blob_lengths; + my_bool row_extents_in_use, blob_full_pages_exists; + LSN lsn; + my_off_t position; + uint save_my_errno; + myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0); + DBUG_ENTER("write_block_record"); + + head_block= bitmap_blocks->block; + block_size= share->block_size; + + page_buff= row_pos->buff; + /* Position on head page where we should store the head part */ + data= row_pos->data; + end_of_data= data + row_pos->length; + + /* Write header */ + flag= info->row_flag; + row_extents_in_use= 0; + if (unlikely(row->total_length > row_pos->length)) + { + /* Need extent */ + DBUG_ASSERT(bitmap_blocks->count > 1); + if (bitmap_blocks->count <= 1) + goto crashed; /* Wrong in bitmap */ + flag|= ROW_FLAG_EXTENTS; + row_extents_in_use= 1; + } + /* For now we have only a minimum header */ + *data++= (uchar) flag; + if (flag & ROW_FLAG_TRANSID) + { + transid_store(data, info->trn->trid); + data+= TRANSID_SIZE; + } + + if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED)) + *data++= (uchar) (share->base.null_bytes - + share->base.original_null_bytes); + if (row_extents_in_use) + { + /* Store first extent in header */ + store_key_length_inc(data, bitmap_blocks->count 
- 1); + row_extents_first_part= data; + data+= ROW_EXTENT_SIZE; + } + if (share->base.max_field_lengths) + store_key_length_inc(data, row->field_lengths_length); + if (share->calc_checksum) + { + *(data++)= (uchar) (row->checksum); /* store least significant byte */ + DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL))); + } + memcpy(data, record, share->base.null_bytes); + data+= share->base.null_bytes; + memcpy(data, row->empty_bits, share->base.pack_bytes); + data+= share->base.pack_bytes; + + DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR || + (uint) (data - row_pos->data) == row->min_length); + + /* + Allocate a buffer of rest of data (except blobs) + + To avoid double copying of data, we copy as many columns that fits into + the page. The rest goes into info->packed_row. + + Using an extra buffer, instead of doing continuous writes to different + pages, uses less code and we don't need to have to do a complex call + for every data segment we want to store. + */ + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + row->head_length, myflag)) + DBUG_RETURN(1); + + tmp_data_used= 0; /* Either 0 or last used uchar in 'data' */ + tmp_data= data; + + if (row_extents_in_use) + { + uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE; + if (!tmp_data_used && tmp_data + copy_length > end_of_data) + { + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + row_extents_second_part= tmp_data; + /* + We will copy the extents here when we have figured out the tail + positions. 
+ */ + tmp_data+= copy_length; + } + + /* Copy fields that has fixed lengths (primary key etc) */ + for (column= share->columndef, + end_column= column + share->base.fixed_not_null_fields; + column < end_column; column++) + { + if (!tmp_data_used && tmp_data + column->length > end_of_data) + { + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + memcpy(tmp_data, record + column->offset, column->length); + tmp_data+= column->length; + } + + /* Copy length of data for variable length fields */ + if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data) + { + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + field_length_data= row->field_lengths; + memcpy(tmp_data, field_length_data, row->field_lengths_length); + tmp_data+= row->field_lengths_length; + + DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR || + (uint) (tmp_data - row_pos->data) == row->min_length + + share->base.fixed_not_null_fields_length + + row->field_lengths_length); + + /* Copy variable length fields and fields with null/zero */ + for (end_column= share->columndef + share->base.fields - share->base.blobs; + column < end_column ; + column++) + { + const uchar *field_pos; + ulong length; + if ((record[column->null_pos] & column->null_bit) || + (column->empty_bit && + (row->empty_bits[column->empty_pos] & column->empty_bit))) + continue; + + field_pos= record + column->offset; + switch (column->type) { + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_SKIP_PRESPACE: + case FIELD_SKIP_ZERO: /* Fixed length field */ + length= column->length; + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + /* Char that is space filled */ + if (column->length <= 255) + length= (uint) (uchar) *field_length_data++; + else + { + length= uint2korr(field_length_data); + field_length_data+= 2; + } + break; + case FIELD_VARCHAR: + if (column->length <= 256) + { + length= (uint) (uchar) *field_length_data++; + field_pos++; /* Skip length uchar */ + } + else + { + length= 
uint2korr(field_length_data); + field_length_data+= 2; + field_pos+= 2; + } + DBUG_ASSERT(length <= column->length); + break; + default: /* Wrong data */ + DBUG_ASSERT(!maria_assert_if_crashed_table); + length=0; + break; + } + if (!tmp_data_used && tmp_data + length > end_of_data) + { + /* Data didn't fit in page; Change to use tmp buffer */ + tmp_data_used= tmp_data; + tmp_data= info->rec_buff; + } + memcpy((char*) tmp_data, field_pos, length); + tmp_data+= length; + } + + block= head_block + head_block->sub_blocks; /* Point to first blob data */ + + end_column= column + share->base.blobs; + blob_lengths= row->blob_lengths; + if (!tmp_data_used) + { + /* Still room on page; Copy as many blobs we can into this page */ + data= tmp_data; + for (; column < end_column && + *blob_lengths <= (ulong)(end_of_data - data); + column++, blob_lengths++) + { + uchar *tmp_pos; + uint length; + if (!*blob_lengths) /* Null or "" */ + continue; + length= column->length - portable_sizeof_char_ptr; + memcpy(&tmp_pos, record + column->offset + length, sizeof(char*)); + memcpy(data, tmp_pos, *blob_lengths); + data+= *blob_lengths; + /* + The following is not true when we want to insert data into original + place. In this case we don't have any extra blocks allocated + */ + if (likely(undo_lsn == LSN_ERROR)) + { + /* Skip over tail page that was prepared for storing blob */ + block++; + bitmap_blocks->tail_page_skipped= 1; + } + } + if (head_block->sub_blocks > 1) + { + /* We have allocated pages that where not used */ + bitmap_blocks->page_skipped= 1; + } + } + else + data= tmp_data_used; /* Get last used on page */ + + /* Update page directory */ + head_length= (uint) (data - row_pos->data); + DBUG_PRINT("info", ("Used head length on page: %u header_length: %u", + head_length, + (uint) (flag & ROW_FLAG_TRANSID ? 
TRANSID_SIZE : 0))); + if (head_length < share->base.min_block_length) + { + /* Extend row to be of size min_block_length */ + uint diff_length= share->base.min_block_length - head_length; + bzero(data, diff_length); + data+= diff_length; + head_length= share->base.min_block_length; + } + DBUG_ASSERT(data <= end_of_data); + /* + If this is a redo entry (ie, undo_lsn != LSN_ERROR) then we should have + written exactly head_length bytes (same as original record). + */ + DBUG_ASSERT(undo_lsn == LSN_ERROR || head_length == row_pos->length); + int2store(row_pos->dir + 2, head_length); + /* update empty space at start of block */ + row_pos->empty_space-= head_length; + int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space); + /* Mark in bitmaps how the current page was actually used */ + head_block->empty_space= row_pos->empty_space; + if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE && + page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST) + head_block->empty_space= 0; /* Page is full */ + head_block->used|= BLOCKUSED_USED; + + check_directory(share, + page_buff, share->block_size, share->base.min_block_length, + (uint) -1); + + /* + Now we have to write tail pages, as we need to store the position + to them in the row extent header. + + We first write out all blob tails, to be able to store them in + the current page or 'tmp_data'. + + Then we write the tail of the non-blob fields (The position to the + tail page is stored either in row header, the extents in the head + page or in the first full page of the non-blob data. 
It's never in + the tail page of the non-blob data) + */ + + blob_full_pages_exists= 0; + if (row_extents_in_use) + { + if (column != end_column) /* If blob fields */ + { + MARIA_COLUMNDEF *save_column= column; + MARIA_BITMAP_BLOCK *save_block= block; + MARIA_BITMAP_BLOCK *end_block; + ulong *save_blob_lengths= blob_lengths; + + for (; column < end_column; column++, blob_lengths++) + { + uchar *blob_pos; + if (!*blob_lengths) /* Null or "" */ + continue; + if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL) + { + uint length; + length= column->length - portable_sizeof_char_ptr; + memcpy(&blob_pos, record + column->offset + length, sizeof(char*)); + length= *blob_lengths % FULL_PAGE_SIZE(share); /* tail size */ + if (length != *blob_lengths) + blob_full_pages_exists= 1; + if (write_tail(info, block + block->sub_blocks-1, + blob_pos + *blob_lengths - length, + length)) + goto disk_err; + } + else + blob_full_pages_exists= 1; + + for (end_block= block + block->sub_blocks; block < end_block; block++) + { + /* + Set only a bit, to not cause bitmap code to believe a block is full + when there is still a lot of entries in it. + */ + block->used|= BLOCKUSED_USED; + } + } + DBUG_ASSERT((undo_lsn == LSN_ERROR || + block == bitmap_blocks->block + bitmap_blocks->count)); + column= save_column; + block= save_block; + blob_lengths= save_blob_lengths; + } + + if (tmp_data_used) /* non blob data overflows */ + { + MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block; + MARIA_BITMAP_BLOCK *head_tail_block= 0; + ulong length; + ulong data_length= (ulong) (tmp_data - info->rec_buff); + +#ifdef SANITY_CHECKS + DBUG_ASSERT(head_block->sub_blocks != 1); + if (head_block->sub_blocks == 1) + goto crashed; /* no reserved full or tails */ +#endif + /* + Find out where to write tail for non-blob fields. + + Problem here is that the bitmap code may have allocated more + space than we need. We have to handle the following cases: + + - Bitmap code allocated a tail page we don't need. 
+ - The last full page allocated needs to be changed to a tail page + (Because we where able to put more data on the head page than + the bitmap allocation assumed) + + The reserved pages in bitmap_blocks for the main page has one of + the following allocations: + - Full pages, with following blocks: + # * full pages + empty page ; To be used if we change last full to tail page. This + has 'count' = 0. + tail page (optional, if last full page was part full) + - One tail page + */ + + cur_block= head_block + 1; + end_block= head_block + head_block->sub_blocks; + /* + Loop until we have find a block bigger than we need or + we find the empty page block. + */ + while (data_length >= (length= (cur_block->page_count * + FULL_PAGE_SIZE(share))) && + cur_block->page_count) + { +#ifdef SANITY_CHECKS + DBUG_ASSERT(!((cur_block == end_block) || + (cur_block->used & BLOCKUSED_USED))); + if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED)) + goto crashed; +#endif + data_length-= length; + (cur_block++)->used|= BLOCKUSED_USED; + } + last_head_block= cur_block; + if (data_length) + { + if (cur_block->page_count == 0) + { + /* Skip empty filler block */ + cur_block++; + } +#ifdef SANITY_CHECKS + DBUG_ASSERT(!(cur_block >= end_block)); + if ((cur_block >= end_block)) + goto crashed; +#endif + if (cur_block->used & BLOCKUSED_TAIL) + { + DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size)); + /* tail written to tail page */ + cur_block->used|= BLOCKUSED_USED; + head_tail_block= cur_block; + } + else if (data_length > length - MAX_TAIL_SIZE(block_size)) + { + /* tail written to full page */ + cur_block->used|= BLOCKUSED_USED; + if ((cur_block != end_block - 1) && + (end_block[-1].used & BLOCKUSED_TAIL)) + bitmap_blocks->tail_page_skipped= 1; + } + else + { + /* + cur_block is a full block, followed by an empty and optional + tail block. Change cur_block to a tail block or split it + into full blocks and tail blocks. 
+ + TODO: + If there is enough space on the following tail block, use + this instead of creating a new tail block. + */ + DBUG_ASSERT(cur_block[1].page_count == 0); + if (cur_block->page_count == 1) + { + /* convert full block to tail block */ + cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL; + head_tail_block= cur_block; + } + else + { + DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(share)); + DBUG_PRINT("info", ("Splitting blocks into full and tail")); + cur_block[1].page= (cur_block->page + cur_block->page_count - 1); + cur_block[1].page_count= 1; /* Avoid DBUG_ASSERT */ + cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL; + cur_block->page_count--; + cur_block->used|= BLOCKUSED_USED; + last_head_block= head_tail_block= cur_block+1; + } + if (end_block[-1].used & BLOCKUSED_TAIL) + bitmap_blocks->tail_page_skipped= 1; + } + } + else + { + /* Must be an empty or tail page */ + DBUG_ASSERT(cur_block->page_count == 0 || + cur_block->used & BLOCKUSED_TAIL); + if (end_block[-1].used & BLOCKUSED_TAIL) + bitmap_blocks->tail_page_skipped= 1; + } + + /* + Write all extents into page or tmp_data + + Note that we still don't have a correct position for the tail + of the non-blob fields. 
+ */ + store_extent_info(row_extents_first_part, + row_extents_second_part, + head_block+1, bitmap_blocks->count - 1); + if (head_tail_block) + { + ulong block_length= (ulong) (tmp_data - info->rec_buff); + uchar *extent_data; + + length= (uint) (block_length % FULL_PAGE_SIZE(share)); + if (write_tail(info, head_tail_block, + info->rec_buff + block_length - length, + length)) + goto disk_err; + tmp_data-= length; /* Remove the tail */ + if (tmp_data == info->rec_buff) + { + /* We have no full blocks to write for the head part */ + tmp_data_used= 0; + } + + /* Store the tail position for the non-blob fields */ + if (head_tail_block == head_block + 1) + { + /* + We had a head block + tail block, which means that the + tail block is the first extent + */ + extent_data= row_extents_first_part; + } + else + { + /* + We have a head block + some full blocks + tail block + last_head_block is pointing after the last used extent + for the head block. + */ + extent_data= row_extents_second_part + + ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE; + } + /* Write information for tail block in the reserved space */ + page_store(extent_data, head_tail_block->page); + pagerange_store(extent_data + PAGE_STORE_SIZE, + head_tail_block->page_count); + } + } + else + store_extent_info(row_extents_first_part, + row_extents_second_part, + head_block+1, bitmap_blocks->count - 1); + } + + if (share->now_transactional) + { + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + + /* Log REDO changes of head page */ + page_store(log_data + FILEID_STORE_SIZE, head_block->page); + dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + row_pos->rownr); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos->data; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length; + if 
(translog_write_record(&lsn, + head_block_is_read ? + LOGREC_REDO_INSERT_ROW_HEAD : + LOGREC_REDO_NEW_ROW_HEAD, + info->trn, + info, + (translog_size_t) (sizeof(log_data) + + head_length), + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL)) + goto disk_err; + } + +#ifdef RECOVERY_EXTRA_DEBUG + if (info->trn->undo_lsn != LSN_IMPOSSIBLE) + { + /* Stop right after the REDO; testing incomplete log record groups */ + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash", + { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); }); + } +#endif + + if (head_block_is_read) + { + MARIA_PINNED_PAGE *page_link; + /* Head page is always the first pinned page */ + page_link= dynamic_element(&info->pinned_pages, 0, + MARIA_PINNED_PAGE*); + pagecache_unlock_by_link(share->pagecache, page_link->link, + PAGECACHE_LOCK_WRITE_TO_READ, + PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 1, FALSE); + page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK; + page_link->changed= 1; + } + else + { + if (pagecache_write(share->pagecache, + &info->dfile, head_block->page, 0, + page_buff, share->page_type, + head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ : + PAGECACHE_LOCK_READ, + head_block_is_read ? 
PAGECACHE_PIN_LEFT_PINNED : + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE)) + goto disk_err; + DBUG_ASSERT(page_link.link); + page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + + /* Increase data file size, if extended */ + position= (my_off_t) head_block->page * block_size; + if (share->state.state.data_file_length <= position) + _ma_set_share_data_file_length(share, position + block_size); + } + + if (share->now_transactional && (tmp_data_used || blob_full_pages_exists)) + { + /* + Log REDO writes for all full pages (head part and all blobs) + We write all here to be able to generate the UNDO record early + so that we can write the LSN for the UNDO record to all full pages. + */ + uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE + + (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) * + ROW_EXTENTS_ON_STACK]; + uchar *log_data, *log_pos; + LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 + + ROW_EXTENTS_ON_STACK]; + LEX_CUSTRING *log_array_pos, *log_array; + int error; + translog_size_t log_entry_length= 0; + uint ext_length, extents= 0, sub_extents= 0; + + /* If few extents, then allocate things on stack to avoid a malloc call */ + if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK) + { + log_array= tmp_log_array; + log_data= tmp_log_data; + } + else + { + if (!my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME), &log_array, + (uint) ((bitmap_blocks->count + + TRANSLOG_INTERNAL_PARTS + 2) * + sizeof(*log_array)), + &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE + + bitmap_blocks->count * (ROW_EXTENT_SIZE + + BLOCK_FILLER_SIZE + + SUB_RANGE_SIZE), + NullS)) + goto disk_err; + } + log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2; + log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1; + + if (tmp_data_used) + { + /* Full head page */ + translog_size_t block_length= (translog_size_t) (tmp_data - + info->rec_buff); + log_pos= 
store_page_range(share, + log_pos, head_block+1, + (ulong) block_length, &extents); + log_array_pos->str= info->rec_buff; + log_array_pos->length= block_length; + log_entry_length+= block_length; + log_array_pos++; + sub_extents++; + } + if (blob_full_pages_exists) + { + MARIA_COLUMNDEF *tmp_column= column; + ulong *tmp_blob_lengths= blob_lengths; + MARIA_BITMAP_BLOCK *tmp_block= block; + + /* Full blob pages */ + for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++) + { + ulong blob_length; + uint length; + + if (!*tmp_blob_lengths) /* Null or "" */ + continue; + blob_length= *tmp_blob_lengths; + length= tmp_column->length - portable_sizeof_char_ptr; + /* + If last part of blog was on tail page, change blob_length to + reflect this + */ + if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL) + blob_length-= (blob_length % FULL_PAGE_SIZE(share)); + if (blob_length) + { + memcpy((void*) &log_array_pos->str, + record + tmp_column->offset + length, + sizeof(uchar*)); + log_array_pos->length= blob_length; + log_entry_length+= blob_length; + log_array_pos++; + sub_extents++; + + log_pos= store_page_range(share, + log_pos, tmp_block, + blob_length, &extents); + } + tmp_block+= tmp_block->sub_blocks; + } + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + ext_length= (uint) (log_pos - log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length; + pagerange_store(log_data+ FILEID_STORE_SIZE, extents); + pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, + sub_extents); + + log_entry_length+= ext_length; + /* trn->rec_lsn is already set earlier in this function */ + error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS, + info->trn, info, log_entry_length, + (uint) (log_array_pos - log_array), + log_array, log_data, NULL); + if (log_array != tmp_log_array) + my_free(log_array); + if (error) + goto disk_err; + } + + /* Write UNDO or CLR record */ + lsn= LSN_IMPOSSIBLE; + if (share->now_transactional) + { 
+ LEX_CUSTRING *log_array= info->log_row_parts; + + if (undo_lsn != LSN_ERROR) + { + /* + Store if this CLR is about UNDO_DELETE or UNDO_UPDATE; + in the first case, Recovery, when it sees the CLR_END in the + REDO phase, may decrement the records' count. + */ + if (_ma_write_clr(info, undo_lsn, + old_record ? LOGREC_UNDO_ROW_UPDATE : + LOGREC_UNDO_ROW_DELETE, + share->calc_checksum != 0, + row->checksum - old_record_checksum, + &lsn, (void*) 0)) + goto disk_err; + } + else + { + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 + + HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE + + ROW_EXTENT_SIZE]; + uchar *log_pos; + ha_checksum checksum_delta; + + /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share same header */ + lsn_store(log_data, info->trn->undo_lsn); + page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, + head_block->page); + dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE, + row_pos->rownr); + log_pos= (log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE); + store_checksum_in_rec(share, checksum_delta, + row->checksum - old_record_checksum, + log_pos, log_pos); + compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + if (!old_record) + { + /* Store undo_lsn in case we are aborting the insert */ + row->orig_undo_lsn= info->trn->undo_lsn; + /* Write UNDO log record for the INSERT */ + if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + TRANSLOG_INTERNAL_PARTS + 1, + log_array, + log_data + LSN_STORE_SIZE, &checksum_delta)) + goto disk_err; + } + else + { + /* Write UNDO log record for the UPDATE */ + size_t row_length, extents_length; + uint row_parts_count, cur_head_length; + + /* + Write 
head length and extents of the original row so that we + during UNDO can put it back in the original position. + We don't store size for TRANSID, as we don't write this during + UNDO. + */ + cur_head_length= (info->cur_row.head_length - + info->cur_row.header_length); + int2store(log_pos, cur_head_length); + pagerange_store(log_pos + 2, info->cur_row.extents_count); + log_pos+= 2 + PAGERANGE_STORE_SIZE; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 + + PAGERANGE_STORE_SIZE); + info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str= + info->cur_row.extents; + info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length= + extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE; + + row_length= fill_update_undo_parts(info, old_record, record, + log_array + + TRANSLOG_INTERNAL_PARTS + 2, + &row_parts_count); + if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn, + info, + (translog_size_t) + (log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extents_length + + row_length), + TRANSLOG_INTERNAL_PARTS + 2 + + row_parts_count, + log_array, + log_data + LSN_STORE_SIZE, + &checksum_delta)) + goto disk_err; + } + } + } + /* Release not used space in used pages */ + if (_ma_bitmap_release_unused(info, bitmap_blocks)) + goto disk_err; + _ma_unpin_all_pages(info, lsn); + + if (tmp_data_used) + { + /* + Write data stored in info->rec_buff to pages + This is the char/varchar data that didn't fit into the head page. 
+ */ + DBUG_ASSERT(bitmap_blocks->count != 0); + if (write_full_pages(info, lsn, head_block + 1, + info->rec_buff, (ulong) (tmp_data - info->rec_buff))) + goto disk_err; + } + + /* Write rest of blobs (data, but no tails as they are already written) */ + for (; column < end_column; column++, blob_lengths++) + { + uchar *blob_pos; + uint length; + ulong blob_length; + if (!*blob_lengths) /* Null or "" */ + continue; + length= column->length - portable_sizeof_char_ptr; + memcpy(&blob_pos, record + column->offset + length, sizeof(char*)); + /* remove tail part */ + blob_length= *blob_lengths; + if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL) + blob_length-= (blob_length % FULL_PAGE_SIZE(share)); + + if (blob_length && write_full_pages(info, lsn, block, + blob_pos, blob_length)) + goto disk_err; + block+= block->sub_blocks; + } + + _ma_finalize_row(info); + DBUG_RETURN(0); + +crashed: + DBUG_ASSERT(!maria_assert_if_crashed_table); + /* Something was wrong with data on page */ + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + +disk_err: + /** + @todo RECOVERY we are going to let dirty pages go to disk while we have + logged UNDO, this violates WAL. We must mark the table corrupted! + + @todo RECOVERY we have written some REDOs without a closing UNDO, + it's possible that a next operation by this transaction succeeds and then + Recovery would glue the "orphan REDOs" to the succeeded operation and + execute the failed REDOs. We need some mark "abort this group" in the + log, or mark the table corrupted (then user will repair it and thus REDOs + will be skipped). + + @todo RECOVERY to not let write errors go unnoticed, pagecache_write() + should take a MARIA_HA* in argument, and it it + fails when flushing a page to disk it should call + (*the_maria_ha->write_error_func)(the_maria_ha) + and this hook will mark the table corrupted. + Maybe hook should be stored in the pagecache's block structure, or in a + hash "file->maria_ha*". 
+ + @todo RECOVERY we should distinguish below between log write error and + table write error. The former should stop Maria immediately, the latter + should mark the table corrupted. + */ + /* + Unpin all pinned pages to not cause problems for disk cache. This is + safe to call even if we already called _ma_unpin_all_pages() above. + */ + save_my_errno= my_errno; + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); + my_errno= save_my_errno; + DBUG_RETURN(1); +} + + +/* + @brief Find a place for a new row in the bitmap and write the row there + + @fn allocate_and_write_block_record() + @param info Maria handler + @param record Record to write + @param row Information about fields in 'record' + @param undo_lsn <> LSN_ERROR if we are executing an UNDO + + @note + On success row->lastpos holds the position of the written row. + On error my_errno is preserved across the cleanup calls. + + @return + @retval 0 ok + @retval 1 Error +*/ + +static my_bool allocate_and_write_block_record(MARIA_HA *info, + const uchar *record, + MARIA_ROW *row, + LSN undo_lsn) +{ + struct st_row_pos_info row_pos; + MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks; + int save_my_errno; + DBUG_ENTER("allocate_and_write_block_record"); + + _ma_bitmap_flushable(info, 1); + if (_ma_bitmap_find_place(info, row, blocks)) + goto err; /* Error reading bitmap */ + + /* + Sleep; a checkpoint will happen and should not send this over-allocated + bitmap to disk but rather wait. + */ + DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10);); + + /* page will be pinned & locked by get_head_or_tail_page */ + if (get_head_or_tail_page(info, blocks->block, info->buff, + MY_MAX(row->space_on_head_page, + info->s->base.min_block_length), + HEAD_PAGE, + PAGECACHE_LOCK_WRITE, &row_pos)) + goto err; + row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr); + if (info->s->calc_checksum) + { + if (undo_lsn == LSN_ERROR) + row->checksum= (info->s->calc_checksum)(info, record); + else + { + /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. 
*/ + DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record)); + } + } + DBUG_PRINT("info", ("rowid: %lu (%lu:%u) length: %u", (ulong) row->lastpos, + (ulong) ma_recordpos_to_page(row->lastpos), + ma_recordpos_to_dir_entry(row->lastpos), + row_pos.length)); + if (write_block_record(info, (uchar*) 0, record, row, + blocks, blocks->block->org_bitmap_value != 0, + &row_pos, undo_lsn, 0)) + goto err; + /* Now let checkpoint happen but don't commit */ + DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000);); + DBUG_RETURN(0); + +err: + /* Keep my_errno of the failing call across the cleanup calls below */ save_my_errno= my_errno; + if (info->non_flushable_state) + _ma_bitmap_flushable(info, -1); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); + my_errno= save_my_errno; + DBUG_RETURN(1); +} + + +/* + Write a record and return rowid for it + + SYNOPSIS + _ma_write_init_block_record() + info Maria handler + record Record to write + + NOTES + This is done BEFORE we write the keys to the row! + + RETURN + HA_OFFSET_ERROR Something went wrong + # Rowid for row +*/ + +MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info, + const uchar *record) +{ + DBUG_ENTER("_ma_write_init_block_record"); + + calc_record_size(info, record, &info->cur_row); + if (allocate_and_write_block_record(info, record, + &info->cur_row, LSN_ERROR)) + DBUG_RETURN(HA_OFFSET_ERROR); + DBUG_RETURN(info->cur_row.lastpos); +} + + +/* + Dummy function for (*info->s->write_record)() + + Nothing to do here, as we already wrote the record in + _ma_write_init_block_record() +*/ + +my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)), + const uchar *record __attribute__ ((unused))) +{ + return 0; /* Row already written */ +} + + +/** + @brief Remove row written by _ma_write_init_block_record() and log undo + + @param info Maria handler + + @note + This is called in case we got a duplicate unique key while + writing keys. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool _ma_write_abort_block_record(MARIA_HA *info) +{ + my_bool res= 0; + MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks; + MARIA_BITMAP_BLOCK *block, *end; + LSN lsn= LSN_IMPOSSIBLE; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_write_abort_block_record"); + + _ma_bitmap_lock(share); /* Lock bitmap from other insert threads */ + if (delete_head_or_tail(info, + ma_recordpos_to_page(info->cur_row.lastpos), + ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1, + 0)) + res= 1; + /* Release the blocks after the head block; errors are remembered in res and the loop continues */ for (block= blocks->block + 1, end= block + blocks->count - 1; block < end; + block++) + { + if (block->used & BLOCKUSED_USED) + { + if (block->used & BLOCKUSED_TAIL) + { + /* + block->page_count is set to the tail directory entry number in + write_block_record() + */ + if (delete_head_or_tail(info, block->page, + block->page_count & ~TAIL_BIT, + 0, 0)) + res= 1; + } + else + { + if (free_full_page_range(info, block->page, block->page_count)) + res= 1; + } + } + } + _ma_bitmap_unlock(share); + if (share->now_transactional) + { + /* + Write clr to mark end of aborted row insert. + The above delete_head_or_tail() calls will only log redo, not undo. + The undo just before the row insert is stored in row->orig_undo_lsn. + + When applying undo's, we can skip all undo records between current + lsn and row->orig_undo_lsn as logically things are as before the + attempted insert. + */ + if (_ma_write_clr(info, info->cur_row.orig_undo_lsn, + LOGREC_UNDO_ROW_INSERT, + share->calc_checksum != 0, + (ha_checksum) 0 - info->cur_row.checksum, + &lsn, (void*) 0)) + res= 1; + } + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); +} + + +/* + Update a record + + NOTES + For the moment, we assume that info->cur_row.extents is always updated + when a row is read. In the future we may decide to read this on demand + for rows split into many extents. 
+*/ + +static my_bool _ma_update_block_record2(MARIA_HA *info, + MARIA_RECORD_POS record_pos, + const uchar *oldrec, + const uchar *record, + LSN undo_lsn) +{ + MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks; + uchar *buff; + MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row; + MARIA_PINNED_PAGE page_link; + uint rownr, org_empty_size, head_length; + uint block_size= info->s->block_size; + uint errpos __attribute__((unused)) = 0; + uchar *dir; + pgcache_page_no_t page; + struct st_row_pos_info row_pos; + my_bool res; + ha_checksum old_checksum; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_update_block_record2"); + DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos)); + +#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE + DBUG_DUMP("oldrec", oldrec, share->base.reclength); + DBUG_DUMP("newrec", record, share->base.reclength); +#endif + + /* + Checksums of new and old rows were computed by callers already; new + row's was put into cur_row, old row's was put into new_row. + */ + old_checksum= new_row->checksum; + new_row->checksum= cur_row->checksum; + calc_record_size(info, record, new_row); + page= ma_recordpos_to_page(record_pos); + + _ma_bitmap_flushable(info, 1); + /* Read the head page with a write lock; page_link is pushed to pinned_pages before the result is checked */ buff= pagecache_read(share->pagecache, + &info->dfile, (pgcache_page_no_t) page, 0, 0, + share->page_type, + PAGECACHE_LOCK_WRITE, &page_link.link); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!buff) + { + _ma_set_fatal_error(info, my_errno); + goto err; + } + + org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET); + rownr= ma_recordpos_to_dir_entry(record_pos); + dir= dir_entry_pos(buff, block_size, rownr); + + /* + We can't use cur_row->head_length as the block may have been compacted + since we read it. 
+ */ + head_length= uint2korr(dir + 2); + + if ((org_empty_size + head_length) >= new_row->total_length) + { + uint rec_offset, length; + MARIA_BITMAP_BLOCK block; + + DBUG_PRINT("info", ("org_empty_size: %u org_length: %u new_length: %lu", + org_empty_size, head_length, + new_row->total_length)); + + /* + We can fit the new row in the same page as the original head part + of the row + */ + block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap, + org_empty_size); + if (extend_area_on_page(info, buff, dir, rownr, + new_row->total_length, &org_empty_size, + &rec_offset, &length, 1)) + { + errpos= 1; + goto err; + } + + row_pos.buff= buff; + row_pos.rownr= rownr; + row_pos.empty_space= org_empty_size; + row_pos.dir= dir; + row_pos.data= buff + rec_offset; + row_pos.length= length; + /* Describe the row with a single block: the local 'block' declared in this branch, which returns before leaving its scope */ blocks->block= &block; + blocks->count= 1; + block.page= page; + block.sub_blocks= 1; + block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP; + block.empty_space= row_pos.empty_space; + + if (*cur_row->tail_positions && + delete_tails(info, cur_row->tail_positions)) + { + errpos= 2; + goto err; + } + if (cur_row->extents_count && free_full_pages(info, cur_row)) + { + errpos= 3; + goto err; + } + res= write_block_record(info, oldrec, record, new_row, blocks, + 1, &row_pos, undo_lsn, old_checksum); + /* We can't update or delete this without re-reading it again */ + info->update&= ~HA_STATE_AKTIV; + DBUG_RETURN(res); + } + /* The new row does not fit in the old head entry plus free space: delete old row */ + if (*cur_row->tail_positions && + delete_tails(info, cur_row->tail_positions)) + { + errpos= 4; + goto err; + } + if (cur_row->extents_count && free_full_pages(info, cur_row)) + { + errpos= 5; + goto err; + } + + head_length= uint2korr(dir + 2); + if (_ma_bitmap_find_new_place(info, new_row, page, head_length + + org_empty_size, blocks)) + { + errpos= 6; + goto err; + } + + /* + Allocate all size in block for record + TODO: + Need to improve this to do compact if we can fit one more blob into + the head page + */ + if ((head_length < 
new_row->space_on_head_page || + (new_row->total_length <= head_length && + org_empty_size + head_length >= new_row->total_length))) + { + _ma_compact_block_page(share, + buff, rownr, 1, + info->trn->min_read_from, + share->base.min_block_length); + org_empty_size= 0; + head_length= uint2korr(dir + 2); + } + + row_pos.buff= buff; + row_pos.rownr= rownr; + row_pos.empty_space= org_empty_size + head_length; + row_pos.dir= dir; + row_pos.data= buff + uint2korr(dir); + row_pos.length= head_length; + if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1, + &row_pos, undo_lsn, old_checksum))) + { + errpos= 7; + goto err; + } + DBUG_RETURN(0); + +err: + DBUG_ASSERT(!maria_assert_if_crashed_table); + DBUG_PRINT("error", ("errpos: %d", errpos)); + if (info->non_flushable_state) + _ma_bitmap_flushable(info, -1); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); + DBUG_RETURN(1); +} + + +/* + @brief Store new row on its original position + + @note + This is basically a copy of _ma_update_block_record2 + When we have a purge thread for deleted row, we can remove this function + and use _ma_update_block_record2 instead. + + This is the main reason we don't make a lot of subfunctions that are + common between _ma_update_block_record2() and this function. 
+ + Note: If something goes wrong we mark the file crashed +*/ + +static my_bool _ma_update_at_original_place(MARIA_HA *info, + pgcache_page_no_t page, + uint rownr, + uint length_on_head_page, + uint extent_count, + const uchar *extent_info, + const uchar *oldrec, + const uchar *record, + LSN undo_lsn) +{ + MARIA_BITMAP_BLOCKS *blocks; + MARIA_BITMAP_BLOCK *block; + MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + ha_checksum old_checksum; + uint org_empty_size, empty_size; + uint block_size= info->s->block_size; + uchar *dir, *buff; + struct st_row_pos_info row_pos; + my_bool res; + uint rec_offset, length; + DBUG_ENTER("_ma_update_at_original_place"); + +#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE + DBUG_DUMP("oldrec", oldrec, share->base.reclength); + DBUG_DUMP("newrec", record, share->base.reclength); +#endif + + /* + Checksums of new and old rows were computed by callers already; new + row's was put into cur_row, old row's was put into new_row. 
+ */ + old_checksum= new_row->checksum; + new_row->checksum= cur_row->checksum; + calc_record_size(info, record, new_row); + + _ma_bitmap_flushable(info, 1); + buff= pagecache_read(share->pagecache, + &info->dfile, (pgcache_page_no_t) page, 0, 0, + share->page_type, + PAGECACHE_LOCK_WRITE, &page_link.link); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!buff) + { + _ma_set_fatal_error(info, my_errno); + goto err; + } + + org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET); + dir= dir_entry_pos(buff, block_size, rownr); + + /* Fail if free space plus the current head length cannot hold length_on_head_page */ if ((org_empty_size + cur_row->head_length) < length_on_head_page) + { + DBUG_PRINT("error", + ("org_empty_size: %u head_length: %u length_on_page: %u", + org_empty_size, (uint) cur_row->head_length, + length_on_head_page)); + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + goto err; + } + + /* + We can fit the new row in the same page as the original head part + of the row + */ + empty_size= org_empty_size; + if (extend_area_on_page(info, buff, dir, rownr, + length_on_head_page, &empty_size, + &rec_offset, &length, 1)) + goto err; + + row_pos.buff= buff; + row_pos.rownr= rownr; + row_pos.empty_space= empty_size; + row_pos.dir= dir; + row_pos.data= buff + rec_offset; + + /* Delete old row */ + if (*cur_row->tail_positions && + delete_tails(info, cur_row->tail_positions)) + goto err; + if (cur_row->extents_count && free_full_pages(info, cur_row)) + goto err; + + /* Change extent information to be usable by write_block_record() */ + blocks= &cur_row->insert_blocks; + if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info)) + goto err; + block= blocks->block; + block->empty_space= row_pos.empty_space; + block->org_bitmap_value= + _ma_free_size_to_head_pattern(&share->bitmap, + (enough_free_entries_on_page(share, buff) ? 
+ org_empty_size : 0)); + + DBUG_ASSERT(block->org_bitmap_value == + _ma_bitmap_get_page_bits(info, &info->s->bitmap, page)); + block->used|= BLOCKUSED_USE_ORG_BITMAP; + + /* + We have to use <= below as the new_row may be smaller than the original + row as the new row doesn't have transaction id + */ + + DBUG_ASSERT(blocks->count > 1 || + MY_MAX(new_row->total_length, share->base.min_block_length) <= + length_on_head_page); + + /* Store same amount of data on head page as on original page */ + row_pos.length= (length_on_head_page - + (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE); + set_if_bigger(row_pos.length, share->base.min_block_length); + if ((res= write_block_record(info, oldrec, record, new_row, blocks, + 1, &row_pos, undo_lsn, old_checksum))) + goto err; + DBUG_RETURN(0); + +err: + DBUG_ASSERT(!maria_assert_if_crashed_table); + _ma_mark_file_crashed(share); + if (info->non_flushable_state) + _ma_bitmap_flushable(info, -1); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); + DBUG_RETURN(1); +} + + +/* Wrapper for _ma_update_block_record2() used by ma_update(); passes LSN_ERROR as undo_lsn */ + +my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos, + const uchar *orig_rec, const uchar *new_rec) +{ + return _ma_update_block_record2(info, record_pos, orig_rec, new_rec, + LSN_ERROR); +} + + +/* + Delete a directory entry + + SYNOPSIS + delete_dir_entry() + share Maria share; block_size is taken from it + buff Page buffer + record_number Record number to delete + empty_space_res Out: empty space on page after delete + (block_size if the page became empty; not set on error) + + RETURN + -1 Error on page + 0 ok + 1 Page is now empty +*/ + +static int delete_dir_entry(MARIA_SHARE *share, + uchar *buff, uint record_number, + uint *empty_space_res) +{ + uint block_size= share->block_size; + uint number_of_records= (uint) buff[DIR_COUNT_OFFSET]; + uint length, empty_space; + uchar *dir; + DBUG_ENTER("delete_dir_entry"); + DBUG_PRINT("enter", ("record_number: %u number_of_records: %u", + record_number, number_of_records)); + +#ifdef SANITY_CHECKS + if (record_number >= 
number_of_records || + record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 - + PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE)) + { + DBUG_PRINT("error", ("record_number: %u number_of_records: %u", + record_number, number_of_records)); + + DBUG_RETURN(-1); + } +#endif + + check_directory(share, buff, block_size, 0, (uint) -1); + empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + dir= dir_entry_pos(buff, block_size, record_number); + length= uint2korr(dir + 2); /* Length of the entry being deleted */ + DBUG_ASSERT(uint2korr(dir) != 0 && length < block_size); + + if (record_number == number_of_records - 1) + { + /* Delete this entry and all following free directory entries */ + uchar *end= buff + block_size - PAGE_SUFFIX_SIZE; + number_of_records--; + dir+= DIR_ENTRY_SIZE; + empty_space+= DIR_ENTRY_SIZE; + + /* Unlink and free the next empty ones (offset bytes 0 and 1 are zero) */ + while (dir < end && dir[0] == 0 && dir[1] == 0) + { + number_of_records--; + if (dir[2] == END_OF_DIR_FREE_LIST) + buff[DIR_FREE_OFFSET]= dir[3]; + else + { + uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]); + DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] == + number_of_records); + prev_entry[3]= dir[3]; + } + if (dir[3] != END_OF_DIR_FREE_LIST) + { + uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]); + DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] == + number_of_records); + next_entry[2]= dir[2]; + } + dir+= DIR_ENTRY_SIZE; + empty_space+= DIR_ENTRY_SIZE; + } + + if (number_of_records == 0) + { + /* All entries on page deleted */ + DBUG_PRINT("info", ("Page marked as unallocated")); + buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE; +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY + { + dir= dir_entry_pos(buff, block_size, record_number); + bzero(dir, (record_number+1) * DIR_ENTRY_SIZE); + } +#endif + *empty_space_res= block_size; + DBUG_RETURN(1); + } + buff[DIR_COUNT_OFFSET]= (uchar) number_of_records; + } + else + { + /* Not the last entry: mark it free and link it first in the free list */ + dir[0]= dir[1]= 0; + dir[2]= 
END_OF_DIR_FREE_LIST; + if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST) + { + /* Relink next entry to point to newly freed entry */ + uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]); + DBUG_ASSERT(uint2korr(next_entry) == 0 && + next_entry[2] == END_OF_DIR_FREE_LIST); + next_entry[2]= record_number; + } + buff[DIR_FREE_OFFSET]= record_number; + } + empty_space+= length; + + int2store(buff + EMPTY_SPACE_OFFSET, empty_space); + buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED; + + *empty_space_res= empty_space; + + check_directory(share, buff, block_size, 0, empty_space); + DBUG_RETURN(0); +} + + +/* + Delete a head or a tail part + + SYNOPSIS + delete_head_or_tail() + info Maria handler + page Page (not file offset!) on which the row is + record_number Directory entry on the page for the part to delete + head 1 if this is a head page + from_update 1 if we are called from update. In this case we + leave the page as write locked as we may put + the new row into the old position. + + RETURN + 0 ok + 1 error +*/ + +static my_bool delete_head_or_tail(MARIA_HA *info, + pgcache_page_no_t page, uint record_number, + my_bool head, my_bool from_update) +{ + MARIA_SHARE *share= info->s; + uint empty_space; + int res; + my_bool page_is_empty; + uchar *buff; + LSN lsn; + MARIA_PINNED_PAGE page_link; + enum pagecache_page_lock lock_at_write, lock_at_unpin; + DBUG_ENTER("delete_head_or_tail"); + DBUG_PRINT("enter", ("id: %lu (%lu:%u)", + (ulong) ma_recordpos(page, record_number), + (ulong) page, record_number)); + + buff= pagecache_read(share->pagecache, + &info->dfile, page, 0, 0, + share->page_type, + PAGECACHE_LOCK_WRITE, &page_link.link); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!buff) + { + _ma_set_fatal_error(info, my_errno); + DBUG_RETURN(1); + } + DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == + (head ? 
HEAD_PAGE : TAIL_PAGE)); + + /* From update the page stays write locked; otherwise the lock is changed from write to read */ if (from_update) + { + lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED; + lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK; + } + else + { + lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ; + lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK; + } + + res= delete_dir_entry(share, buff, record_number, &empty_space); + if (res < 0) /* error on page; the page stays in pinned_pages for the caller to unpin */ + DBUG_RETURN(1); + if (res == 0) /* after our deletion, page is still not empty */ + { + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + page_is_empty= 0; + if (share->now_transactional) + { + /* Log REDO data */ + page_store(log_data + FILEID_STORE_SIZE, page); + dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + record_number); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD : + LOGREC_REDO_PURGE_ROW_TAIL), + info->trn, info, + (translog_size_t) sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL)) + DBUG_RETURN(1); + } + } + else /* page is now empty */ + { + page_is_empty= 1; + if (share->now_transactional) + { + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + page_store(log_data + FILEID_STORE_SIZE, page); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL, + info->trn, info, + (translog_size_t) sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL)) + DBUG_RETURN(1); + } + /* + Mark that this page must be written to disk by page cache, even + if we could call pagecache_delete() on it. + This is needed to ensure that repair finds the empty page on disk + and not old data. 
+ */ + pagecache_set_write_on_delete_by_link(page_link.link); + DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]); + } + + pagecache_unlock_by_link(share->pagecache, page_link.link, + lock_at_write, + PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 1, FALSE); + page_link.unlock= lock_at_unpin; + /* Overwrite the last pinned_pages element (the one pushed above) with the new unlock mode */ set_dynamic(&info->pinned_pages, (void*) &page_link, + info->pinned_pages.elements-1); + + DBUG_PRINT("info", ("empty_space: %u", empty_space)); + + /* + If there is not enough space for all possible tails, mark the + page full + */ + if (!head && !page_is_empty && !enough_free_entries(buff, share->block_size, + 1 + share->base.blobs)) + empty_space= 0; + + DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space)); +} + + +/* + Delete all tails of a row + + SYNOPSIS + delete_tails() + info Handler + tails Pointer to vector of tail positions, ending with 0 + + NOTES + All tails are tried even if deleting one of them fails. + Each tail is deleted with head=0 and from_update=1. + + RETURN + 0 ok + 1 error +*/ + +static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails) +{ + my_bool res= 0; + DBUG_ENTER("delete_tails"); + for (; *tails; tails++) + { + if (delete_head_or_tail(info, + ma_recordpos_to_page(*tails), + ma_recordpos_to_dir_entry(*tails), 0, 1)) + res= 1; + } + DBUG_RETURN(res); +} + + +/* + Delete a record + + NOTES + For the moment, we assume that info->cur_row.extents is always updated + when a row is read. In the future we may decide to read this on demand + for rows with many splits. 
*/

/*
  Steps, in the order the code performs them:
    1. Delete the head entry of the row given by info->cur_row.lastpos
       and then all tails listed in info->cur_row.tail_positions.
    2. If the row has extents (cur_row.extents_count != 0), release them
       through free_full_pages().
    3. For transactional tables, write a LOGREC_UNDO_ROW_DELETE record
       that holds the row position, the extent list and the row data
       produced by fill_insert_undo_parts().

  Both exit paths call _ma_bitmap_flushable(info, -1) to balance the
  _ma_bitmap_flushable(info, 1) done at the start, and unpin all pages.

  RETURN
    0  ok
    1  error
*/

my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
{
  pgcache_page_no_t page;
  uint record_number;
  MARIA_SHARE *share= info->s;
  LSN lsn= LSN_IMPOSSIBLE;
  DBUG_ENTER("_ma_delete_block_record");

  /* Split the row id into page number and directory entry */
  page=          ma_recordpos_to_page(info->cur_row.lastpos);
  record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
  DBUG_PRINT("enter", ("rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
                       (ulong) page, record_number));

  _ma_bitmap_flushable(info, 1);
  if (delete_head_or_tail(info, page, record_number, 1, 0) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;

  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
    goto err;

  if (share->now_transactional)
  {
    /*
      Fixed part of the UNDO record. The FILEID_STORE_SIZE bytes after
      the LSN are not written here.
    */
    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
                   DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
                   HA_CHECKSUM_STORE_SIZE];
    uchar *log_pos;
    size_t row_length;
    uint row_parts_count, extents_length;
    ha_checksum checksum_delta;

    /* Write UNDO record */
    lsn_store(log_data, info->trn->undo_lsn);
    page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
    log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
    dirpos_store(log_pos, record_number);
    log_pos+= DIRPOS_STORE_SIZE;
    /* Length of the head part minus the row header */
    int2store(log_pos, info->cur_row.head_length -
              info->cur_row.header_length);
    log_pos+= 2;
    pagerange_store(log_pos, info->cur_row.extents_count);
    log_pos+= PAGERANGE_STORE_SIZE;

    /*
      Part 0: fixed data. The length initially excludes the checksum
      bytes; store_checksum_in_rec() is handed both log_pos and this
      length (NOTE(review): presumably it appends the checksum and
      extends the length when the table uses checksums - confirm against
      the macro definition).
    */
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length=
      sizeof(log_data) - HA_CHECKSUM_STORE_SIZE;
    store_checksum_in_rec(share, checksum_delta,
                          (ha_checksum) 0 - info->cur_row.checksum, log_pos,
                          info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                              0].length);
    /* Part 1: the extent list of the deleted row */
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
      info->cur_row.extents;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
      extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;

    /* Parts 2..: the row data itself */
    row_length= fill_insert_undo_parts(info, record,
                                       (info->log_row_parts +
                                        TRANSLOG_INTERNAL_PARTS + 2),
                                       &row_parts_count);

    if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
                              info,
                              (translog_size_t)
                              (info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                                   0].length + row_length +
                               extents_length),
                              TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count,
                              info->log_row_parts,
                              log_data + LSN_STORE_SIZE,
                              &checksum_delta))
      goto err;
  }

  _ma_bitmap_flushable(info, -1);
  /* lsn is still LSN_IMPOSSIBLE here if nothing was logged */
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}


/****************************************************************************
  Reading of records
****************************************************************************/

/*
  Read position to record from record directory at end of page

  SYNOPSIS
   get_record_position()
     share                Maria share; block size and page header size
                          are taken from it
     buff                 page buffer
     record_number        Record number in index
     end_of_data          pointer to end of data for record

  RETURN
    0  Error in data
    #  Pointer to start of record.
       In this case *end_of_data is set.
+*/ + +static uchar *get_record_position(MARIA_SHARE *share, uchar *buff, + uint record_number, uchar **end_of_data) +{ + uint block_size= share->block_size; + uint number_of_records= (uint) buff[DIR_COUNT_OFFSET]; + uchar *dir; + uchar *data; + uint offset, length; + +#ifdef SANITY_CHECKS + if (record_number >= number_of_records || + record_number > ((block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE) + / DIR_ENTRY_SIZE)) + { + DBUG_PRINT("error", + ("Wrong row number: record_number: %u number_of_records: %u", + record_number, number_of_records)); + return 0; + } +#endif + + dir= dir_entry_pos(buff, block_size, record_number); + offset= uint2korr(dir); + length= uint2korr(dir + 2); +#ifdef SANITY_CHECKS + if (offset < PAGE_HEADER_SIZE(share) || + offset + length > (block_size - + number_of_records * DIR_ENTRY_SIZE - + PAGE_SUFFIX_SIZE)) + { + DBUG_PRINT("error", + ("Wrong row position: record_number: %u offset: %u " + "length: %u number_of_records: %u", + record_number, offset, length, number_of_records)); + return 0; + } +#endif + data= buff + offset; + *end_of_data= data + length; + return data; +} + + +/* + Init extent + + NOTES + extent is a cursor over which pages to read +*/ + +static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info, + uint extents, MARIA_RECORD_POS *tail_positions) +{ + uint page_count; + extent->extent= extent_info; + extent->extent_count= extents; + extent->page= page_korr(extent_info); /* First extent */ + page_count= (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) & + ~START_EXTENT_BIT); + extent->tail= page_count & TAIL_BIT; + if (extent->tail) + { + extent->page_count= 1; + extent->tail_row_nr= page_count & ~TAIL_BIT; + } + else + extent->page_count= page_count; + extent->tail_positions= tail_positions; + extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED; +} + + +/* + Read next extent + + SYNOPSIS + read_next_extent() + info Maria handler + extent Pointer to current extent (this is updated to point + to 
                        next)
    end_of_data         Pointer to end of data in read block (out)

  NOTES
    New block is read into info->buff

  RETURN
    0   Error; my_errno is set
    #   Pointer to start of data in read block
        In this case end_of_data is updated to point to end of data.
*/

static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
                               uchar **end_of_data)
{
  MARIA_SHARE *share= info->s;
  uchar *buff, *data;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock;
  DBUG_ENTER("read_next_extent");

  if (!extent->page_count)
  {
    /* Current extent is used up; step to the next one in the extent list */
    uint page_count;
    if (!--extent->extent_count)
      goto crashed;
    extent->extent+=    ROW_EXTENT_SIZE;
    extent->page=       page_korr(extent->extent);
    page_count=         (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
                         ~START_EXTENT_BIT);
    if (!page_count)
      goto crashed;
    extent->tail=       page_count & TAIL_BIT;
    if (extent->tail)
      extent->tail_row_nr= page_count & ~TAIL_BIT;
    else
      extent->page_count= page_count;
    DBUG_PRINT("info",("New extent. Page: %lu page_count: %u tail_flag: %d",
                       (ulong) extent->page, extent->page_count,
                       extent->tail != 0));
  }
  extent->first_extent= 0;

  /* Only tail pages may be read with a lock (see lock_for_tail_pages) */
  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
  if (extent->tail)
    lock= extent->lock_for_tail_pages;

  buff= pagecache_read(share->pagecache,
                       &info->dfile, extent->page, 0,
                       info->buff, share->page_type,
                       lock, &page_link.link);
  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
  {
    /* Read during UNDO */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  if (!buff)
  {
    /* check if we tried to read over end of file (ie: bad data in record) */
    if ((extent->page + 1) * share->block_size >
        share->state.state.data_file_length)
      goto crashed;
    DBUG_RETURN(0);
  }

  if (!extent->tail)
  {
    /* Full data page */
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
      goto crashed;
    extent->page++;                             /* point to next page */
    extent->page_count--;
    *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
    info->cur_row.full_page_count++;            /* For maria_chk */
    DBUG_RETURN(extent->data_start= buff + FULL_PAGE_HEADER_SIZE(share));
  }

  /* Found tail */
  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
    goto crashed;
  /* Remember the tail position in the caller supplied vector */
  *(extent->tail_positions++)= ma_recordpos(extent->page,
                                            extent->tail_row_nr);
  info->cur_row.tail_count++;                   /* For maria_chk */

  if (!(data= get_record_position(share, buff,
                                  extent->tail_row_nr,
                                  end_of_data)))
    goto crashed;
  extent->data_start= data;
  extent->page_count= 0;                        /* No more data in extent */
  DBUG_RETURN(data);


crashed:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD);
  DBUG_PRINT("error", ("wrong extent information"));
  DBUG_RETURN(0);
}


/*
  Read data that may be split over many blocks

  SYNOPSIS
    read_long_data2()
    info                Maria handler
    to                  Store result here; 'length' bytes are written
    length              Number of bytes to read
    extent              Pointer to current extent position
    data                Current position in buffer
    end_of_data         End of data in buffer

  NOTES
    When we have to read a new buffer, it's read into info->buff

    This is the slow path of read_long_data(). The loop copies what is
    left in the current buffer and then fetches the next extent, until
    'length' bytes have been copied.

  RETURN
    0   ok
    1   error
*/

static my_bool read_long_data2(MARIA_HA *info, uchar *to, ulong length,
                              MARIA_EXTENT_CURSOR *extent,
                              uchar **data, uchar **end_of_data)
{
  uint left_length= (uint) (*end_of_data - *data);
  DBUG_ENTER("read_long_data2");
  DBUG_PRINT("enter", ("length: %lu left_length: %u",
                       length, left_length));
  DBUG_ASSERT(*data <= *end_of_data);

  /*
    Fields are never split in middle. This means that if length > rest-of-data
    we should start reading from the next extent.  The reason we may have
    data left on the page is that if the fixed part of the row was less than
    min_block_length the head block was extended to min_block_length.

    This may change in the future, which is why we have the loop written
    the way it's written.
  */
  if (extent->first_extent && length > left_length)
  {
    /* Ignore what is left in the head block */
    *end_of_data= *data;
    left_length= 0;
  }

  for(;;)
  {
    if (unlikely(left_length >= length))
    {
      /* The rest of the wanted data is in the current buffer */
      memcpy(to, *data, length);
      (*data)+= length;
      DBUG_PRINT("info", ("left_length: %u", left_length - (uint) length));
      DBUG_RETURN(0);
    }
    /* Take everything from the current buffer and continue in next extent */
    memcpy(to, *data, left_length);
    to+= left_length;
    length-= left_length;
    if (!(*data= read_next_extent(info, extent, end_of_data)))
      break;
    left_length= (uint) (*end_of_data - *data);
  }
  DBUG_RETURN(1);
}

/*
  Read 'length' bytes to 'to', continuing in following extents if needed

  Fast path: if the current buffer holds at least 'length' bytes they are
  copied directly; otherwise the work is passed on to read_long_data2().

  RETURN
    0   ok
    1   error
*/

static inline my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
                              MARIA_EXTENT_CURSOR *extent,
                              uchar **data, uchar **end_of_data)
{
  uint left_length= (uint) (*end_of_data - *data);
  if (likely(left_length >= length))
  {
    memcpy(to, *data, length);
    (*data)+= length;
    return 0;
  }
  return read_long_data2(info, to, length, extent, data, end_of_data);
}


/*
  Read a record from page (helper function for _ma_read_block_record())

  SYNOPSIS
    _ma_read_block_record2()
    info                Maria handler
    record              Store record here
    data                Start of head data for row
    end_of_data         End of data for row

  NOTES
    The head page is already read by caller
    Following data is updated in info->cur_row:

    cur_row.head_length is set to size of entry in head block
    cur_row.tail_positions is set to point to all tail blocks
    cur_row.extents points to extents data
    cur_row.extents_count contains number of extents
    cur_row.empty_bits is set to empty bits
    cur_row.field_lengths contains packed length of all fields
    cur_row.blob_length contains total length of all blobs
    cur_row.checksum contains checksum of read record.

  RETURN
    0   ok
    #   Error code
*/

int _ma_read_block_record2(MARIA_HA *info, uchar *record,
                           uchar *data, uchar *end_of_data)
{
  MARIA_SHARE *share= info->s;
  uchar *field_length_data= 0, *UNINIT_VAR(blob_buffer), *start_of_data;
  uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
  my_bool found_blob= 0;
  MARIA_EXTENT_CURSOR extent;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_ROW *cur_row= &info->cur_row;
  myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
  DBUG_ENTER("_ma_read_block_record2");

  start_of_data= data;
  /* First byte of the row holds the row flags */
  flag= (uint) (uchar) data[0];
  cur_null_bytes= share->base.original_null_bytes;
  null_bytes=     share->base.null_bytes;
  cur_row->head_length= (uint) (end_of_data - data);
  cur_row->full_page_count= cur_row->tail_count= 0;
  cur_row->blob_length= 0;
  /* Number of bytes in header that we don't need to write during undo */
  cur_row->header_length= total_header_size[(flag & PRECALC_HEADER_BITMASK)]-1;

  if (flag & ROW_FLAG_TRANSID)
  {
    cur_row->trid= transid_korr(data+1);
    if (!info->trn)
    {
      /* File crashed */
      DBUG_ASSERT(!maria_assert_if_crashed_table);
      _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD);
      DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
    }
    if (!trnman_can_read_from(info->trn, cur_row->trid))
      DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
  }

  /* Skip trans header (for now, until we have MVCC support) */
  data+= cur_row->header_length + 1 ;
  if (flag & ROW_FLAG_NULLS_EXTENDED)
    cur_null_bytes+= data[-1];

  row_extents= 0;
  if (flag & ROW_FLAG_EXTENTS)
  {
    uint row_extent_size;
    /*
      Record is split over many data pages.
      Get number of extents and first extent
    */
    get_key_length(row_extents, data);
    cur_row->extents_count= row_extents;
    row_extent_size= row_extents * ROW_EXTENT_SIZE;
    /* Grow the extents buffer only if it is too small */
    if (cur_row->extents_buffer_length < row_extent_size &&
        _ma_alloc_buffer(&cur_row->extents,
                         &cur_row->extents_buffer_length,
                         row_extent_size, myflag))
      DBUG_RETURN(my_errno);
    memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
    data+= ROW_EXTENT_SIZE;
    init_extent(&extent, cur_row->extents, row_extents,
                cur_row->tail_positions);
  }
  else
  {
    /*
      Row is stored in the head block only. With page_count= 0 and
      extent_count= 1 any call to read_next_extent() fails as "crashed".
    */
    cur_row->extents_count= 0;
    (*cur_row->tail_positions)= 0;
    extent.page_count= 0;
    extent.extent_count= 1;
  }
  extent.first_extent= 1;

  field_lengths= 0;
  if (share->base.max_field_lengths)
  {
    get_key_length(field_lengths, data);
    cur_row->field_lengths_length= field_lengths;
#ifdef SANITY_CHECKS
    if (field_lengths > share->base.max_field_lengths)
      goto err;
#endif
  }

  if (share->calc_checksum)
    cur_row->checksum= (uint) (uchar) *data++;
  /* data now points on null bits */
  memcpy(record, data, cur_null_bytes);
  if (unlikely(cur_null_bytes != null_bytes))
  {
    /*
      This only happens if we have added more NULL columns with
      ALTER TABLE and are fetching an old, not yet modified old row
    */
    bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
  }
  data+= null_bytes;
  /* We copy the empty bits to be able to use them for delete/update */
  memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
  data+= share->base.pack_bytes;

  /* TODO: Use field offsets, instead of just skipping them */
  data+= share->base.field_offsets * FIELD_OFFSET_SIZE;

  /*
    Read row extents (note that first extent was already read into
    cur_row->extents above)
  */
  if (row_extents > 1)
  {
    if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
                       (row_extents - 1) * ROW_EXTENT_SIZE,
                       &extent, &data, &end_of_data))
      DBUG_RETURN(my_errno);
  }

  /*
    Data now points to start of fixed length field data that can't be null
    or 'empty'. Note that these fields can't be split over blocks.
  */
  for (column= share->columndef,
         end_column= column + share->base.fixed_not_null_fields;
       column < end_column; column++)
  {
    uint column_length= column->length;
    if (data + column_length > end_of_data &&
        !(data= read_next_extent(info, &extent, &end_of_data)))
      goto err;
    memcpy(record + column->offset, data, column_length);
    data+= column_length;
  }

  /* Read array of field lengths. This may be stored in several extents */
  if (field_lengths)
  {
    field_length_data= cur_row->field_lengths;
    if (read_long_data(info, field_length_data, field_lengths, &extent,
                       &data, &end_of_data))
      DBUG_RETURN(my_errno);
  }

  /* Read variable length data. Each of these may be split over many extents */
  for (end_column= share->columndef + share->base.fields;
       column < end_column; column++)
  {
    enum en_fieldtype type= column->type;
    uchar *field_pos= record + column->offset;
    /* First check if field is present in record */
    if ((record[column->null_pos] & column->null_bit) ||
        (column->empty_bit &&
         (cur_row->empty_bits[column->empty_pos] & column->empty_bit)))
    {
      /* Field is NULL or empty: nothing is stored, fill in default bytes */
      bfill(record + column->offset, column->fill_length,
            type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
      continue;
    }
    switch (type) {
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_SKIP_PRESPACE:
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      if (data + column->length > end_of_data &&
          !(data= read_next_extent(info, &extent, &end_of_data)))
        goto err;
      memcpy(field_pos, data, column->length);
      data+= column->length;
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Char that is space filled */
      uint length;
      /* Stored length takes 1 byte for columns up to 255 bytes, else 2 */
      if (column->length <= 255)
        length= (uint) (uchar) *field_length_data++;
      else
      {
        length= uint2korr(field_length_data);
        field_length_data+= 2;
      }
#ifdef SANITY_CHECKS
      if (length > column->length)
        goto err;
#endif
      if (read_long_data(info, field_pos, length, &extent, &data,
                         &end_of_data))
        DBUG_RETURN(my_errno);
      bfill(field_pos + length, column->length - length, ' ');
      break;
    }
    case FIELD_VARCHAR:
    {
      ulong length;
      uint pack_length __attribute__((unused));
      /* The length prefix is copied into the record in front of the data */
      if (column->length <= 256)
      {
        length= (uint) (uchar) (*field_pos++= *field_length_data++);
        pack_length= 1;
      }
      else
      {
        length= uint2korr(field_length_data);
        field_pos[0]= field_length_data[0];
        field_pos[1]= field_length_data[1];
        field_pos+= 2;
        field_length_data+= 2;
        pack_length= 2;
      }
#ifdef SANITY_CHECKS
      if (length > column->length - pack_length)
        goto err;
#endif
      if (read_long_data(info, field_pos, length, &extent, &data,
                         &end_of_data))
        DBUG_RETURN(my_errno);
      MEM_UNDEFINED(field_pos + length, column->length - length - pack_length);
      break;
    }
    case FIELD_BLOB:
    {
      uint column_size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(column_size_length,
                                              field_length_data);

      if (!found_blob)
      {
        /* Calculate total length for all blobs */
        ulong blob_lengths= 0;
        uchar *length_data= field_length_data;
        MARIA_COLUMNDEF *blob_field= column;

        found_blob= 1;
        /*
          NOTE(review): this loop treats every column from here to
          end_column as a blob, i.e. it relies on blobs being the last
          columns in share->columndef - confirm.
        */
        for (; blob_field < end_column; blob_field++)
        {
          uint size_length;
          if ((record[blob_field->null_pos] & blob_field->null_bit) ||
              (blob_field->empty_bit &
               (cur_row->empty_bits[blob_field->empty_pos] &
                blob_field->empty_bit)))
            continue;
          size_length= blob_field->length - portable_sizeof_char_ptr;
          blob_lengths+= _ma_calc_blob_length(size_length, length_data);
          length_data+= size_length;
        }
        cur_row->blob_length= blob_lengths;
        DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
        /* One buffer (info->blob_buff) holds the data of all blobs */
        if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
                             blob_lengths, myflag))
          DBUG_RETURN(my_errno);
        blob_buffer= info->blob_buff;
      }

      /* Store blob length followed by a pointer to the blob data */
      memcpy(field_pos, field_length_data, column_size_length);
      memcpy(field_pos + column_size_length, (uchar *) &blob_buffer,
             sizeof(char*));
      field_length_data+= column_size_length;

      /*
        After we have read one extent, then each blob is in its own extent
      */
      if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
        end_of_data= data;                      /* Force read of next extent */

      if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
                         &end_of_data))
        DBUG_RETURN(my_errno);
      blob_buffer+= blob_length;
      break;
    }
    default:
#ifdef EXTRA_DEBUG
      DBUG_ASSERT(0);                           /* purecov: deadcode */
#endif
      goto err;
    }
    continue;
  }

  if (row_extents)
  {
    DBUG_PRINT("info", ("Row read: page_count: %u extent_count: %u",
                        extent.page_count, extent.extent_count));
    *extent.tail_positions= 0;                  /* End marker */
    /* All pages of the current extent must have been used */
    if (extent.page_count)
      goto err;
    if (extent.extent_count > 1)
    {
      /* Extents that were not used by the row must be zero filled */
      if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
                            (extent.extent_count-1) * ROW_EXTENT_SIZE))
      {
        DBUG_PRINT("error", ("Data in extent is not zero"));
        DBUG_DUMP("extent", extent.extent + ROW_EXTENT_SIZE,
                  (extent.extent_count-1) * ROW_EXTENT_SIZE);
        goto err;
      }
    }
  }
  else
  {
    DBUG_PRINT("info", ("Row read"));
    /*
      data should normally point to end_of_data. The only exception is if
      the row is very short in which case we allocated 'min_block_length' data
      for allowing the row to expand.
    */
    if (data != end_of_data && (uint) (end_of_data - start_of_data) >
        share->base.min_block_length)
      goto err;
  }
#ifdef EXTRA_DEBUG
  if (share->calc_checksum && !info->in_check_table)
  {
    /* Ensure that row checksum is correct */
    DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
                cur_row->checksum);
  }
#endif
  info->update|= HA_STATE_AKTIV;                /* We have an active record */
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  /* Something was wrong with data on record */
  DBUG_PRINT("error", ("Found record with wrong data"));
  _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD);
  DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
}


/** @brief Read positions to tail blocks and full blocks

  @fn    read_row_extent_info()
  @param info           Handler
  @param buff           Page buffer holding the head of the row
  @param record_number  Directory entry of the row in buff

  @notes
    This function is a simpler version of _ma_read_block_record2()
    The data about the used pages is stored in info->cur_row.

  @return Status
  @retval 0   ok
  @retval 1   Error. my_errno contains error number
*/

static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
                                    uint record_number)
{
  MARIA_SHARE *share= info->s;
  MARIA_EXTENT_CURSOR extent;
  MARIA_RECORD_POS *tail_pos;
  uchar *data, *end_of_data;
  uint flag, row_extents, row_extents_size;
  uint field_lengths __attribute__ ((unused));
  uchar *extents, *end;
  myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
  DBUG_ENTER("read_row_extent_info");

  if (!(data= get_record_position(share, buff,
                                  record_number, &end_of_data)))
    DBUG_RETURN(1);                             /* Wrong in record */

  flag= (uint) (uchar) data[0];
  /* Skip trans header */
  data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];

  row_extents= 0;
  row_extents_size= 0;
  if (flag & ROW_FLAG_EXTENTS)
  {
    /*
      Record is split over many data pages.
      Get number of extents and first extent
    */
    get_key_length(row_extents, data);
    row_extents_size= row_extents * ROW_EXTENT_SIZE;
    if (info->cur_row.extents_buffer_length < row_extents_size &&
        _ma_alloc_buffer(&info->cur_row.extents,
                         &info->cur_row.extents_buffer_length,
                         row_extents_size, myflag))
      DBUG_RETURN(1);
    memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
    data+= ROW_EXTENT_SIZE;
    init_extent(&extent, info->cur_row.extents, row_extents,
                info->cur_row.tail_positions);
    extent.first_extent= 1;
  }
  info->cur_row.extents_count= row_extents;

  /*
    field_lengths looks unused but get_key_length will
    increment data, which is required as data is used later.
  */
  if (share->base.max_field_lengths)
    get_key_length(field_lengths, data);

  if (share->calc_checksum)
    info->cur_row.checksum= (uint) (uchar) *data++;
  if (row_extents > 1)
  {
    /* Skip null bits, empty bits and field offsets to get to the extents */
    data+= share->base.null_bytes;
    data+= share->base.pack_bytes;
    data+= share->base.field_offsets * FIELD_OFFSET_SIZE;

    /*
      Read row extents (note that first extent was already read into
      info->cur_row.extents above)
      Lock tails with write lock as we will delete them later.
    */
    extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
                       row_extents_size - ROW_EXTENT_SIZE,
                       &extent, &data, &end_of_data))
      DBUG_RETURN(1);
  }

  /* Update tail_positions with pointer to tails */
  tail_pos= info->cur_row.tail_positions;
  for (extents= info->cur_row.extents, end= extents + row_extents_size;
       extents < end;
       extents+= ROW_EXTENT_SIZE)
  {
    pgcache_page_no_t page=  uint5korr(extents);
    uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
    if (page_count & TAIL_BIT)
      *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT |
                                                         START_EXTENT_BIT)));
  }
  *tail_pos= 0;                               /* End marker */
  DBUG_RETURN(0);
}


/*
  Read a record based on record position

  @fn     _ma_read_block_record()
  @param info                Maria handler
  @param record              Store record here
  @param record_pos          Record position

  @return Status
  @retval 0  ok
  @retval #  Error number
*/

int _ma_read_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos)
{
  MARIA_SHARE *share= info->s;
  uchar *data, *end_of_data, *buff;
  uint offset;
  int ret;
  DBUG_ENTER("_ma_read_block_record");
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) record_pos,
                       (ulong) ma_recordpos_to_page(record_pos),
                       ma_recordpos_to_dir_entry(record_pos)));

  /* 'offset' is the directory entry number of the row on its page */
  offset= ma_recordpos_to_dir_entry(record_pos);

  if (!(buff= pagecache_read(share->pagecache,
                             &info->dfile, ma_recordpos_to_page(record_pos), 0,
                             info->buff, share->page_type,
                             PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
    DBUG_RETURN(my_errno);

  /*
    Unallocated page access can happen if this is an access to a page where
    all rows were deleted as part of this statement.
  */
  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE ||
              (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == UNALLOCATED_PAGE);

  if (((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == UNALLOCATED_PAGE) ||
      !(data= get_record_position(share, buff, offset, &end_of_data)))
  {
    DBUG_ASSERT(!maria_assert_if_crashed_table);
    DBUG_PRINT("warning", ("Wrong directory entry in data block"));
    my_errno= HA_ERR_RECORD_DELETED;           /* File crashed */
    DBUG_RETURN(HA_ERR_RECORD_DELETED);
  }
  ret= _ma_read_block_record2(info, record, data, end_of_data);
  DBUG_RETURN(ret);
}


/*
  Compare unique constraint between stored rows

  Reads the row at 'pos' into a temporary record buffer and compares it
  with 'record' using _ma_unique_comp().

  RETURN
    0  The rows are equal according to 'def'
    1  The rows differ, or the stored row could not be read, or the
       temporary record buffer could not be allocated
*/

my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
                             const uchar *record, MARIA_RECORD_POS pos)
{
  uchar *org_rec_buff, *old_record;
  size_t org_rec_buff_size;
  int error;
  my_bool buff_alloced;
  DBUG_ENTER("_ma_cmp_block_unique");

  alloc_on_stack(*info->stack_end_ptr, old_record, buff_alloced,
                 info->s->base.reclength);
  if (!old_record)
    DBUG_RETURN(1);

  /* Don't let the compare destroy blobs that may be in use */
  org_rec_buff=      info->rec_buff;
  org_rec_buff_size= info->rec_buff_size;
  if (info->s->base.blobs)
  {
    /* Force realloc of record buffer*/
    info->rec_buff= 0;
    info->rec_buff_size= 0;
  }
  error= _ma_read_block_record(info, old_record, pos);
  if (!error)
    error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
  if (info->s->base.blobs)
  {
    /* Free any buffer allocated during the read and restore the saved one */
    my_free(info->rec_buff);
    info->rec_buff=      org_rec_buff;
    info->rec_buff_size= org_rec_buff_size;
  }
  DBUG_PRINT("exit", ("result: %d", error));
  stack_alloc_free(old_record, buff_alloced);
  DBUG_RETURN(error != 0);
}


/****************************************************************************
  Table scan
****************************************************************************/

/*
  Allocate buffers for table scan

  SYNOPSIS
   _ma_scan_init_block_record(MARIA_HA *info)

  IMPLEMENTATION
    We allocate one
buffer for the current bitmap and one buffer for the + current page + + RETURN + 0 ok + 1 error (couldn't allocate memory or disk error) +*/ + +my_bool _ma_scan_init_block_record(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0); + DBUG_ENTER("_ma_scan_init_block_record"); + DBUG_ASSERT(info->dfile.file == share->bitmap.file.file); + + /* + bitmap_buff may already be allocated if this is the second call to + rnd_init() without a rnd_end() in between, see sql/handler.h + */ + if (!(info->scan.bitmap_buff || + ((info->scan.bitmap_buff= + (uchar *) my_malloc(PSI_INSTRUMENT_ME, share->block_size * 2, + flag))))) + DBUG_RETURN(1); + info->scan.page_buff= info->scan.bitmap_buff + share->block_size; + info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.max_total_size; + + /* Set scan variables to get _ma_scan_block() to start with reading bitmap */ + info->scan.number_of_rows= 0; + info->scan.bitmap_pos= info->scan.bitmap_end; + info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered; + info->scan.max_page= share->state.state.data_file_length / share->block_size; + /* + We need to flush what's in memory (bitmap.map) to page cache otherwise, as + we are going to read bitmaps from page cache in table scan (see + _ma_scan_block_record()), we may miss recently inserted rows (bitmap page + in page cache would be too old). 
+ */ + DBUG_RETURN(_ma_bitmap_flush(info->s)); +} + + +/* Free buffers allocated by _ma_scan_block_init() */ + +void _ma_scan_end_block_record(MARIA_HA *info) +{ + DBUG_ENTER("_ma_scan_end_block_record"); + my_free(info->scan.bitmap_buff); + info->scan.bitmap_buff= 0; + if (info->scan_save) + { + my_free(info->scan_save); + info->scan_save= 0; + } + DBUG_VOID_RETURN; +} + + +/** + @brief Save current scan position + + @note + For the moment we can only remember one position, but this is + good enough for MySQL usage + + @return + @retval 0 ok + @retval HA_ERR_WRONG_IN_RECORD Could not allocate memory to hold position +*/ + +int _ma_scan_remember_block_record(MARIA_HA *info, + MARIA_RECORD_POS *lastpos) +{ + uchar *bitmap_buff; + DBUG_ENTER("_ma_scan_remember_block_record"); + if (!(info->scan_save)) + { + if (!(info->scan_save= my_malloc(PSI_INSTRUMENT_ME, + ALIGN_SIZE(sizeof(*info->scan_save)) + + info->s->block_size * 2, + MYF(MY_WME)))) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + info->scan_save->bitmap_buff= ((uchar*) info->scan_save + + ALIGN_SIZE(sizeof(*info->scan_save))); + } + /* For checking if pages have changed since we last read it */ + info->scan.row_changes= info->row_changes; + + /* Remember used bitmap and used head page */ + bitmap_buff= info->scan_save->bitmap_buff; + memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save)); + info->scan_save->bitmap_buff= bitmap_buff; + memcpy(bitmap_buff, info->scan.bitmap_buff, info->s->block_size * 2); + + /* Point to the last read row */ + *lastpos= info->cur_row.nextpos - 1; + info->scan_save->dir+= DIR_ENTRY_SIZE; + DBUG_RETURN(0); +} + + +/** + @brief restore scan block it's original values + + @return + 0 ok + # error + + @note + In theory we could swap bitmap buffers instead of copy them. + For the moment we don't do that because there are variables pointing + inside the buffers and it's a bit of hassle to either make them relative + or repoint them. 
+ + If the data file has changed, we will re-read the new block record + to ensure that when we continue scanning we can ignore any deleted rows. +*/ + +int _ma_scan_restore_block_record(MARIA_HA *info, + MARIA_RECORD_POS lastpos) +{ + uchar *bitmap_buff; + DBUG_ENTER("_ma_scan_restore_block_record"); + + info->cur_row.nextpos= lastpos; + bitmap_buff= info->scan.bitmap_buff; + memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save)); + info->scan.bitmap_buff= bitmap_buff; + memcpy(bitmap_buff, info->scan_save->bitmap_buff, info->s->block_size * 2); + + if (info->scan.row_changes != info->row_changes) + { + /* + Table has been changed. We have to re-read the current page block as + data may have changed on it that we have to see. + */ + if (!(pagecache_read(info->s->pagecache, + &info->dfile, + ma_recordpos_to_page(info->scan.row_base_page), + 0, info->scan.page_buff, + info->s->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + DBUG_RETURN(my_errno); + info->scan.number_of_rows= + (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]; + info->scan.dir_end= (info->scan.page_buff + info->s->block_size - + PAGE_SUFFIX_SIZE - + info->scan.number_of_rows * DIR_ENTRY_SIZE); + } + DBUG_RETURN(0); +} + + +/* + Read next record while scanning table + + SYNOPSIS + _ma_scan_block_record() + info Maria handler + record Store found here + record_pos Value stored in info->cur_row.next_pos after last call + This is offset inside the current pagebuff + skip_deleted + + NOTES + - One must have called mi_scan() before this + - In this version, we don't actually need record_pos, we as easily + use a variable in info->scan + + IMPLEMENTATION + Current code uses a lot of goto's to separate the different kind of + states we may be in. This gives us a minimum of executed if's for + the normal cases. I tried several different ways to code this, but + the current one was in the end the most readable and fastest. 
+ + RETURN + 0 ok + # Error code (Normally HA_ERR_END_OF_FILE) +*/ + +int _ma_scan_block_record(MARIA_HA *info, uchar *record, + MARIA_RECORD_POS record_pos, + my_bool skip_deleted __attribute__ ((unused))) +{ + uint block_size; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_scan_block_record"); + +restart_record_read: + /* Find next row in current page */ + while (likely(record_pos < info->scan.number_of_rows)) + { + uint length, offset; + uchar *data, *end_of_data; + int error; + + /* Ensure that scan.dir and record_pos are in sync */ + DBUG_ASSERT(info->scan.dir == dir_entry_pos(info->scan.page_buff, + share->block_size, + (uint) record_pos)); + + /* Search for a valid directory entry (not 0) */ + while (!(offset= uint2korr(info->scan.dir))) + { + info->scan.dir-= DIR_ENTRY_SIZE; + record_pos++; +#ifdef SANITY_CHECKS + if (info->scan.dir < info->scan.dir_end) + { + DBUG_ASSERT(!maria_assert_if_crashed_table); + goto err; + } +#endif + } + /* + This should always be true as the directory should always start with + a valid entry. 
+ */ + DBUG_ASSERT(info->scan.dir >= info->scan.dir_end); + + /* found row */ + info->cur_row.lastpos= info->scan.row_base_page + record_pos; + info->cur_row.nextpos= record_pos + 1; + data= info->scan.page_buff + offset; + length= uint2korr(info->scan.dir + 2); + end_of_data= data + length; + info->scan.dir-= DIR_ENTRY_SIZE; /* Point to next row to process */ +#ifdef SANITY_CHECKS + if (end_of_data > info->scan.dir_end || + offset < PAGE_HEADER_SIZE(share) || + length < share->base.min_block_length) + { + DBUG_ASSERT(!(end_of_data > info->scan.dir_end)); + DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE(share))); + DBUG_ASSERT(!(length < share->base.min_block_length)); + goto err; + } +#endif + DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos)); + error= _ma_read_block_record2(info, record, data, end_of_data); + if (error != HA_ERR_ROW_NOT_VISIBLE) + DBUG_RETURN(error); + record_pos++; + } + + /* Find next head page in current bitmap */ +restart_bitmap_scan: + block_size= share->block_size; + if (likely(info->scan.bitmap_pos < info->scan.bitmap_end)) + { + uchar *data= info->scan.bitmap_pos; + longlong bits= info->scan.bits; + uint bit_pos= info->scan.bit_pos; + + do + { + while (likely(bits)) + { + uint pattern= (uint) (bits & 7); + bits >>= 3; + bit_pos++; + if (pattern > 0 && pattern <= 4) + { + /* Found head page; Read it */ + pgcache_page_no_t page; + info->scan.bitmap_pos= data; + info->scan.bits= bits; + info->scan.bit_pos= bit_pos; + page= (info->scan.bitmap_page + 1 + + (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1); + info->scan.row_base_page= ma_recordpos(page, 0); + if (page >= info->scan.max_page) + { + DBUG_PRINT("info", ("Found end of file")); + DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE)); + } + if (!(pagecache_read(share->pagecache, + &info->dfile, + page, 0, info->scan.page_buff, + share->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + DBUG_RETURN(my_errno); + if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != + 
HEAD_PAGE)) + { + /* + This may happen if someone has been deleting all rows + from a page since we read the bitmap, so it may be ok. + Print warning in debug log and continue. + */ + DBUG_PRINT("warning", + ("Found page of type %d when expecting head page", + (info->scan.page_buff[PAGE_TYPE_OFFSET] & + PAGE_TYPE_MASK))); + continue; + } + if ((info->scan.number_of_rows= + (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0) + { + DBUG_PRINT("error", ("Wrong page header")); + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + DBUG_RETURN(HA_ERR_WRONG_IN_RECORD); + } + DBUG_PRINT("info", ("Page %lu has %u rows", + (ulong) page, info->scan.number_of_rows)); + info->scan.dir= (info->scan.page_buff + block_size - + PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE); + info->scan.dir_end= (info->scan.dir - + (info->scan.number_of_rows - 1) * + DIR_ENTRY_SIZE); + record_pos= 0; + goto restart_record_read; + } + } + for (data+= 6; data < info->scan.bitmap_end; data+= 6) + { + bits= uint6korr(data); + /* Skip not allocated pages and blob / full tail pages */ + if (bits && bits != 07777777777777777LL) + break; + } + bit_pos= 0; + } while (data < info->scan.bitmap_end); + } + + /* Read next bitmap */ + info->scan.bitmap_page+= share->bitmap.pages_covered; + if (unlikely(info->scan.bitmap_page >= info->scan.max_page)) + { + DBUG_PRINT("info", ("Found end of file")); + DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE)); + } + DBUG_PRINT("info", ("Reading bitmap at %lu", + (ulong) info->scan.bitmap_page)); + if (!(pagecache_read(share->pagecache, &info->s->bitmap.file, + info->scan.bitmap_page, + 0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + DBUG_RETURN(my_errno); + /* Skip scanning 'bits' in bitmap scan code */ + info->scan.bitmap_pos= info->scan.bitmap_buff - 6; + info->scan.bits= 0; + goto restart_bitmap_scan; + +err: + DBUG_ASSERT(!maria_assert_if_crashed_table); + DBUG_PRINT("error", ("Wrong data on page")); + _ma_set_fatal_error(info, 
HA_ERR_WRONG_IN_RECORD); + DBUG_RETURN(HA_ERR_WRONG_IN_RECORD); +} + + +/* + Compare a row against a stored one + + NOTES + Not implemented, as block record is not supposed to be used in a shared + global environment +*/ + +my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)), + const uchar *record __attribute__ ((unused))) +{ + return 0; +} + + +/* + Store an integer with simple packing + + SYNOPSIS + ma_store_integer() + to Store the packed integer here + nr Integer to store + + NOTES + This is mostly used to store field numbers and lengths of strings. + We have to cast the result for the LL() becasue of a bug in Forte CC + compiler. + + Packing used is: + nr < 251 is stored as is (in 1 byte) + Numbers that require 1-4 bytes are stored as char(250+byte_length), data + Bigger numbers are stored as 255, data as ulonglong (not yet done). + + RETURN + Position in 'to' after the packed length +*/ + +uchar *ma_store_length(uchar *to, ulong nr) +{ + if (nr < 251) + { + *to=(uchar) nr; + return to+1; + } + if (nr < 65536) + { + if (nr <= 255) + { + to[0]= (uchar) 251; + to[1]= (uchar) nr; + return to+2; + } + to[0]= (uchar) 252; + int2store(to+1, nr); + return to+3; + } + if (nr < 16777216) + { + *to++= (uchar) 253; + int3store(to, nr); + return to+3; + } + *to++= (uchar) 254; + int4store(to, nr); + return to+4; +} + + +/* Calculate how many bytes needed to store a number */ + +uint ma_calc_length_for_store_length(ulong nr) +{ + if (nr < 251) + return 1; + if (nr < 65536) + { + if (nr <= 255) + return 2; + return 3; + } + if (nr < 16777216) + return 4; + return 5; +} + + +/* Retrive a stored number */ + +static ulong ma_get_length(const uchar **packet) +{ + reg1 const uchar *pos= *packet; + if (*pos < 251) + { + (*packet)++; + return (ulong) *pos; + } + if (*pos == 251) + { + (*packet)+= 2; + return (ulong) pos[1]; + } + if (*pos == 252) + { + (*packet)+= 3; + return (ulong) uint2korr(pos+1); + } + if (*pos == 253) + { + (*packet)+= 4; + return 
(ulong) uint3korr(pos+1); + } + DBUG_ASSERT(*pos == 254); + (*packet)+= 5; + return (ulong) uint4korr(pos+1); +} + + +/* + Fill array with pointers to field parts to be stored in log for insert + + SYNOPSIS + fill_insert_undo_parts() + info Maria handler + record Inserted row + log_parts Store pointers to changed memory areas here + log_parts_count See RETURN + + NOTES + We have information in info->cur_row about the read row. + + RETURN + length of data in log_parts. + log_parts_count contains number of used log_parts +*/ + +static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record, + LEX_CUSTRING *log_parts, + uint *log_parts_count) +{ + MARIA_SHARE *share= info->s; + MARIA_COLUMNDEF *column, *end_column; + uchar *field_lengths= info->cur_row.field_lengths; + size_t row_length; + MARIA_ROW *cur_row= &info->cur_row; + LEX_CUSTRING *start_log_parts; + DBUG_ENTER("fill_insert_undo_parts"); + + start_log_parts= log_parts; + + /* Store null bits */ + log_parts->str= record; + log_parts->length= share->base.null_bytes; + row_length= log_parts->length; + log_parts++; + + /* Stored bitmap over packed (zero length or all-zero fields) */ + log_parts->str= info->cur_row.empty_bits; + log_parts->length= share->base.pack_bytes; + row_length+= log_parts->length; + log_parts++; + + if (share->base.max_field_lengths) + { + /* Store length of all not empty char, varchar and blob fields */ + log_parts->str= field_lengths - 2; + log_parts->length= info->cur_row.field_lengths_length+2; + int2store(log_parts->str, info->cur_row.field_lengths_length); + row_length+= log_parts->length; + log_parts++; + } + + if (share->base.blobs) + { + /* + Store total blob length to make buffer allocation easier during UNDO + */ + log_parts->str= info->length_buff; + log_parts->length= (uint) (ma_store_length(info->length_buff, + info->cur_row.blob_length) - + (uchar*) log_parts->str); + row_length+= log_parts->length; + log_parts++; + } + + /* Handle constant length fields that are 
always present */ + for (column= share->columndef, + end_column= column+ share->base.fixed_not_null_fields; + column < end_column; + column++) + { + log_parts->str= record + column->offset; + log_parts->length= column->length; + row_length+= log_parts->length; + log_parts++; + } + + /* Handle NULL fields and CHAR/VARCHAR fields */ + for (end_column= share->columndef + share->base.fields - share->base.blobs; + column < end_column; + column++) + { + const uchar *column_pos; + size_t column_length; + if ((record[column->null_pos] & column->null_bit) || + (column->empty_bit && + cur_row->empty_bits[column->empty_pos] & column->empty_bit)) + continue; + + column_pos= record+ column->offset; + column_length= column->length; + + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + case FIELD_SKIP_PRESPACE: /* Not packed */ + case FIELD_SKIP_ZERO: /* Fixed length field */ + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + if (column->length <= 255) + column_length= *field_lengths++; + else + { + column_length= uint2korr(field_lengths); + field_lengths+= 2; + } + break; + } + case FIELD_VARCHAR: + { + if (column->fill_length == 1) + column_length= *field_lengths; + else + column_length= uint2korr(field_lengths); + field_lengths+= column->fill_length; + column_pos+= column->fill_length; + break; + } + default: + DBUG_ASSERT(0); + } + log_parts->str= column_pos; + log_parts->length= column_length; + row_length+= log_parts->length; + log_parts++; + } + + /* Add blobs */ + for (end_column+= share->base.blobs; column < end_column; column++) + { + const uchar *field_pos= record + column->offset; + uint size_length= column->length - portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length, field_pos); + + /* + We don't have to check for null, as blob_length is guranteed to be 0 + if the blob is null + */ + if (blob_length) + { + uchar *blob_pos; + memcpy(&blob_pos, record + column->offset + 
size_length, + sizeof(blob_pos)); + log_parts->str= blob_pos; + log_parts->length= blob_length; + row_length+= log_parts->length; + log_parts++; + } + } + *log_parts_count= (uint) (log_parts - start_log_parts); + DBUG_RETURN(row_length); +} + + +/* + Fill array with pointers to field parts to be stored in log for update + + SYNOPSIS + fill_update_undo_parts() + info Maria handler + oldrec Original row + newrec New row + log_parts Store pointers to changed memory areas here + log_parts_count See RETURN + + IMPLEMENTATION + Format of undo record: + + Fields are stored in same order as the field array. + + Offset to changed field data (packed) + + For each changed field + Fieldnumber (packed) + Length, if variable length field (packed) + + For each changed field + Data + + Packing is using ma_store_integer() + + The reason we store field numbers & length separated from data (ie, not + after each other) is to get better cpu caching when we loop over + fields (as we probably don't have to access data for each field when we + want to read and old row through the undo log record). + + As a special case, we use '255' for the field number of the null bitmap. + + RETURN + length of data in log_parts. 
+ log_parts_count contains number of used log_parts +*/ + +static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec, + const uchar *newrec, + LEX_CUSTRING *log_parts, + uint *log_parts_count) +{ + MARIA_SHARE *share= info->s; + MARIA_COLUMNDEF *column, *end_column; + MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row; + uchar *field_data, *start_field_data, *length_str; + uchar *old_field_lengths= old_row->field_lengths; + uchar *new_field_lengths= new_row->field_lengths; + size_t row_length= 0; + uint field_lengths; + LEX_CUSTRING *start_log_parts; + my_bool new_column_is_empty; + DBUG_ENTER("fill_update_undo_parts"); + + start_log_parts= log_parts; + + /* + First log part is for number of fields, field numbers and lengths + The +4 is to reserve place for the number of changed fields. + */ + start_field_data= field_data= info->update_field_data + 4; + log_parts++; + + if (memcmp(oldrec, newrec, share->base.null_bytes)) + { + /* Store changed null bits */ + *field_data++= (uchar) 255; /* Special case */ + log_parts->str= oldrec; + log_parts->length= share->base.null_bytes; + row_length= log_parts->length; + log_parts++; + } + + /* Handle constant length fields */ + for (column= share->columndef, + end_column= column+ share->base.fixed_not_null_fields; + column < end_column; + column++) + { + if (memcmp(oldrec + column->offset, newrec + column->offset, + column->length)) + { + field_data= ma_store_length(field_data, + (uint) (column - share->columndef)); + log_parts->str= oldrec + column->offset; + log_parts->length= column->length; + row_length+= column->length; + log_parts++; + } + } + + /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */ + for (end_column= share->columndef + share->base.fields; + column < end_column; + column++) + { + const uchar *new_column_pos, *old_column_pos; + size_t new_column_length, old_column_length; + + /* First check if old column is null or empty */ + if (oldrec[column->null_pos] & 
column->null_bit) + { + /* + It's safe to skip this one as either the new column is also null + (no change) or the new_column is not null, in which case the null-bit + maps differed and we have already stored the null bitmap. + */ + continue; + } + if (column->empty_bit && + (old_row->empty_bits[column->empty_pos] & column->empty_bit)) + { + if (new_row->empty_bits[column->empty_pos] & column->empty_bit) + continue; /* Both are empty; skip */ + + /* Store null length column */ + field_data= ma_store_length(field_data, + (uint) (column - share->columndef)); + field_data= ma_store_length(field_data, 0); + continue; + } + /* + Remember if the 'new' value is empty (as in this case we must always + log the original value + */ + new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) || + (column->empty_bit && + (new_row->empty_bits[column->empty_pos] & + column->empty_bit))); + + old_column_pos= oldrec + column->offset; + new_column_pos= newrec + column->offset; + old_column_length= new_column_length= column->length; + + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + case FIELD_SKIP_PRESPACE: /* Not packed */ + case FIELD_SKIP_ZERO: /* Fixed length field */ + break; + case FIELD_VARCHAR: + new_column_length--; /* Skip length prefix */ + old_column_pos+= column->fill_length; + new_column_pos+= column->fill_length; + /* Fall through */ + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + if (new_column_length <= 255) + { + old_column_length= *old_field_lengths++; + if (!new_column_is_empty) + new_column_length= *new_field_lengths++; + } + else + { + old_column_length= uint2korr(old_field_lengths); + old_field_lengths+= 2; + if (!new_column_is_empty) + { + new_column_length= uint2korr(new_field_lengths); + new_field_lengths+= 2; + } + } + break; + } + case FIELD_BLOB: + { + uint size_length= column->length - portable_sizeof_char_ptr; + old_column_length= _ma_calc_blob_length(size_length, old_column_pos); + 
memcpy((void*) &old_column_pos, oldrec + column->offset + size_length, + sizeof(old_column_pos)); + if (!new_column_is_empty) + { + new_column_length= _ma_calc_blob_length(size_length, new_column_pos); + memcpy((void*) &new_column_pos, newrec + column->offset + size_length, + sizeof(old_column_pos)); + } + break; + } + default: + DBUG_ASSERT(0); + } + + if (new_column_is_empty || new_column_length != old_column_length || + memcmp(old_column_pos, new_column_pos, new_column_length)) + { + field_data= ma_store_length(field_data, + (ulong) (column - share->columndef)); + field_data= ma_store_length(field_data, (ulong) old_column_length); + + log_parts->str= old_column_pos; + log_parts->length= old_column_length; + row_length+= old_column_length; + log_parts++; + } + } + + *log_parts_count= (uint) (log_parts - start_log_parts); + + /* Store length of field length data before the field/field_lengths */ + field_lengths= (uint) (field_data - start_field_data); + length_str= start_field_data - ma_calc_length_for_store_length(field_lengths); + start_log_parts->str= length_str; + ma_store_length(length_str, field_lengths); + start_log_parts->length= (size_t) (field_data - start_log_parts->str); + row_length+= start_log_parts->length; + DBUG_RETURN(row_length); +} + +/*************************************************************************** + In-write hooks called under log's lock when log record is written +***************************************************************************/ + +/** + @brief Sets transaction's rec_lsn if needed + + A transaction sometimes writes a REDO even before the page is in the + pagecache (example: brand new head or tail pages; full pages). So, if + Checkpoint happens just after the REDO write, it needs to know that the + REDO phase must start before this REDO. Scanning the pagecache cannot + tell that as the page is not in the cache. 
So, transaction sets its rec_lsn + to the REDO's LSN or somewhere before, and Checkpoint reads the + transaction's rec_lsn. + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_redo(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, void *hook_arg + __attribute__ ((unused))) +{ + /* + Users of dummy_transaction_object must keep this TRN clean as it + is used by many threads (like those manipulating non-transactional + tables). It might be dangerous if one user sets rec_lsn or some other + member and it is picked up by another user (like putting this rec_lsn into + a page of a non-transactional table); it's safer if all members stay 0. So + non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not + call this hook; we trust them but verify ;) + */ + DBUG_ASSERT(trn->trid != 0); + /* + If the hook stays so simple, it would be faster to pass + !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn + to translog_write_record(), like Monty did in his original code, and not + have a hook. For now we keep it like this. 
+ */ + if (trn->rec_lsn == 0) + trn->rec_lsn= *lsn; + return 0; +} + + +/** + @brief Sets transaction's undo_lsn, first_undo_lsn if needed + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, void *hook_arg + __attribute__ ((unused))) +{ + DBUG_ASSERT(trn->trid != 0); + trn->undo_lsn= *lsn; + if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0)) + trn->first_undo_lsn= + trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn); + return 0; + /* + when we implement purging, we will specialize this hook: UNDO_PURGE + records will additionally set trn->undo_purge_lsn + */ +} + + +/** + @brief Sets the table's records count and checksum and others to 0, then + calls the generic REDO hook. + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_redo_delete_all(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, void *hook_arg) +{ + _ma_reset_status(tbl_info); + return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Updates "records" and "checksum" and calls the generic UNDO hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_row_insert(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + MARIA_SHARE *share= tbl_info->s; + share->state.state.records++; + share->state.state.checksum+= *(ha_checksum *)hook_arg; + return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Updates "records" and calls the generic UNDO hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_row_delete(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + 
MARIA_SHARE *share= tbl_info->s; + share->state.state.records--; + share->state.state.checksum+= *(ha_checksum *)hook_arg; + return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Upates "records" and "checksum" and calls the generic UNDO hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_row_update(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + MARIA_SHARE *share= tbl_info->s; + share->state.state.checksum+= *(ha_checksum *)hook_arg; + return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); +} + + +my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + /* + We are going to call maria_delete_all_rows(), but without logging and + syncing, as an optimization (if we crash before commit, the UNDO will + empty; if we crash after commit, we have flushed and forced the files). + Status still needs to be reset under log mutex, in case of a concurrent + checkpoint. + */ + _ma_reset_status(tbl_info); + return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Updates table's lsn_of_file_id. + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_file_id(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn + __attribute__ ((unused)), + MARIA_HA *tbl_info, + LSN *lsn, + void *hook_arg + __attribute__ ((unused))) +{ + DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0); + tbl_info->s->lsn_of_file_id= *lsn; + return 0; +} + + +/** + Updates transaction's rec_lsn when committing. + + A transaction writes its commit record before being committed in trnman, so + if Checkpoint happens just between the COMMIT record log write and the + commit in trnman, it will record that transaction is not committed. 
Assume + the transaction (trn1) did an INSERT; after the checkpoint, a second + transaction (trn2) does a DELETE of what trn1 has inserted. Then crash, + Checkpoint record says that trn1 was not committed, and REDO phase starts + from Checkpoint record's LSN. So it will not find the COMMIT record of + trn1, will want to roll back trn1, which will fail because the row/key + which it wants to delete does not exist anymore. + To avoid this, Checkpoint needs to know that the REDO phase must start + before this COMMIT record, so transaction sets its rec_lsn to the COMMIT's + record LSN, and as Checkpoint reads the transaction's rec_lsn, Checkpoint + will know. + + @note so after commit trn->rec_lsn is a "commit LSN", which could be of + use later. + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_commit(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, + MARIA_HA *tbl_info + __attribute__ ((unused)), + LSN *lsn, + void *hook_arg + __attribute__ ((unused))) +{ + trn->rec_lsn= *lsn; + return 0; +} + + +/*************************************************************************** + Applying of REDO log records +***************************************************************************/ + +/* + Apply changes to head and tail pages + + SYNOPSIS + _ma_apply_redo_insert_row_head_or_tail() + info Maria handler + lsn LSN to put on page + page_type HEAD_PAGE or TAIL_PAGE + new_page True if this is first entry on page + header Header (without FILEID) + data Data to be put on page + data_length Length of data + + NOTE + Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL + LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL + + RETURN + 0 ok + # Error number +*/ + +uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, + uint page_type, + my_bool new_page, + const uchar *header, + const uchar *data, + size_t data_length) +{ + MARIA_SHARE *share= info->s; + pgcache_page_no_t page; + uint rownr, 
empty_space; + uint block_size= share->block_size; + uint rec_offset; + uchar *buff, *dir; + uint result; + MARIA_PINNED_PAGE page_link; + enum pagecache_page_lock lock_method; + enum pagecache_page_pin pin_method; + my_off_t end_of_page; + uint error; + DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail"); + + page= page_korr(header); + rownr= dirpos_korr(header + PAGE_STORE_SIZE); + + DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u data_length: %u", + (ulong) ma_recordpos(page, rownr), + (ulong) page, rownr, (uint) data_length)); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + end_of_page= (page + 1) * share->block_size; + if (end_of_page > share->state.state.data_file_length) + { + DBUG_PRINT("info", ("Enlarging data file from %lu to %lu", + (ulong) share->state.state.data_file_length, + (ulong) end_of_page)); + /* + New page at end of file. Note that the test above is also positive if + data_file_length is not a multiple of block_size (system crashed while + writing the last page): in this case we just extend the last page and + fill it entirely with zeroes, then the REDO will put correct data on + it. 
+ */ + lock_method= PAGECACHE_LOCK_WRITE; + pin_method= PAGECACHE_PIN; + + DBUG_ASSERT(rownr == 0 && new_page); + if (rownr != 0 || !new_page) + goto crashed_file; + + buff= info->keyread_buff; + info->keyread_buff_used= 1; + make_empty_page(info, buff, page_type, 1); + empty_space= (block_size - PAGE_OVERHEAD_SIZE(share)); + rec_offset= PAGE_HEADER_SIZE(share); + dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE; + } + else + { + lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED; + pin_method= PAGECACHE_PIN_LEFT_PINNED; + + share->pagecache->readwrite_flags&= ~MY_WME; + share->silence_encryption_errors= 1; + buff= pagecache_read(share->pagecache, &info->dfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link); + share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags; + share->silence_encryption_errors= 0; + if (!buff) + { + /* Skip errors when reading outside of file and uninitialized pages */ + if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT && + my_errno != HA_ERR_WRONG_CRC && + my_errno != HA_ERR_DECRYPTION_FAILED)) + { + DBUG_PRINT("error", ("Error %d when reading page", (int) my_errno)); + goto err; + } + /* Create new page */ + buff= pagecache_block_link_to_buffer(page_link.link); + buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE; + } + else if (lsn_korr(buff) >= lsn) /* Test if already applied */ + { + check_skipped_lsn(info, lsn_korr(buff), 1, page); + /* Fix bitmap, just in case */ + empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + if (!enough_free_entries_on_page(share, buff)) + empty_space= 0; /* Page is full */ + + if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) + goto err; + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + DBUG_RETURN(0); + } + + if (((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type)) + { + /* + This is a page that has been freed before and now 
should be + changed to new type. + */ + if (!new_page) + { + DBUG_PRINT("error", + ("Found page of wrong type: %u, should have been %u", + (uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK), + page_type)); + goto crashed_file; + } + make_empty_page(info, buff, page_type, 0); + empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE; + (void) extend_directory(info, buff, block_size, 0, rownr, &empty_space, + page_type == HEAD_PAGE); + rec_offset= PAGE_HEADER_SIZE(share); + dir= dir_entry_pos(buff, block_size, rownr); + empty_space+= uint2korr(dir+2); + } + else + { + uint max_entry= (uint) buff[DIR_COUNT_OFFSET]; + uint length; + + DBUG_ASSERT(!new_page); + dir= dir_entry_pos(buff, block_size, rownr); + empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); + + if (max_entry <= rownr) + { + /* Add directory entry first in directory and data last on page */ + if (extend_directory(info, buff, block_size, max_entry, rownr, + &empty_space, page_type == HEAD_PAGE)) + goto crashed_file; + } + if (extend_area_on_page(info, buff, dir, rownr, + (uint) data_length, &empty_space, + &rec_offset, &length, page_type == HEAD_PAGE)) + goto crashed_file; + } + } + /* Copy data */ + int2store(dir+2, data_length); + memcpy(buff + rec_offset, data, data_length); + empty_space-= (uint) data_length; + int2store(buff + EMPTY_SPACE_OFFSET, empty_space); + + /* Fix bitmap */ + if (!enough_free_entries_on_page(share, buff)) + empty_space= 0; /* Page is full */ + if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) + goto err; + + /* + If page was not read before, write it but keep it pinned. + We don't update its LSN When we have processed all REDOs for this page + in the current REDO's group, we will stamp page with UNDO's LSN + (if we stamped it now, a next REDO, in + this group, for this page, would be skipped) and unpin then. 
+ */ + result= 0; + if (lock_method == PAGECACHE_LOCK_WRITE && + pagecache_write(share->pagecache, + &info->dfile, page, 0, + buff, PAGECACHE_PLAIN_PAGE, + lock_method, pin_method, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE)) + result= my_errno; + + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + + /* + Data page and bitmap page are in place, we can update data_file_length in + case we extended the file. We could not do it earlier: bitmap code tests + data_file_length to know if it has to create a new page or not. + */ + set_if_bigger(share->state.state.data_file_length, end_of_page); + DBUG_RETURN(result); + +crashed_file: + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); +err: + error= my_errno; + if (lock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED) + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + _ma_mark_file_crashed(share); + DBUG_ASSERT(!maria_assert_if_crashed_table); /* catch recovery error early */ + DBUG_RETURN((my_errno= error)); +} + + +/* + Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL + + SYNOPSIS + _ma_apply_redo_purge_row_head_or_tail() + info Maria handler + lsn LSN to put on page + page_type HEAD_PAGE or TAIL_PAGE + header Header (without FILEID) + + NOTES + This function is very similar to delete_head_or_tail() + + RETURN + 0 ok + # Error number +*/ + +uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn, + uint page_type, + const uchar *header) +{ + MARIA_SHARE *share= info->s; + pgcache_page_no_t page; + uint rownr, empty_space; + uchar *buff; + int result; + uint error; + MARIA_PINNED_PAGE page_link; + DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail"); + + page= page_korr(header); + rownr= dirpos_korr(header+PAGE_STORE_SIZE); + DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u", + (ulong) 
ma_recordpos(page, rownr), + (ulong) page, rownr)); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + if (!(buff= pagecache_read(share->pagecache, &info->dfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + goto err; + + if (lsn_korr(buff) >= lsn) + { + /* + Already applied + Note that in case the page is not anymore a head or tail page + a future redo will fix the bitmap. + */ + check_skipped_lsn(info, lsn_korr(buff), 1, page); + if ((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type) + { + empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET); + if (!enough_free_entries_on_page(share, buff)) + empty_space= 0; /* Page is full */ + if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, + empty_space)) + goto err; + } + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + DBUG_RETURN(0); + } + + DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type); + + if (delete_dir_entry(share, buff, rownr, &empty_space) < 0) + { + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + goto err; + } + + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + + result= 0; + if (!enough_free_entries_on_page(share, buff)) + empty_space= 0; /* Page is full */ + /* This will work even if the page was marked as UNALLOCATED_PAGE */ + if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) + result= my_errno; + + DBUG_RETURN(result); + +err: + error= my_errno; + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + _ma_mark_file_crashed(share); + DBUG_ASSERT(!maria_assert_if_crashed_table); + DBUG_RETURN((my_errno= error)); + +} + + +/** + @brief Apply LOGREC_REDO_FREE_BLOCKS + + @param info Maria 
handler + @param header Header (without FILEID) + + Mark the pages free in the bitmap. + + We have to check against _ma_redo_not_needed_for_page() + to guard against the case where we first clear a block and after + that insert new data into the blocks. If we would unconditionally + clear the bitmap here, future changes would be ignored for the page + if it's not in the dirty list (ie, it would be flushed). + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_apply_redo_free_blocks(MARIA_HA *info, + LSN lsn __attribute__((unused)), + LSN redo_lsn, + const uchar *header) +{ + MARIA_SHARE *share= info->s; + uint ranges; + uint16 sid; + DBUG_ENTER("_ma_apply_redo_free_blocks"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + sid= fileid_korr(header); + header+= FILEID_STORE_SIZE; + ranges= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + DBUG_ASSERT(ranges > 0); + + /** @todo leave bitmap lock to the bitmap code... 
*/ + mysql_mutex_lock(&share->bitmap.bitmap_lock); + while (ranges--) + { + my_bool res; + uint page_range; + pgcache_page_no_t page, start_page; + + start_page= page= page_korr(header); + header+= PAGE_STORE_SIZE; + /* Page range may have this bit set to indicate a tail page */ + page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT); + DBUG_ASSERT(page_range > 0); + + header+= PAGERANGE_STORE_SIZE; + + DBUG_PRINT("info", ("page: %lu pages: %u", (long) page, page_range)); + + for ( ; page_range-- ; start_page++) + { + if (_ma_redo_not_needed_for_page(sid, redo_lsn, start_page, FALSE)) + continue; + res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, start_page, + 1); + if (res) + { + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + _ma_mark_file_crashed(share); + DBUG_ASSERT(!maria_assert_if_crashed_table); + DBUG_RETURN(res); + } + } + } + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + DBUG_RETURN(0); +} + + +/** + @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL + + @param info Maria handler + @param header Header (without FILEID) + + @note It marks the page free in the bitmap, and sets the directory's count + to 0. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn, + const uchar *header) +{ + MARIA_SHARE *share= info->s; + uchar *buff; + pgcache_page_no_t page; + MARIA_PINNED_PAGE page_link; + my_bool res; + DBUG_ENTER("_ma_apply_redo_free_head_or_tail"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + page= page_korr(header); + + if (!(buff= pagecache_read(share->pagecache, + &info->dfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, &page_link.link))) + { + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + check_skipped_lsn(info, lsn_korr(buff), 1, page); + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + } + else + { + buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE; +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY + { + uint number_of_records= (uint) buff[DIR_COUNT_OFFSET]; + uchar *dir= dir_entry_pos(buff, share->block_size, + number_of_records-1); + buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST; + bzero(dir, number_of_records * DIR_ENTRY_SIZE); + } +#endif + + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + } + /** @todo leave bitmap lock to the bitmap code... 
*/ + mysql_mutex_lock(&share->bitmap.bitmap_lock); + res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1); + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + if (res) + goto err; + DBUG_RETURN(0); + +err: + _ma_mark_file_crashed(share); + DBUG_ASSERT(!maria_assert_if_crashed_table); + DBUG_RETURN(1); +} + + +/** + @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS + + @param info Maria handler + @param lsn LSN to put on pages + @param header Header (with FILEID) + @param redo_lsn REDO record's LSN + @param[out] number_of_blobs Number of blobs found in log record + @param[out] number_of_ranges Number of ranges found + @param[out] first_page First page touched + @param[out] last_page Last page touched + + @note Write full pages (full head & blob pages) + + @note Layout of header as parsed below: file id, number of ranges and + number of blobs; then for each blob its number of sub ranges and the + empty space on its last page, followed by one (page, page count) pair + per sub range. The page data starts after all of this. + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, + LSN lsn, const uchar *header, + LSN redo_lsn, + uint * const number_of_blobs, + uint * const number_of_ranges, + pgcache_page_no_t * const first_page, + pgcache_page_no_t * const last_page) +{ + MARIA_SHARE *share= info->s; + const uchar *data; + uint data_size= FULL_PAGE_SIZE(share); + uint blob_count, ranges; + uint16 sid; + pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0; + DBUG_ENTER("_ma_apply_redo_insert_row_blobs"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + sid= fileid_korr(header); + header+= FILEID_STORE_SIZE; + *number_of_ranges= ranges= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + *number_of_blobs= blob_count= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + DBUG_ASSERT(ranges >= blob_count); + + data= (header + ranges * ROW_EXTENT_SIZE + + blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE)); + + while (blob_count--) + { + uint sub_ranges, empty_space; + + sub_ranges= uint2korr(header); + header+= SUB_RANGE_SIZE; + empty_space= uint2korr(header); + header+= BLOCK_FILLER_SIZE; + 
DBUG_ASSERT(sub_ranges <= ranges && empty_space < data_size); + ranges-= sub_ranges; + + while (sub_ranges--) + { + uint i; + uint res; + uint page_range; + pgcache_page_no_t page; + uchar *buff; + uint data_on_page= data_size; + + page= page_korr(header); + header+= PAGE_STORE_SIZE; + page_range= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + + for (i= page_range; i-- > 0 ; page++, data+= data_on_page) + { + MARIA_PINNED_PAGE page_link; + enum pagecache_page_lock unlock_method; + enum pagecache_page_pin unpin_method; + + set_if_smaller(first_page2, page); + set_if_bigger(last_page2, page); + if (i == 0 && sub_ranges == 0) + data_on_page= data_size - empty_space; /* data on last page */ + if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE)) + continue; + + if (((page + 1) * share->block_size) > + share->state.state.data_file_length) + { + /* New page or half written page at end of file */ + DBUG_PRINT("info", ("Enlarging data file from %lu to %lu", + (ulong) share->state.state.data_file_length, + (ulong) ((page + 1 ) * share->block_size))); + share->state.state.data_file_length= (page + 1) * share->block_size; + buff= info->keyread_buff; + info->keyread_buff_used= 1; + make_empty_page(info, buff, BLOB_PAGE, 0); + unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED; + unpin_method= PAGECACHE_PIN_LEFT_UNPINNED; + } + else + { + share->pagecache->readwrite_flags&= ~MY_WME; + share->silence_encryption_errors= 1; + buff= pagecache_read(share->pagecache, + &info->dfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, &page_link.link); + share->pagecache->readwrite_flags= share->pagecache-> + org_readwrite_flags; + share->silence_encryption_errors= 0; + if (!buff) + { + if (my_errno != HA_ERR_FILE_TOO_SHORT && + my_errno != HA_ERR_WRONG_CRC && + my_errno != HA_ERR_DECRYPTION_FAILED) + { + /* If not read outside of file */ + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, 
LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + goto err; + } + /* + Physical file was too short, create new page. It can be that + recovery started with a file with N pages, wrote page N+2 into + pagecache (increased data_file_length but not physical file + length), now reads page N+1: the read fails. + */ + buff= pagecache_block_link_to_buffer(page_link.link); + make_empty_page(info, buff, BLOB_PAGE, 0); + } + else + { +#ifdef DBUG_ASSERT_EXISTS + uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK); +#endif + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + check_skipped_lsn(info, lsn_korr(buff), 1, page); + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + goto fix_bitmap; + } + DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) || + (found_page_type == (uchar) UNALLOCATED_PAGE)); + } + unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK; + unpin_method= PAGECACHE_UNPIN; + } + + /* + Blob pages are never updated twice in same redo-undo chain, so + it's safe to update lsn for them here + */ + lsn_store(buff, lsn); + buff[PAGE_TYPE_OFFSET]= BLOB_PAGE; + bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE, + FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE)); + + if (data_on_page != data_size) + { + /* + Last page may be only partly filled. We zero the rest, like + write_full_pages() does. + */ + bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space, + empty_space); + } + memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, data_on_page); + if (pagecache_write(share->pagecache, + &info->dfile, page, 0, + buff, PAGECACHE_PLAIN_PAGE, + unlock_method, unpin_method, + PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE)) + goto err; + + fix_bitmap: + /** @todo leave bitmap lock to the bitmap code... 
*/ + mysql_mutex_lock(&share->bitmap.bitmap_lock); + res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, page, + 1); + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + if (res) + goto err; + } + } + } + *first_page= first_page2; + *last_page= last_page2; + DBUG_RETURN(0); + +err: + _ma_mark_file_crashed(share); + DBUG_ASSERT(!maria_assert_if_crashed_table); + DBUG_RETURN(1); +} + + +/**************************************************************************** + Applying of UNDO entries +****************************************************************************/ + +/** + Execute undo of a row insert (delete the inserted row) + + @param info Maria handler + @param undo_lsn Passed on to _ma_write_clr() + @param header Log record data: page number, directory position and, + if the table has a checksum function, the row checksum + + @return Operation status + @retval 0 OK + @retval 1 Error; the table has been marked as crashed +*/ + +my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header) +{ + pgcache_page_no_t page; + uint rownr; + uchar *buff; + my_bool res; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + ha_checksum checksum; + LSN lsn; + DBUG_ENTER("_ma_apply_undo_row_insert"); + + page= page_korr(header); + header+= PAGE_STORE_SIZE; + rownr= dirpos_korr(header); + header+= DIRPOS_STORE_SIZE; + DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u", + (ulong) ma_recordpos(page, rownr), + (ulong) page, rownr)); + + buff= pagecache_read(share->pagecache, + &info->dfile, page, 0, + 0, share->page_type, + PAGECACHE_LOCK_WRITE, + &page_link.link); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= buff != 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + if (!buff) + goto err; + + if (read_row_extent_info(info, buff, rownr)) + goto err; + + _ma_bitmap_flushable(info, 1); + if (delete_head_or_tail(info, page, rownr, 1, 1) || + delete_tails(info, info->cur_row.tail_positions)) + goto err; + + if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row)) + goto err; + + checksum= 0; + if (share->calc_checksum) + checksum= (ha_checksum) 0 - ha_checksum_korr(header); + info->last_auto_increment= ~ (ulonglong) 0; + if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT, + 
share->calc_checksum != 0, checksum, &lsn, (void*) 0)) + goto err; + + res= 0; +end: + /* The following is true only if _ma_bitmap_flushable() was called earlier */ + if (info->non_flushable_state) + _ma_bitmap_flushable(info, -1); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); + +err: + DBUG_ASSERT(!maria_assert_if_crashed_table); + res= 1; + _ma_mark_file_crashed(share); + /* + Don't write a new LSN on the used pages. Not important as the file is + marked as crashed and needs to be repaired before it can be used. + */ + lsn= LSN_IMPOSSIBLE; + goto end; +} + + +/** + Execute undo of a row delete (insert the row back where it was) + + @param info Maria handler + @param undo_lsn Passed on to write_block_record() + @param header Log record data: page number, directory position, + length of the row on the head page and extent count, + followed by the optional checksum, the extent + information and the row's field data + @param header_length Not used + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, size_t header_length + __attribute__((unused))) +{ + MARIA_SHARE *share= info->s; + MARIA_ROW row; + MARIA_COLUMNDEF *column, *end_column; + MARIA_BITMAP_BLOCKS *blocks; + struct st_row_pos_info row_pos; + uchar *record; + const uchar *null_bits, *field_length_data, *extent_info; + pgcache_page_no_t page; + ulong *blob_lengths; + uint *null_field_lengths, extent_count, rownr, length_on_head_page; + DBUG_ENTER("_ma_apply_undo_row_delete"); + + /* + Use cur row as a base; We need to make a copy as we will change + some buffers to point directly to 'header' + */ + memcpy(&row, &info->cur_row, sizeof(row)); + + page= page_korr(header); + header+= PAGE_STORE_SIZE; + rownr= dirpos_korr(header); + header+= DIRPOS_STORE_SIZE; + length_on_head_page= uint2korr(header); + header+= 2; + extent_count= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u", + (ulong) ma_recordpos(page, rownr), + (ulong) page, rownr)); + + if (share->calc_checksum) + { + /* + We extract the checksum delta here, saving a recomputation in + allocate_and_write_block_record(). It's only an optimization. 
+ */ + row.checksum= (ha_checksum) 0 - ha_checksum_korr(header); + header+= HA_CHECKSUM_STORE_SIZE; + } + extent_info= header; + header+= extent_count * ROW_EXTENT_SIZE; + + null_field_lengths= row.null_field_lengths; + blob_lengths= row.blob_lengths; + + /* + Fill in info->cur_row with information about the row, like in + calc_record_size(), to be used by write_block_record() + */ + + row.normal_length= row.char_length= row.varchar_length= + row.blob_length= row.extents_count= row.field_lengths_length= 0; + + null_bits= header; + header+= share->base.null_bytes; + /* This will not be changed */ + row.empty_bits= (uchar*) header; + header+= share->base.pack_bytes; + if (share->base.max_field_lengths) + { + row.field_lengths_length= uint2korr(header); + row.field_lengths= (uchar*) header + 2 ; + header+= 2 + row.field_lengths_length; + } + if (share->base.blobs) + row.blob_length= ma_get_length(&header); + + /* We need to build up a record (without blobs) in rec_buff */ + if (!(record= my_malloc(PSI_INSTRUMENT_ME, share->base.reclength, + MYF(MY_WME)))) + DBUG_RETURN(1); + + memcpy(record, null_bits, share->base.null_bytes); + + /* Copy field information from header to record */ + + /* Handle constant length fields that are always present */ + for (column= share->columndef, + end_column= column+ share->base.fixed_not_null_fields; + column < end_column; + column++) + { + memcpy(record + column->offset, header, column->length); + header+= column->length; + } + + /* Handle NULL fields and CHAR/VARCHAR fields */ + field_length_data= row.field_lengths; + for (end_column= share->columndef + share->base.fields; + column < end_column; + column++, null_field_lengths++) + { + if ((record[column->null_pos] & column->null_bit) || + (column->empty_bit && + row.empty_bits[column->empty_pos] & column->empty_bit)) + { + if (column->type != FIELD_BLOB) + *null_field_lengths= 0; + else + *blob_lengths++= 0; + if (share->calc_checksum) + bfill(record + column->offset, 
column->fill_length, + column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0); + continue; + } + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + case FIELD_SKIP_PRESPACE: /* Not packed */ + case FIELD_SKIP_ZERO: /* Fixed length field */ + row.normal_length+= column->length; + *null_field_lengths= column->length; + memcpy(record + column->offset, header, column->length); + header+= column->length; + break; + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + uint length; + if (column->length <= 255) + length= (uint) *field_length_data++; + else + { + length= uint2korr(field_length_data); + field_length_data+= 2; + } + row.char_length+= length; + *null_field_lengths= length; + memcpy(record + column->offset, header, length); + if (share->calc_checksum) + bfill(record + column->offset + length, (column->length - length), + ' '); + header+= length; + break; + } + case FIELD_VARCHAR: + { + uint length; + uchar *field_pos= record + column->offset; + + /* 256 is correct as this includes the length uchar */ + if (column->fill_length == 1) + { + field_pos[0]= *field_length_data; + length= (uint) *field_length_data; + } + else + { + field_pos[0]= field_length_data[0]; + field_pos[1]= field_length_data[1]; + length= uint2korr(field_length_data); + } + field_length_data+= column->fill_length; + field_pos+= column->fill_length; + row.varchar_length+= length; + *null_field_lengths= length; + memcpy(field_pos, header, length); + header+= length; + break; + } + case FIELD_BLOB: + { + /* Copy length of blob and pointer to blob data to record */ + uchar *field_pos= record + column->offset; + uint size_length= column->length - portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length, field_length_data); + + memcpy(field_pos, field_length_data, size_length); + field_length_data+= size_length; + memcpy(field_pos + size_length, &header, sizeof(header)); + header+= blob_length; + *blob_lengths++= blob_length; + break; 
+ } + default: + DBUG_ASSERT(0); + } + } + row.head_length= (info->row_base_length + + share->base.fixed_not_null_fields_length + + row.field_lengths_length + + size_to_store_key_length(row.field_lengths_length) + + row.normal_length + + row.char_length + row.varchar_length); + row.total_length= (row.head_length + row.blob_length); + if (row.total_length < share->base.min_block_length) + row.total_length= share->base.min_block_length; + + /* + Row is now generated. Now we need to insert record on the original + pages with original size on each page. + */ + + _ma_bitmap_flushable(info, 1); + /* Change extent information to be usable by write_block_record() */ + blocks= &row.insert_blocks; + if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info)) + goto err; + blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info, + &share->bitmap, + page); + blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP; + + /* Read head page and allocate data for rowid */ + if (get_rowpos_in_head_or_tail_page(info, blocks->block, + info->buff, + length_on_head_page, + HEAD_PAGE, PAGECACHE_LOCK_WRITE, + rownr, &row_pos)) + goto err; + + if (share->calc_checksum) + { + DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record)); + } + /* Store same amount of data on head page as on original page */ + row_pos.length= (length_on_head_page - + (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE); + set_if_bigger(row_pos.length, share->base.min_block_length); + if (write_block_record(info, (uchar*) 0, record, &row, + blocks, blocks->block->org_bitmap_value != 0, + &row_pos, undo_lsn, 0)) + goto err; + + my_free(record); + DBUG_RETURN(0); + +err: + DBUG_ASSERT(!maria_assert_if_crashed_table); + _ma_mark_file_crashed(share); + if (info->non_flushable_state) + _ma_bitmap_flushable(info, -1); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); + my_free(record); + DBUG_RETURN(1); +} + + +/** + Execute undo of a row update + + @fn _ma_apply_undo_row_update() + + 
@return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn, + const uchar *header, + size_t header_length + __attribute__((unused))) +{ + MARIA_SHARE *share= info->s; + MARIA_RECORD_POS record_pos; + const uchar *field_length_data, *field_length_data_end, *extent_info; + uchar *current_record, *orig_record; + pgcache_page_no_t page; + ha_checksum UNINIT_VAR(checksum_delta); + uint rownr, field_length_header, extent_count, length_on_head_page; + int error; + DBUG_ENTER("_ma_apply_undo_row_update"); + + page= page_korr(header); + header+= PAGE_STORE_SIZE; + rownr= dirpos_korr(header); + header+= DIRPOS_STORE_SIZE; + + record_pos= ma_recordpos(page, rownr); + DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u", + (ulong) record_pos, (ulong) page, rownr)); + + if (share->calc_checksum) + { + checksum_delta= ha_checksum_korr(header); + header+= HA_CHECKSUM_STORE_SIZE; + } + length_on_head_page= uint2korr(header); + set_if_bigger(length_on_head_page, share->base.min_block_length); + header+= 2; + extent_count= pagerange_korr(header); + header+= PAGERANGE_STORE_SIZE; + extent_info= header; + header+= extent_count * ROW_EXTENT_SIZE; + + /* + Set header to point to old field values, generated by + fill_update_undo_parts() + */ + field_length_header= ma_get_length(&header); + field_length_data= (uchar*) header; + header+= field_length_header; + field_length_data_end= header; + + /* Allocate buffer for current row & original row */ + if (!(current_record= my_malloc(PSI_INSTRUMENT_ME, share->base.reclength * 2, + MYF(MY_WME)))) + DBUG_RETURN(1); + orig_record= current_record+ share->base.reclength; + + /* Read current record */ + if (_ma_read_block_record(info, current_record, record_pos)) + goto err; + + if (*field_length_data == 255) + { + /* Bitmap changed */ + field_length_data++; + memcpy(orig_record, header, share->base.null_bytes); + header+= share->base.null_bytes; + } + else + memcpy(orig_record, 
current_record, share->base.null_bytes); + bitmap_clear_all(&info->changed_fields); + + while (field_length_data < field_length_data_end) + { + uint field_nr= ma_get_length(&field_length_data), field_length; + MARIA_COLUMNDEF *column= share->columndef + field_nr; + uchar *orig_field_pos= orig_record + column->offset; + + bitmap_set_bit(&info->changed_fields, field_nr); + if (field_nr >= share->base.fixed_not_null_fields) + { + if (!(field_length= ma_get_length(&field_length_data))) + { + /* Null field or empty field */ + bfill(orig_field_pos, column->fill_length, + column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0); + continue; + } + } + else + field_length= column->length; + + switch (column->type) { + case FIELD_CHECK: + case FIELD_NORMAL: /* Fixed length field */ + case FIELD_ZERO: + case FIELD_SKIP_PRESPACE: /* Not packed */ + memcpy(orig_field_pos, header, column->length); + header+= column->length; + break; + case FIELD_SKIP_ZERO: /* Number */ + case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + uint diff; + memcpy(orig_field_pos, header, field_length); + if ((diff= (column->length - field_length))) + bfill(orig_field_pos + column->length - diff, diff, + column->type == FIELD_SKIP_ENDSPACE ? 
' ' : 0); + header+= field_length; + } + break; + case FIELD_VARCHAR: + if (column->length <= 256) + { + *orig_field_pos++= (uchar) field_length; + } + else + { + int2store(orig_field_pos, field_length); + orig_field_pos+= 2; + } + memcpy(orig_field_pos, header, field_length); + header+= field_length; + break; + case FIELD_BLOB: + { + uint size_length= column->length - portable_sizeof_char_ptr; + _ma_store_blob_length(orig_field_pos, size_length, field_length); + memcpy(orig_field_pos + size_length, &header, sizeof(header)); + header+= field_length; + break; + } + default: + DBUG_ASSERT(0); + } + } + copy_not_changed_fields(info, &info->changed_fields, + orig_record, current_record); + + if (share->calc_checksum) + { + info->new_row.checksum= checksum_delta + + (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record)); + /* verify that record's content is sane */ + DBUG_ASSERT(info->new_row.checksum == + (*share->calc_checksum)(info, current_record)); + } + + info->last_auto_increment= ~ (ulonglong) 0; + /* Now records are up to date, execute the update to original values */ + if (_ma_update_at_original_place(info, page, rownr, length_on_head_page, + extent_count, extent_info, + current_record, orig_record, undo_lsn)) + goto err; + + error= 0; +end: + my_free(current_record); + DBUG_RETURN(error); + +err: + DBUG_ASSERT(!maria_assert_if_crashed_table); + error= 1; + _ma_mark_file_crashed(share); + goto end; +} + + +/** + Execute undo of a bulk insert which used repair + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn) +{ + my_bool error; + LSN lsn; + DBUG_ENTER("_ma_apply_undo_bulk_insert"); + /* + We delete all rows, re-enable indices as bulk insert had disabled + non-unique ones. 
+ */ + error= (maria_delete_all_rows(info) || + maria_enable_indexes(info) || + /* we enabled indices so need '2' below */ + _ma_state_info_write(info->s, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO | + MA_STATE_INFO_WRITE_LOCK) || + _ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT, + FALSE, 0, &lsn, NULL)); + DBUG_RETURN(error); +} + + +/** + @brief Get the TRANSLOG_ADDRESS to flush up to + + @param page Page's content + @param page_no Page's number (<offset>/<page length>) + @param data_ptr Callback data pointer (pointer to MARIA_SHARE) + + @note + Usable for data (non-bitmap) and index pages + + @retval LSN to flush up to +*/ + +TRANSLOG_ADDRESS +maria_page_get_lsn(uchar *page, + pgcache_page_no_t page_no __attribute__((unused)), + uchar* data_ptr __attribute__((unused))) +{ +#ifndef DBUG_OFF + const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr; + DBUG_ASSERT(share->page_type == PAGECACHE_LSN_PAGE && + share->now_transactional); +#endif + return lsn_korr(page); +} + + +/** + @brief Enable reading of all rows, ignoring versioning + + @note + This is mainly useful in single user applications, like maria_pack, + where we want to be able to read all rows without having to read the + transaction id from the control file +*/ + +void maria_ignore_trids(MARIA_HA *info) +{ + if (info->s->base.born_transactional) + { + if (!info->trn) + _ma_set_tmp_trn_for_table(info, &dummy_transaction_object); + /* Ignore transaction id when row is read */ + info->trn->min_read_from= ~(TrID) 0; + } +} + + +#ifndef DBUG_OFF + +/* The following functions are useful to call from debugger */ + +void _ma_print_block_info(MARIA_SHARE *share, uchar *buff) +{ + LSN lsn= lsn_korr(buff); + + printf("LSN: " LSN_FMT " type: %u dir_entries: %u dir_free: %u empty_space: %u\n", + LSN_IN_PARTS(lsn), + (uint)buff[PAGE_TYPE_OFFSET], + (uint)buff[DIR_COUNT_OFFSET], + (uint)buff[DIR_FREE_OFFSET], + (uint) uint2korr(buff + EMPTY_SPACE_OFFSET)); + printf("Start of directory: 
%lu\n", + maria_block_size - PAGE_SUFFIX_SIZE - + (uint) buff[DIR_COUNT_OFFSET] * DIR_ENTRY_SIZE); + _ma_print_directory(share, stdout, buff, maria_block_size); +} +#endif diff --git a/storage/maria/ma_blockrec.h b/storage/maria/ma_blockrec.h new file mode 100644 index 00000000..42546ebd --- /dev/null +++ b/storage/maria/ma_blockrec.h @@ -0,0 +1,314 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Storage of records in block +*/ + +#define LSN_SIZE 7 +#define DIR_COUNT_SIZE 1 /* Stores number of rows on page */ +#define DIR_FREE_SIZE 1 /* Pointer to first free dir entry */ +#define EMPTY_SPACE_SIZE 2 /* Stores empty space on page */ +#define PAGE_TYPE_SIZE 1 +#define PAGE_SUFFIX_SIZE 4 /* Bytes for checksum */ +#define PAGE_HEADER_SIZE_RAW (LSN_SIZE + DIR_COUNT_SIZE + DIR_FREE_SIZE + \ + EMPTY_SPACE_SIZE + PAGE_TYPE_SIZE) + +#define PAGE_HEADER_SIZE(share) (PAGE_HEADER_SIZE_RAW + \ + (share)->crypt_page_header_space) + +#define PAGE_OVERHEAD_SIZE_RAW (PAGE_HEADER_SIZE_RAW + DIR_ENTRY_SIZE + \ + PAGE_SUFFIX_SIZE) +#define PAGE_OVERHEAD_SIZE(share) (PAGE_OVERHEAD_SIZE_RAW + \ + (share)->crypt_page_header_space) + +#define BLOCK_RECORD_POINTER_SIZE 6 + +#define FULL_PAGE_HEADER_SIZE(share) (LSN_SIZE + PAGE_TYPE_SIZE + \ + (share)->crypt_page_header_space) +#define FULL_PAGE_SIZE(share) ((share)->block_size - \ + 
FULL_PAGE_HEADER_SIZE(share) - \ + PAGE_SUFFIX_SIZE) + +#define FULL_PAGE_SIZE2(block_size, crypt_size) \ + ((block_size) - (LSN_SIZE + PAGE_TYPE_SIZE + PAGE_SUFFIX_SIZE + (crypt_size))) + +#define ROW_EXTENT_PAGE_SIZE 5 +#define ROW_EXTENT_COUNT_SIZE 2 +#define SUB_RANGE_SIZE 2 +#define BLOCK_FILLER_SIZE 2 +#define ROW_EXTENT_SIZE (ROW_EXTENT_PAGE_SIZE + ROW_EXTENT_COUNT_SIZE) +#define TAIL_BIT 0x8000U /* Bit in page_count to signify tail */ +#define START_EXTENT_BIT 0x4000U /* Bit in page_count to signify start*/ +/* page_count set by bitmap code for tail pages */ +#define TAIL_PAGE_COUNT_MARKER 0xffff +/* Number of extents reserved MARIA_BITMAP_BLOCKS to store head part */ +#define ELEMENTS_RESERVED_FOR_MAIN_PART 4 +/* This is just used to prealloc a dynamic array */ +#define AVERAGE_BLOB_SIZE (1024L*1024L) +/* Number of pages to store continuous blob parts */ +#define BLOB_SEGMENT_MIN_SIZE 128 + +/* Fields before 'row->null_field_lengths' used by find_where_to_split_row */ +#define EXTRA_LENGTH_FIELDS 3 + +/* Size for the different parts in the row header (and head page) */ +#define FLAG_SIZE 1 +#define VERPTR_SIZE 7 +#define DIR_ENTRY_SIZE 4 +#define FIELD_OFFSET_SIZE 2 /* size of pointers to field starts */ + +/* Minimum header size needed for a new row */ +#define BASE_ROW_HEADER_SIZE FLAG_SIZE + +#define PAGE_TYPE_MASK 7 +enum en_page_type { UNALLOCATED_PAGE, HEAD_PAGE, TAIL_PAGE, BLOB_PAGE, MAX_PAGE_TYPE }; +#define PAGE_CAN_BE_COMPACTED 128U /* Bit in PAGE_TYPE */ + +#define PAGE_TYPE_OFFSET LSN_SIZE +#define DIR_COUNT_OFFSET (LSN_SIZE+PAGE_TYPE_SIZE) +#define DIR_FREE_OFFSET (DIR_COUNT_OFFSET+DIR_COUNT_SIZE) +#define EMPTY_SPACE_OFFSET (DIR_FREE_OFFSET+DIR_FREE_SIZE) + /* for encryption */ +#define KEY_VERSION_OFFSET (EMPTY_SPACE_OFFSET+EMPTY_SPACE_SIZE) +#define FULL_PAGE_KEY_VERSION_OFFSET (PAGE_TYPE_OFFSET + PAGE_TYPE_SIZE) + +/* Bits used for flag uchar (one byte, first in record) */ +#define ROW_FLAG_TRANSID 1U +#define ROW_FLAG_VER_PTR 2U +#define 
ROW_FLAG_DELETE_TRANSID 4U +#define ROW_FLAG_NULLS_EXTENDED 8U +#define ROW_FLAG_EXTENTS 128U +#define ROW_FLAG_ALL (1U+2U+4U+8U+128U) + +/* Size for buffer to hold information about bitmap */ +#define MAX_BITMAP_INFO_LENGTH ((MARIA_MAX_KEY_BLOCK_LENGTH*8/3)*(61*11/60)+10) + + +/******** Variables that affects how data pages are utilized ********/ + +/* Minimum size of tail segment */ +#define MIN_TAIL_SIZE 32U + +/* + Fixed length part of Max possible header size; See row data structure + table in ma_blockrec.c. +*/ +#define MAX_FIXED_HEADER_SIZE (FLAG_SIZE + 3 + ROW_EXTENT_SIZE + 3) +#define TRANS_MAX_FIXED_HEADER_SIZE (MAX_FIXED_HEADER_SIZE + \ + TRANSID_SIZE + VERPTR_SIZE + \ + TRANSID_SIZE) + +/* We use 1 uchar in record header to store number of directory entries */ +#define MAX_ROWS_PER_PAGE 255 +#define END_OF_DIR_FREE_LIST ((uchar) 255) + +/* Bits for MARIA_BITMAP_BLOCKS->used */ +/* We stored data on disk in the block */ +#define BLOCKUSED_USED 1 +/* Bitmap on disk is block->org_bitmap_value ; Happens only on update */ +#define BLOCKUSED_USE_ORG_BITMAP 2 +/* We stored tail data on disk for the block */ +#define BLOCKUSED_TAIL 4 + +/******* defines that affects allocation (density) of data *******/ + +/* + If the tail part (from the main block or a blob) would use more than 75 % of + the size of page, store the tail on a full page instead of a shared + tail page. 
+*/ +#define MAX_TAIL_SIZE(block_size) ((block_size) *3 / 4) + +/* Don't allocate memory for too many row extents on the stack */ +#define ROW_EXTENTS_ON_STACK 32 + +/* Functions to convert MARIA_RECORD_POS to/from page:offset */ + +static inline MARIA_RECORD_POS ma_recordpos(pgcache_page_no_t page, + uint dir_entry) +{ + DBUG_ASSERT(dir_entry <= 255); + DBUG_ASSERT(page > 0); /* page 0 is bitmap, not data page */ + return (MARIA_RECORD_POS) (((ulonglong) page << 8) | dir_entry); +} + +static inline pgcache_page_no_t ma_recordpos_to_page(MARIA_RECORD_POS record_pos) +{ + return (pgcache_page_no_t) (record_pos >> 8); +} + +static inline uint ma_recordpos_to_dir_entry(MARIA_RECORD_POS record_pos) +{ + return (uint) (record_pos & 255); +} + +static inline uchar *dir_entry_pos(uchar *buff, uint block_size, uint pos) +{ + return (buff + block_size - DIR_ENTRY_SIZE * pos - PAGE_SUFFIX_SIZE - + DIR_ENTRY_SIZE); +} + +/* ma_blockrec.c */ +void _ma_init_block_record_data(void); +my_bool _ma_once_init_block_record(MARIA_SHARE *share, File dfile); +my_bool _ma_once_end_block_record(MARIA_SHARE *share); +my_bool _ma_init_block_record(MARIA_HA *info); +void _ma_end_block_record(MARIA_HA *info); + +my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec, const uchar *newrec); +my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record); +int _ma_read_block_record(MARIA_HA *info, uchar *record, + MARIA_RECORD_POS record_pos); +int _ma_read_block_record2(MARIA_HA *info, uchar *record, + uchar *data, uchar *end_of_data); +int _ma_scan_block_record(MARIA_HA *info, uchar *record, + MARIA_RECORD_POS, my_bool); +my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos); +my_bool _ma_scan_init_block_record(MARIA_HA *info); +void _ma_scan_end_block_record(MARIA_HA *info); +int _ma_scan_remember_block_record(MARIA_HA *info, + MARIA_RECORD_POS *lastpos); +int 
_ma_scan_restore_block_record(MARIA_HA *info, + MARIA_RECORD_POS lastpos); + +MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info, + const uchar *record); +my_bool _ma_write_block_record(MARIA_HA *info, const uchar *record); +my_bool _ma_write_abort_block_record(MARIA_HA *info); +my_bool _ma_compare_block_record(MARIA_HA *info, const uchar *record); +void _ma_compact_block_page(MARIA_SHARE *share, uchar *buff, uint rownr, + my_bool extend_block, TrID min_read_from, + uint min_row_length); +my_bool enough_free_entries_on_page(MARIA_SHARE *share, uchar *page_buff); +TRANSLOG_ADDRESS +maria_page_get_lsn(uchar *page, pgcache_page_no_t page_no, uchar* data_ptr); + +/* ma_bitmap.c */ +extern const char *bits_to_txt[]; + +my_bool _ma_bitmap_init(MARIA_SHARE *share, File file, + pgcache_page_no_t *last_page); +my_bool _ma_bitmap_end(MARIA_SHARE *share); +my_bool _ma_bitmap_flush(MARIA_SHARE *share); +my_bool _ma_bitmap_flush_all(MARIA_SHARE *share); +void _ma_bitmap_reset_cache(MARIA_SHARE *share); +my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row, + MARIA_BITMAP_BLOCKS *result_blocks); +my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks); +my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents, + uint count); +my_bool _ma_bitmap_set(MARIA_HA *info, pgcache_page_no_t pos, my_bool head, + uint empty_space); +my_bool _ma_bitmap_reset_full_page_bits(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, + uint page_count); +my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info, + MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page, uint page_count); +uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size); +my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *new_row, + pgcache_page_no_t page, uint free_size, + MARIA_BITMAP_BLOCKS *result_blocks); +my_bool _ma_check_bitmap_data(MARIA_HA *info, + enum en_page_type page_type, + uint empty_space, uint bitmap_pattern); +my_bool 
_ma_check_if_right_bitmap_type(MARIA_HA *info, + enum en_page_type page_type, + pgcache_page_no_t page, + uint *bitmap_pattern); +uint _ma_bitmap_get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap, + pgcache_page_no_t page); +void _ma_bitmap_delete_all(MARIA_SHARE *share); +int _ma_bitmap_create_first(MARIA_SHARE *share); +void _ma_bitmap_flushable(MARIA_HA *info, int non_flushable_inc); +void _ma_bitmap_lock(MARIA_SHARE *share); +void _ma_bitmap_unlock(MARIA_SHARE *share); +void _ma_bitmap_set_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share); +#ifndef DBUG_OFF +void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data, + pgcache_page_no_t page); +#endif +size_t _ma_get_bitmap_description(MARIA_FILE_BITMAP *bitmap, + uchar *bitmap_data, + pgcache_page_no_t page, + char *out); + +uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, + uint page_type, + my_bool new_page, + const uchar *header, + const uchar *data, + size_t data_length); +uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn, + uint page_type, + const uchar *header); +uint _ma_apply_redo_free_blocks(MARIA_HA *info, LSN lsn, LSN rec_lsn, + const uchar *header); +uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn, + const uchar *header); +uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, LSN lsn, + const uchar *header, LSN redo_lsn, + uint * const number_of_blobs, + uint * const number_of_ranges, + pgcache_page_no_t * const first_page, + pgcache_page_no_t * const last_page); +my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info, LSN lsn, + const uchar *header); +my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header); +my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, size_t length); +my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn, + const uchar *header, size_t length); +my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn); + +my_bool 
write_hook_for_redo(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +my_bool write_hook_for_undo(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +my_bool write_hook_for_redo_delete_all(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_undo_row_insert(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_undo_row_delete(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_undo_row_update(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +my_bool write_hook_for_file_id(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +my_bool write_hook_for_commit(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +my_bool _ma_block_get_status(void *param, my_bool concurrent_insert); +my_bool _ma_block_start_trans(void* param); +my_bool _ma_block_start_trans_no_versioning(void *param); +void _ma_block_update_status(void *param); +void _ma_block_restore_status(void *param); +my_bool _ma_block_check_status(void *param); diff --git a/storage/maria/ma_cache.c b/storage/maria/ma_cache.c new file mode 100644 index 00000000..59cc0ad6 --- /dev/null +++ b/storage/maria/ma_cache.c @@ -0,0 +1,119 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Functions for read record cacheing with maria + Used for reading dynamic/compressed records from datafile. + + Can fetch data directly from file (outside cache), + if reading a small chunk straight before the cached part (with possible + overlap). + + Can be explicitly asked not to use cache (by not setting READING_NEXT in + flag) - useful for occasional out-of-cache reads, when the next read is + expected to hit the cache again. + + Allows "partial read" errors in the record header (when READING_HEADER flag + is set) - unread part is bzero'ed + + Note: out-of-cache reads are enabled for shared IO_CACHE's too, + as these reads will be cached by OS cache (and my_pread is always atomic) +*/ + + +#include "maria_def.h" + +my_bool _ma_read_cache(MARIA_HA *handler, IO_CACHE *info, uchar *buff, + my_off_t pos, size_t length, uint flag) +{ + size_t read_length,in_buff_length; + my_off_t offset; + uchar *in_buff_pos; + DBUG_ENTER("_ma_read_cache"); + DBUG_ASSERT(!(info->myflags & MY_ENCRYPT)); + + if (unlikely(pos >= info->end_of_file) && (flag & READING_HEADER)) + DBUG_RETURN(-1); + + if (pos < info->pos_in_file) + { + read_length=length; + if ((my_off_t) read_length > (my_off_t) (info->pos_in_file-pos)) + read_length=(uint) (info->pos_in_file-pos); + info->seek_not_done=1; + if (mysql_file_pread(info->file,buff,read_length,pos,MYF(MY_NABP))) + DBUG_RETURN(1); + if (!(length-=read_length)) + DBUG_RETURN(0); + pos+=read_length; + buff+=read_length; + } + if (pos >= info->pos_in_file && + (offset= (my_off_t) (pos - 
info->pos_in_file)) < + (my_off_t) (info->read_end - info->request_pos)) + { + in_buff_pos=info->request_pos+(uint) offset; + in_buff_length= MY_MIN(length,(size_t) (info->read_end-in_buff_pos)); + memcpy(buff,info->request_pos+(uint) offset,(size_t) in_buff_length); + if (!(length-=in_buff_length)) + DBUG_RETURN(0); + pos+=in_buff_length; + buff+=in_buff_length; + } + else + in_buff_length=0; + if (flag & READING_NEXT) + { + if (pos != (info->pos_in_file + + (uint) (info->read_end - info->request_pos))) + { + info->pos_in_file=pos; /* Force start here */ + info->read_pos=info->read_end=info->request_pos; /* Everything used */ + info->seek_not_done=1; + } + else + info->read_pos=info->read_end; /* All block used */ + if (!_my_b_read(info,buff,length)) + DBUG_RETURN(0); + read_length=info->error; + } + else + { + info->seek_not_done=1; + if ((read_length=mysql_file_pread(info->file,buff,length,pos,MYF(0))) == length) + DBUG_RETURN(0); + } + if (!(flag & READING_HEADER) || (int) read_length == -1 || + read_length+in_buff_length < 3) + { + if ((flag & READING_HEADER) && read_length + in_buff_length == 0) + DBUG_RETURN(-1); /* End of file */ + + DBUG_PRINT("error", + ("Error %d reading next-multi-part block (Got %d of %d bytes)", + my_errno, (int) read_length, (int) length)); + if (!my_errno || my_errno == HA_ERR_FILE_TOO_SHORT) + { + if (!handler->in_check_table) + _ma_set_fatal_error(handler, HA_ERR_FILE_TOO_SHORT); + if (!my_errno) + my_errno= HA_ERR_WRONG_IN_RECORD; + } + DBUG_RETURN(1); + } + bzero(buff+read_length,MARIA_BLOCK_INFO_HEADER_LENGTH - in_buff_length - + read_length); + DBUG_RETURN(0); +} /* _ma_read_cache */ diff --git a/storage/maria/ma_changed.c b/storage/maria/ma_changed.c new file mode 100644 index 00000000..ef708a12 --- /dev/null +++ b/storage/maria/ma_changed.c @@ -0,0 +1,33 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of 
the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Check if somebody has changed table since last check. */ + +#include "maria_def.h" + + /* Return 0 if table isn't changed */ + +int maria_is_changed(MARIA_HA *info) +{ + int result; + DBUG_ENTER("maria_is_changed"); + if (fast_ma_readinfo(info)) + DBUG_RETURN(-1); + _ma_writeinfo(info, 0); + result=(int) info->data_changed; + info->data_changed=0; + DBUG_PRINT("exit",("result: %d",result)); + DBUG_RETURN(result); +} diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c new file mode 100644 index 00000000..1f6aa2ee --- /dev/null +++ b/storage/maria/ma_check.c @@ -0,0 +1,7097 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Describe, check and repair of MARIA tables */ + +/* + About checksum calculation. 
+ + There are two types of checksums. Table checksum and row checksum. + + Row checksum is an additional uchar at the end of dynamic length + records. It must be calculated if the table is configured for them. + Otherwise they must not be used. The variable + MYISAM_SHARE::calc_checksum determines if row checksums are used. + MI_INFO::checksum is used as temporary storage during row handling. + For parallel repair we must assure that only one thread can use this + variable. There is no problem on the write side as this is done by one + thread only. But when checking a record after read this could go + wrong. But since all threads read through a common read buffer, it is + sufficient if only one thread checks it. + + Table checksum is an eight uchar value in the header of the index file. + It can be calculated even if row checksums are not used. The variable + MI_CHECK::glob_crc is calculated over all records. + MI_SORT_PARAM::calc_checksum determines if this should be done. This + variable is not part of MI_CHECK because it must be set per thread for + parallel repair. The global glob_crc must be changed by one thread + only. And it is sufficient to calculate the checksum once only. 
+*/ + +#include "ma_ftdefs.h" +#include "ma_rt_index.h" +#include "ma_blockrec.h" +#include "trnman.h" +#include "ma_trnman.h" +#include "ma_key_recover.h" +#include <my_check_opt.h> +#include <my_stack_alloc.h> +#include <my_getopt.h> +#ifdef HAVE_SYS_VADVISE_H +#include <sys/vadvise.h> +#endif + +/* Functions defined in this file */ + +static int check_k_link(HA_CHECK *param, MARIA_HA *info, my_off_t next_link); +static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_PAGE *page, ha_rows *keys, + ha_checksum *key_checksum, uint level); +static uint isam_key_length(MARIA_HA *info,MARIA_KEYDEF *keyinfo); +static ha_checksum calc_checksum(ha_rows count); +static int writekeys(MARIA_SORT_PARAM *sort_param); +static int sort_one_index(HA_CHECK *param, MARIA_HA *info, + MARIA_KEYDEF *keyinfo, + my_off_t pagepos, File new_file); +static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key); +static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key); +static int sort_get_next_record(MARIA_SORT_PARAM *sort_param); +static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a, + const void *b); +static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param, + const uchar *a); +static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a); +static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo, const uchar *key); +static int sort_insert_key(MARIA_SORT_PARAM *sort_param, + reg1 MA_SORT_KEY_BLOCKS *key_block, + const uchar *key, my_off_t prev_block); +static int sort_delete_record(MARIA_SORT_PARAM *sort_param); +/*static int _ma_flush_pending_blocks(HA_CHECK *param);*/ +static MA_SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks, + uint buffer_length); +static ha_checksum maria_byte_checksum(const uchar *buf, uint length); +static void set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share); +static void restore_data_file_type(MARIA_SHARE *share); +static void 
change_data_file_descriptor(MARIA_HA *info, File new_file); +static void unuse_data_file_descriptor(MARIA_HA *info); +static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info, + MARIA_HA *info, uchar *record); +static void copy_data_file_state(MARIA_STATE_INFO *to, + MARIA_STATE_INFO *from); +static void report_keypage_fault(HA_CHECK *param, MARIA_HA *info, + my_off_t position); +static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file); +static my_bool _ma_flush_table_files_before_swap(HA_CHECK *param, + MARIA_HA *info); +static void _ma_check_print_not_visible_error(HA_CHECK *param, TrID used_trid); +void retry_if_quick(MARIA_SORT_PARAM *param, int error); +static void print_bitmap_description(MARIA_SHARE *share, + pgcache_page_no_t page, + uchar *buff); + + +/* Initialize check param with default values */ + +void maria_chk_init(HA_CHECK *param) +{ + bzero((uchar*) param,sizeof(*param)); + param->opt_follow_links=1; + param->keys_in_use= ~(ulonglong) 0; + param->search_after_block=HA_OFFSET_ERROR; + param->auto_increment_value= 0; + param->use_buffers= PAGE_BUFFER_INIT; + param->read_buffer_length=READ_BUFFER_INIT; + param->write_buffer_length=READ_BUFFER_INIT; + param->orig_sort_buffer_length=SORT_BUFFER_INIT; + param->sort_key_blocks=BUFFERS_WHEN_SORTING; + param->tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL; + param->myf_rw=MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL); + param->start_check_pos=0; + param->max_record_length= LONGLONG_MAX; + param->pagecache_block_size= KEY_CACHE_BLOCK_SIZE; + param->stats_method= MI_STATS_METHOD_NULLS_NOT_EQUAL; + param->max_stage= 1; + param->stack_end_ptr= &my_thread_var->stack_ends_here; + param->max_allowed_lsn= (LSN) ~0ULL; + param->malloc_flags= MY_THREAD_SPECIFIC; +} + + +/* Initialize check param and maria handler for check of table */ + +void maria_chk_init_for_check(HA_CHECK *param, MARIA_HA *info) +{ + param->not_visible_rows_found= 0; + param->max_found_trid= 0; + + /* + Set up transaction
handler so that we can see all rows. When rows are read + we will check the found id against param->max_trid + */ + if (!info->s->base.born_transactional) + { + /* + There are no trids. However we want to set max_trid to make test of + create_trid simpler. + */ + param->max_trid= ~(TrID) 0; + } + else if (param->max_trid == 0 || param->max_trid == ~(TrID) 0) + { + if (!ma_control_file_inited()) + param->max_trid= 0; /* Give warning for first trid found */ + else + param->max_trid= max_trid_in_system(); + } + + maria_ignore_trids(info); +} + + + /* Check the status flags for the table */ + +int maria_chk_status(HA_CHECK *param, MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + + /* Protection for HA_EXTRA_FLUSH */ + mysql_mutex_lock(&share->intern_lock); + + if (maria_is_crashed_on_repair(info)) + _ma_check_print_warning(param, + "Table is marked as crashed and last repair failed"); + else if (maria_in_repair(info)) + _ma_check_print_warning(param, + "Last repair was aborted before finishing"); + else if (maria_is_crashed(info)) + _ma_check_print_warning(param, + "Table is marked as crashed"); + if (share->state.open_count != (uint) (share->global_changed ? 1 : 0)) + { + /* Don't count this as a real warning, as check can correct this ! */ + my_bool save=param->warning_printed; + _ma_check_print_warning(param, + share->state.open_count==1 ? + "%d client is using or hasn't closed the table properly" : + "%d clients are using or haven't closed the table properly", + share->state.open_count); + /* If this will be fixed by the check, forget the warning */ + if (param->testflag & T_UPDATE_STATE) + param->warning_printed=save; + } + + mysql_mutex_unlock(&share->intern_lock); + + if (share->state.create_trid > param->max_trid) + { + param->wrong_trd_printed= 1; /* Force should run zerofill */ + _ma_check_print_warning(param, + "Table create_trd (%llu) > current max_transaction id (%llu).
Table needs to be repaired or zerofilled to be usable", + share->state.create_trid, param->max_trid); + return 1; + } + return 0; +} + +/* + Check delete links in row data +*/ + +int maria_chk_del(HA_CHECK *param, register MARIA_HA *info, + ulonglong test_flag) +{ + MARIA_SHARE *share= info->s; + reg2 ha_rows i; + uint delete_link_length; + my_off_t empty,next_link,UNINIT_VAR(old_link); + char buff[22],buff2[22]; + DBUG_ENTER("maria_chk_del"); + + param->record_checksum=0; + + if (share->data_file_type == BLOCK_RECORD) + DBUG_RETURN(0); /* No delete links here */ + + delete_link_length=((share->options & HA_OPTION_PACK_RECORD) ? 20 : + share->rec_reflength+1); + + if (!(test_flag & T_SILENT)) + puts("- check record delete-chain"); + + next_link=share->state.dellink; + if (share->state.state.del == 0) + { + if (test_flag & T_VERBOSE) + { + puts("No recordlinks"); + } + } + else + { + if (test_flag & T_VERBOSE) + printf("Recordlinks: "); + empty=0; + for (i= share->state.state.del ; i > 0L && next_link != HA_OFFSET_ERROR ; i--) + { + if (_ma_killed_ptr(param)) + DBUG_RETURN(1); + if (test_flag & T_VERBOSE) + printf(" %9s",llstr(next_link,buff)); + if (next_link >= share->state.state.data_file_length) + goto wrong; + if (mysql_file_pread(info->dfile.file, (uchar*) buff, delete_link_length, + next_link,MYF(MY_NABP))) + { + if (test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"Can't read delete-link at filepos: %s", + llstr(next_link,buff)); + DBUG_RETURN(1); + } + if (*buff != '\0') + { + if (test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"Record at pos: %s is not remove-marked", + llstr(next_link,buff)); + goto wrong; + } + if (share->options & HA_OPTION_PACK_RECORD) + { + my_off_t prev_link=mi_sizekorr(buff+12); + if (empty && prev_link != old_link) + { + if (test_flag & T_VERBOSE) puts(""); + _ma_check_print_error(param, + "Deleted block at %s doesn't point back at previous delete link", + llstr(next_link,buff2)); + goto wrong; + } + 
old_link=next_link; + next_link=mi_sizekorr(buff+4); + empty+=mi_uint3korr(buff+1); + } + else + { + param->record_checksum+=(ha_checksum) next_link; + next_link= _ma_rec_pos(share, (uchar *) buff + 1); + empty+=share->base.pack_reclength; + } + } + if (share->state.state.del && (test_flag & T_VERBOSE)) + puts("\n"); + if (empty != share->state.state.empty) + { + _ma_check_print_warning(param, + "Found %s deleted space in delete link chain. Should be %s", + llstr(empty,buff2), + llstr(share->state.state.empty,buff)); + } + if (next_link != HA_OFFSET_ERROR) + { + _ma_check_print_error(param, + "Found more than the expected %s deleted rows in delete link chain", + llstr(share->state.state.del, buff)); + goto wrong; + } + if (i != 0) + { + _ma_check_print_error(param, + "Found %s deleted rows in delete link chain. Should be %s", + llstr(share->state.state.del - i, buff2), + llstr(share->state.state.del, buff)); + goto wrong; + } + } + DBUG_RETURN(0); + +wrong: + param->testflag|=T_RETRY_WITHOUT_QUICK; + if (test_flag & T_VERBOSE) + puts(""); + _ma_check_print_error(param,"record delete-link-chain corrupted"); + DBUG_RETURN(1); +} /* maria_chk_del */ + + +/* Check delete links in index file */ + +static int check_k_link(HA_CHECK *param, register MARIA_HA *info, + my_off_t next_link) +{ + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; + ha_rows records; + char llbuff[21], llbuff2[21]; + uchar *buff; + DBUG_ENTER("check_k_link"); + + if (next_link == HA_OFFSET_ERROR) + DBUG_RETURN(0); /* Avoid printing empty line */ + + records= (ha_rows) (share->state.state.key_file_length / block_size); + while (next_link != HA_OFFSET_ERROR && records > 0) + { + if (_ma_killed_ptr(param)) + DBUG_RETURN(1); + if (param->testflag & T_VERBOSE) + printf("%16s",llstr(next_link,llbuff)); + + /* Key blocks must lay within the key file length entirely. 
*/ + if (next_link + block_size > share->state.state.key_file_length) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "Invalid key block position: %s " + "key block size: %u file_length: %s", + llstr(next_link, llbuff), block_size, + llstr(share->state.state.key_file_length, llbuff2)); + DBUG_RETURN(1); + /* purecov: end */ + } + + /* Key blocks must be aligned at block_size */ + if (next_link & (block_size -1)) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "Mis-aligned key block: %s " + "minimum key block length: %u", + llstr(next_link, llbuff), + block_size); + DBUG_RETURN(1); + /* purecov: end */ + } + + DBUG_ASSERT(share->pagecache->block_size == block_size); + if (!(buff= pagecache_read(share->pagecache, + &share->kfile, + (pgcache_page_no_t) (next_link / block_size), + DFLT_INIT_HITS, + info->buff, PAGECACHE_READ_UNKNOWN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "key cache read error for block: %s", + llstr(next_link,llbuff)); + DBUG_RETURN(1); + /* purecov: end */ + } + if (_ma_get_keynr(info->s, buff) != MARIA_DELETE_KEY_NR) + _ma_check_print_error(param, "Page at %s is not delete marked", + llstr(next_link, llbuff)); + + next_link= mi_sizekorr(buff + share->keypage_header); + records--; + param->key_file_blocks+=block_size; + } + if (param->testflag & T_VERBOSE) + { + if (next_link != HA_OFFSET_ERROR) + printf("%16s\n",llstr(next_link,llbuff)); + else + puts(""); + } + DBUG_RETURN (next_link != HA_OFFSET_ERROR); +} /* check_k_link */ + + + /* Check sizes of files */ + +int maria_chk_size(HA_CHECK *param, register MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + int error; + register my_off_t skr,size; + char buff[22],buff2[22]; + DBUG_ENTER("maria_chk_size"); + + if (info->s3) + { + /* We cannot check file sizes for S3 */ + DBUG_RETURN(0); + } + + if (!(param->testflag & T_SILENT)) + puts("- check file-size"); + + /* + The following is needed if called 
externally (not from maria_chk). + To get a correct physical size we need to flush them. + */ + if ((error= _ma_flush_table_files(info, + MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE))) + _ma_check_print_error(param, "Failed to flush data or index file"); + + size= mysql_file_seek(share->kfile.file, 0L, MY_SEEK_END, MYF(MY_THREADSAFE)); + if ((skr=(my_off_t) share->state.state.key_file_length) != size) + { + /* Don't give error if file generated by maria_pack */ + if (skr > size && maria_is_any_key_active(share->state.key_map)) + { + error=1; + _ma_check_print_error(param, + "Size of indexfile is: %-8s Expected: %s", + llstr(size,buff), llstr(skr,buff2)); + share->state.state.key_file_length= size; + } + else if (!(param->testflag & T_VERY_SILENT)) + _ma_check_print_warning(param, + "Size of indexfile is: %-8s Expected: %s", + llstr(size,buff), llstr(skr,buff2)); + } + if (size > share->base.max_key_file_length) + { + _ma_check_print_warning(param, + "Size of indexfile is: %-8s which is bigger than max indexfile size: %s", + ullstr(size,buff), + ullstr(share->base.max_key_file_length, buff2)); + } + else if (!(param->testflag & T_VERY_SILENT) && + ! 
(share->options & HA_OPTION_COMPRESS_RECORD) && + ulonglong2double(share->state.state.key_file_length) > + ulonglong2double(share->base.margin_key_file_length)*0.9) + _ma_check_print_warning(param,"Keyfile is almost full, %10s of %10s used", + llstr(share->state.state.key_file_length,buff), + llstr(share->base.max_key_file_length,buff)); + + size= mysql_file_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + skr=(my_off_t) share->state.state.data_file_length; + if (share->options & HA_OPTION_COMPRESS_RECORD) + skr+= MEMMAP_EXTRA_MARGIN; +#ifdef USE_RELOC + if (share->data_file_type == STATIC_RECORD && + skr < (my_off_t) share->base.reloc*share->base.min_pack_length) + skr=(my_off_t) share->base.reloc*share->base.min_pack_length; +#endif + if (skr != size) + { + share->state.state.data_file_length=size; /* Skip other errors */ + if (skr > size && skr != size + MEMMAP_EXTRA_MARGIN) + { + error=1; + _ma_check_print_error(param,"Size of datafile is: %-9s Expected: %s", + llstr(size,buff), llstr(skr,buff2)); + param->testflag|=T_RETRY_WITHOUT_QUICK; + } + else + { + _ma_check_print_warning(param, + "Size of datafile is: %-9s Expected: %s", + llstr(size,buff), llstr(skr,buff2)); + } + } + if (size > share->base.max_data_file_length) + { + _ma_check_print_warning(param, + "Size of datafile is: %-8s which is bigger than max datafile size: %s", + ullstr(size,buff), + ullstr(share->base.max_data_file_length, buff2)); + } else if (!(param->testflag & T_VERY_SILENT) && + !(share->options & HA_OPTION_COMPRESS_RECORD) && + ulonglong2double(share->state.state.data_file_length) > + (ulonglong2double(share->base.max_data_file_length)*0.9)) + _ma_check_print_warning(param, "Datafile is almost full, %10s of %10s used", + llstr(share->state.state.data_file_length,buff), + llstr(share->base.max_data_file_length,buff2)); + DBUG_RETURN(error); +} /* maria_chk_size */ + + +/* Check keys */ + +int maria_chk_key(HA_CHECK *param, register MARIA_HA *info) +{ + uint 
key,found_keys=0,full_text_keys=0,result=0; + ha_rows keys; + ha_checksum old_record_checksum,init_checksum; + my_off_t all_keydata,all_totaldata,key_totlength,length; + double *rec_per_key_part; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + char buff[22],buff2[22]; + MARIA_PAGE page; + DBUG_ENTER("maria_chk_key"); + + if (!(param->testflag & T_SILENT)) + puts("- check key delete-chain"); + + param->key_file_blocks=share->base.keystart; + if (check_k_link(param, info, share->state.key_del)) + { + if (param->testflag & T_VERBOSE) puts(""); + _ma_check_print_error(param,"key delete-link-chain corrupted"); + DBUG_RETURN(-1); + } + + if (!(param->testflag & T_SILENT)) + puts("- check index reference"); + + all_keydata=all_totaldata=key_totlength=0; + init_checksum=param->record_checksum; + old_record_checksum=0; + if (share->data_file_type == STATIC_RECORD) + old_record_checksum= (calc_checksum(share->state.state.records + + share->state.state.del-1) * + share->base.pack_reclength); + rec_per_key_part= param->new_rec_per_key_part; + for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ; + rec_per_key_part+=keyinfo->keysegs, key++, keyinfo++) + { + param->key_crc[key]=0; + if (! 
maria_is_key_active(share->state.key_map, key)) + { + /* Remember old statistics for key */ + memcpy((char*) rec_per_key_part, + (char*) (share->state.rec_per_key_part + + (uint) (rec_per_key_part - param->new_rec_per_key_part)), + keyinfo->keysegs*sizeof(*rec_per_key_part)); + continue; + } + found_keys++; + _ma_report_progress(param, key, share->base.keys); + + param->record_checksum=init_checksum; + + bzero((char*) ¶m->unique_count,sizeof(param->unique_count)); + bzero((char*) ¶m->notnull_count,sizeof(param->notnull_count)); + + if ((!(param->testflag & T_SILENT))) + printf ("- check data record references index: %d\n",key+1); + if (keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL)) + full_text_keys++; + if (share->state.key_root[key] == HA_OFFSET_ERROR) + { + if (share->state.state.records != 0 && !(keyinfo->flag & HA_FULLTEXT)) + _ma_check_print_error(param, "Key tree %u is empty", key + 1); + goto do_stat; + } + if (_ma_fetch_keypage(&page, info, keyinfo, share->state.key_root[key], + PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, + info->buff, 0)) + { + report_keypage_fault(param, info, share->state.key_root[key]); + if (!(param->testflag & T_INFO)) + DBUG_RETURN(-1); + result= -1; + continue; + } + param->key_file_blocks+=keyinfo->block_length; + keys=0; + param->keydata=param->totaldata=0; + param->key_blocks=0; + param->max_level=0; + if (chk_index(param, info,keyinfo, &page, &keys, param->key_crc+key,1)) + DBUG_RETURN(-1); + if (!(keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL | HA_RTREE_INDEX))) + { + if (keys != share->state.state.records) + { + _ma_check_print_error(param,"Found %s keys of %s",llstr(keys,buff), + llstr(share->state.state.records,buff2)); + if (!(param->testflag & (T_INFO | T_EXTEND))) + DBUG_RETURN(-1); + result= -1; + continue; + } + if ((found_keys - full_text_keys == 1 && + !(share->data_file_type == STATIC_RECORD)) || + (param->testflag & T_DONT_CHECK_CHECKSUM)) + old_record_checksum= param->record_checksum; + else if (old_record_checksum != 
param->record_checksum) + { + if (key) + _ma_check_print_error(param, + "Key %u doesn't point at same records as " + "key 1", + key+1); + else + _ma_check_print_error(param,"Key 1 doesn't point at all records"); + if (!(param->testflag & T_INFO)) + DBUG_RETURN(-1); + result= -1; + continue; + } + } + if ((uint) share->base.auto_key -1 == key) + { + /* Check that auto_increment key is bigger than max key value */ + ulonglong auto_increment; + const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg; + info->lastinx=key; + _ma_read_key_record(info, info->rec_buff, 0); + auto_increment= + ma_retrieve_auto_increment(info->rec_buff + keyseg->start, + keyseg->type); + if (auto_increment > share->state.auto_increment) + { + _ma_check_print_warning(param, "Auto-increment value: %s is smaller " + "than max used value: %s", + llstr(share->state.auto_increment,buff2), + llstr(auto_increment, buff)); + } + if (param->testflag & T_AUTO_INC) + { + set_if_bigger(share->state.auto_increment, + auto_increment); + set_if_bigger(share->state.auto_increment, + param->auto_increment_value); + } + + /* Check that there isn't a row with auto_increment = 0 in the table */ + maria_extra(info,HA_EXTRA_KEYREAD,0); + bzero(info->lastkey_buff, keyinfo->seg->length); + if (!maria_rkey(info, info->rec_buff, key, + info->lastkey_buff, + (key_part_map) 1, HA_READ_KEY_EXACT)) + { + /* Don't count this as a real warning, as maria_chk can't correct it */ + my_bool save=param->warning_printed; + _ma_check_print_warning(param, "Found row where the auto_increment " + "column has the value 0"); + param->warning_printed=save; + } + maria_extra(info,HA_EXTRA_NO_KEYREAD,0); + } + + length=(my_off_t) isam_key_length(info,keyinfo)*keys + param->key_blocks*2; + if (param->testflag & T_INFO && param->totaldata != 0L && keys != 0L) + printf("Key: %2d: Keyblocks used: %3d%% Packed: %4d%% Max levels: %2d\n", + key+1, + (int) (my_off_t2double(param->keydata)*100.0/my_off_t2double(param->totaldata)), + 
(int) ((my_off_t2double(length) - my_off_t2double(param->keydata))*100.0/ + my_off_t2double(length)), + param->max_level); + all_keydata+=param->keydata; all_totaldata+=param->totaldata; key_totlength+=length; + +do_stat: + if (param->testflag & T_STATISTICS) + maria_update_key_parts(keyinfo, rec_per_key_part, param->unique_count, + param->stats_method == MI_STATS_METHOD_IGNORE_NULLS? + param->notnull_count: NULL, + (ulonglong)share->state.state.records); + } + if (param->testflag & T_INFO) + { + if (all_totaldata != 0L && found_keys > 0) + printf("Total: Keyblocks used: %3d%% Packed: %4d%%\n\n", + (int) (my_off_t2double(all_keydata)*100.0/ + my_off_t2double(all_totaldata)), + (int) ((my_off_t2double(key_totlength) - + my_off_t2double(all_keydata))*100.0/ + my_off_t2double(key_totlength))); + else if (all_totaldata != 0L && maria_is_any_key_active(share->state.key_map)) + puts(""); + } + if (param->key_file_blocks != share->state.state.key_file_length && + share->state.key_map == ~(ulonglong) 0) + _ma_check_print_warning(param, "Some data are unreferenced in keyfile"); + if (found_keys != full_text_keys) + param->record_checksum=old_record_checksum-init_checksum; /* Remove delete links */ + else + param->record_checksum=0; + DBUG_RETURN(result); +} /* maria_chk_key */ + + + +static int chk_index_down(HA_CHECK *param, MARIA_HA *info, + MARIA_KEYDEF *keyinfo, + my_off_t page, uchar *buff, ha_rows *keys, + ha_checksum *key_checksum, uint level) +{ + char llbuff[22],llbuff2[22]; + MARIA_SHARE *share= info->s; + MARIA_PAGE ma_page; + DBUG_ENTER("chk_index_down"); + + /* Key blocks must lay within the key file length entirely. */ + if (page + keyinfo->block_length > share->state.state.key_file_length) + { + /* purecov: begin tested */ + /* Give it a chance to fit in the real file size. 
*/ + my_off_t max_length= mysql_file_seek(info->s->kfile.file, 0L, MY_SEEK_END, + MYF(MY_THREADSAFE)); + _ma_check_print_error(param, "Invalid key block position: %s " + "key block size: %u file_length: %s", + llstr(page, llbuff), keyinfo->block_length, + llstr(share->state.state.key_file_length, llbuff2)); + if (page + keyinfo->block_length > max_length) + goto err; + /* Fix the remembered key file length. */ + share->state.state.key_file_length= (max_length & + ~ (my_off_t) (keyinfo->block_length - + 1)); + /* purecov: end */ + } + + /* Key blocks must be aligned at block length */ + if (page & (info->s->block_size -1)) + { + /* purecov: begin tested */ + _ma_check_print_error(param, "Mis-aligned key block: %s " + "key block length: %u", + llstr(page, llbuff), info->s->block_size); + goto err; + /* purecov: end */ + } + + if (_ma_fetch_keypage(&ma_page, info, keyinfo, page, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, buff, 0)) + { + report_keypage_fault(param, info, page); + goto err; + } + param->key_file_blocks+=keyinfo->block_length; + if (chk_index(param, info, keyinfo, &ma_page, keys, key_checksum,level)) + goto err; + + DBUG_RETURN(0); + + /* purecov: begin tested */ +err: + DBUG_RETURN(1); + /* purecov: end */ +} + + +/* + "Ignore NULLs" statistics collection method: process first index tuple. + + SYNOPSIS + maria_collect_stats_nonulls_first() + keyseg IN Array of key part descriptions + notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i} + tuples that don't contain NULLs) + key IN Key values tuple + + DESCRIPTION + Process the first index tuple - find out which prefix tuples don't + contain NULLs, and update the array of notnull counters accordingly. 
+*/ + +static +void maria_collect_stats_nonulls_first(HA_KEYSEG *keyseg, ulonglong *notnull, + const uchar *key) +{ + size_t first_null, kp; + first_null= ha_find_null(keyseg, key) - keyseg; + /* + All prefix tuples that don't include keypart_{first_null} are not-null + tuples (and all others aren't), increment counters for them. + */ + for (kp= 0; kp < first_null; kp++) + notnull[kp]++; +} + + +/* + "Ignore NULLs" statistics collection method: process next index tuple. + + SYNOPSIS + maria_collect_stats_nonulls_next() + keyseg IN Array of key part descriptions + notnull INOUT Array, notnull[i] = (number of {keypart1...keypart_i} + tuples that don't contain NULLs) + prev_key IN Previous key values tuple + last_key IN Next key values tuple + + DESCRIPTION + Process the next index tuple: + 1. Find out which prefix tuples of last_key don't contain NULLs, and + update the array of notnull counters accordingly. + 2. Find the first keypart number where the prev_key and last_key tuples + are different(A), or last_key has NULL value(B), and return it, so the + caller can count number of unique tuples for each key prefix. We don't + need (B) to be counted, and that is compensated back in + maria_update_key_parts(). + + RETURN + 1 + number of first keypart where values differ or last_key tuple has NULL +*/ + +static +int maria_collect_stats_nonulls_next(HA_KEYSEG *keyseg, ulonglong *notnull, + const uchar *prev_key, + const uchar *last_key) +{ + uint diffs[2]; + size_t first_null_seg, kp; + HA_KEYSEG *seg; + + /* + Find the first keypart where values are different or either of them is + NULL. We get results in diffs array: + diffs[0]= 1 + number of first different keypart + diffs[1]=offset: (last_key + diffs[1]) points to first value in + last_key that is NULL or different from corresponding + value in prev_key. 
+ */ + ha_key_cmp(keyseg, prev_key, last_key, USE_WHOLE_KEY, + SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diffs); + seg= keyseg + diffs[0] - 1; + + /* Find first NULL in last_key */ + first_null_seg= ha_find_null(seg, last_key + diffs[1]) - keyseg; + for (kp= 0; kp < first_null_seg; kp++) + notnull[kp]++; + + /* + Return 1+ number of first key part where values differ. Don't care if + these were NULLs and not .... We compensate for that in + maria_update_key_parts. + */ + return diffs[0]; +} + + +/* Check if index is ok */ + +static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_PAGE *anc_page, ha_rows *keys, + ha_checksum *key_checksum, uint level) +{ + int flag; + uint comp_flag, page_flag, nod_flag; + uchar *temp_buff, *keypos, *old_keypos, *endpos; + my_off_t next_page,record; + MARIA_SHARE *share= info->s; + char llbuff[22]; + uint diff_pos[2]; + uchar *tmp_key_buff; + my_bool temp_buff_alloced; + MARIA_KEY tmp_key; + DBUG_ENTER("chk_index"); + DBUG_DUMP("buff", anc_page->buff, anc_page->size); + + /* TODO: implement appropriate check for RTree keys */ + if (keyinfo->flag & (HA_SPATIAL | HA_RTREE_INDEX)) + DBUG_RETURN(0); + + alloc_on_stack(*param->stack_end_ptr, temp_buff, temp_buff_alloced, + (keyinfo->block_length + keyinfo->max_store_length)); + if (!temp_buff) + { + _ma_check_print_error(param,"Not enough memory for keyblock"); + DBUG_RETURN(-1); + } + tmp_key_buff= temp_buff+ keyinfo->block_length; + + if (keyinfo->flag & HA_NOSAME) + { + /* Not real duplicates */ + comp_flag=SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT; + } + else + comp_flag=SEARCH_SAME; /* Keys in positionorder */ + + page_flag= anc_page->flag; + nod_flag= anc_page->node; + old_keypos= anc_page->buff + share->keypage_header; + keypos= old_keypos + nod_flag; + endpos= anc_page->buff + anc_page->size; + + param->keydata+= anc_page->size; + param->totaldata+= keyinfo->block_length; /* INFO */ + param->key_blocks++; + if (level > param->max_level) + 
param->max_level=level; + + if (_ma_get_keynr(share, anc_page->buff) != keyinfo->key_nr) + _ma_check_print_error(param, "Page at %s is not marked for index %u", + llstr(anc_page->pos, llbuff), + (uint) keyinfo->key_nr); + if (page_flag & KEYPAGE_FLAG_HAS_TRANSID) + { + if (!share->base.born_transactional) + { + _ma_check_print_error(param, + "Page at %s is marked with HAS_TRANSID even if " + "table is not transactional", + llstr(anc_page->pos, llbuff)); + } + } + if (share->base.born_transactional) + { + LSN lsn= lsn_korr(anc_page->buff); + if ((ulonglong) lsn > param->max_allowed_lsn) + { + /* Avoid flooding of errors */ + if (param->skip_lsn_error_count++ < MAX_LSN_ERRORS) + { + _ma_check_print_error(param, + "Page at %s as wrong LSN " LSN_FMT ". Current " + "LSN is " LSN_FMT, + llstr(anc_page->pos, llbuff), + LSN_IN_PARTS(lsn), + LSN_IN_PARTS(param->max_allowed_lsn)); + } + } + } + if (anc_page->size > share->max_index_block_size) + { + _ma_check_print_error(param, + "Page at %s has impossible (too big) pagelength", + llstr(anc_page->pos, llbuff)); + goto err; + } + + info->last_key.keyinfo= tmp_key.keyinfo= keyinfo; + info->lastinx= ~0; /* Safety */ + tmp_key.data= tmp_key_buff; + for ( ;; _ma_copy_key(&info->last_key, &tmp_key)) + { + if (nod_flag) + { + if (_ma_killed_ptr(param)) + goto err; + next_page= _ma_kpos(nod_flag,keypos); + if (chk_index_down(param,info,keyinfo,next_page, + temp_buff,keys,key_checksum,level+1)) + { + DBUG_DUMP("page_data", old_keypos, (uint) (keypos - old_keypos)); + goto err; + } + } + old_keypos=keypos; + if (keypos >= endpos || + !(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos)) + break; + if (keypos > endpos) + { + _ma_check_print_error(param, + "Page length and length of keys don't match at " + "page: %s", + llstr(anc_page->pos,llbuff)); + goto err; + } + if (share->data_file_type == BLOCK_RECORD && + !(page_flag & KEYPAGE_FLAG_HAS_TRANSID) && + key_has_transid(tmp_key.data + tmp_key.data_length + + 
share->rec_reflength-1)) + { + _ma_check_print_error(param, + "Found key marked for transid on page that is not " + "marked for transid at: %s", + llstr(anc_page->pos,llbuff)); + goto err; + } + + if ((*keys)++ && + (flag=ha_key_cmp(keyinfo->seg, info->last_key.data, tmp_key.data, + tmp_key.data_length + tmp_key.ref_length, + (comp_flag | SEARCH_INSERT | (tmp_key.flag >> 1) | + info->last_key.flag), diff_pos)) >=0) + { + DBUG_DUMP_KEY("old", &info->last_key); + DBUG_DUMP_KEY("new", &tmp_key); + DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos)); + + if ((comp_flag & SEARCH_FIND) && flag == 0) + _ma_check_print_error(param,"Found duplicated key at page %s", + llstr(anc_page->pos,llbuff)); + else + _ma_check_print_error(param,"Key in wrong position at page %s", + llstr(anc_page->pos,llbuff)); + goto err; + } + + if (param->testflag & T_STATISTICS) + { + if (*keys != 1L) /* not first_key */ + { + if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL) + ha_key_cmp(keyinfo->seg, info->last_key.data, + tmp_key.data, tmp_key.data_length, + SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, + diff_pos); + else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + { + diff_pos[0]= maria_collect_stats_nonulls_next(keyinfo->seg, + param->notnull_count, + info->last_key.data, + tmp_key.data); + } + param->unique_count[diff_pos[0]-1]++; + } + else + { + if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + maria_collect_stats_nonulls_first(keyinfo->seg, param->notnull_count, + tmp_key.data); + } + } + (*key_checksum)+= maria_byte_checksum(tmp_key.data, tmp_key.data_length); + record= _ma_row_pos_from_key(&tmp_key); + + if (keyinfo->flag & HA_FULLTEXT) /* special handling for ft2 */ + { + uint off; + int subkeys; + get_key_full_length_rdonly(off, tmp_key.data); + subkeys= ft_sintXkorr(tmp_key.data + off); + if (subkeys < 0) + { + ha_rows tmp_keys=0; + share->ft2_keyinfo.key_nr= keyinfo->key_nr; + if (chk_index_down(param,info,&share->ft2_keyinfo,record, + 
temp_buff,&tmp_keys,key_checksum,1)) + goto err; + if (tmp_keys + subkeys) + { + _ma_check_print_error(param, + "Number of words in the 2nd level tree " + "does not match the number in the header. " + "Parent word in on the page %s, offset %u", + llstr(anc_page->pos,llbuff), + (uint) (old_keypos - anc_page->buff)); + goto err; + } + (*keys)+=tmp_keys-1; + continue; + } + /* fall through */ + } + if ((share->data_file_type != BLOCK_RECORD && + share->data_file_type != NO_RECORD && + record >= share->state.state.data_file_length) || + (share->data_file_type == BLOCK_RECORD && + ma_recordpos_to_page(record) * share->base.min_block_length >= + share->state.state.data_file_length) || + (share->data_file_type == NO_RECORD && record != 0)) + { +#ifdef DBUG_TRACE + char llbuff2[22], llbuff3[22]; +#endif + _ma_check_print_error(param, + "Found key at page %s that points to record " + "outside datafile", + llstr(anc_page->pos,llbuff)); + DBUG_PRINT("test",("page: %s record: %s filelength: %s", + llstr(anc_page->pos,llbuff),llstr(record,llbuff2), + llstr(share->state.state.data_file_length,llbuff3))); + DBUG_DUMP_KEY("key", &tmp_key); + DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos)); + goto err; + } + param->record_checksum+= (ha_checksum) record; + } + if (keypos != endpos) + { + _ma_check_print_error(param, + "Keyblock size at page %s is not correct. 
" + "Block length: %u key length: %u", + llstr(anc_page->pos, llbuff), anc_page->size, + (uint) (keypos - anc_page->buff)); + goto err; + } + stack_alloc_free(temp_buff, temp_buff_alloced); + DBUG_RETURN(0); + err: + stack_alloc_free(temp_buff, temp_buff_alloced); + DBUG_RETURN(1); +} /* chk_index */ + + + /* Calculate a checksum of 1+2+3+4...N = N*(N+1)/2 without overflow */ + +static ha_checksum calc_checksum(ha_rows count) +{ + ulonglong sum,a,b; + DBUG_ENTER("calc_checksum"); + + sum=0; + a=count; b=count+1; + if (a & 1) + b>>=1; + else + a>>=1; + while (b) + { + if (b & 1) + sum+=a; + a<<=1; b>>=1; + } + DBUG_PRINT("exit",("sum: %lx",(ulong) sum)); + DBUG_RETURN((ha_checksum) sum); +} /* calc_checksum */ + + + /* Calc length of key in normal isam */ + +static uint isam_key_length(MARIA_HA *info, register MARIA_KEYDEF *keyinfo) +{ + uint length; + HA_KEYSEG *keyseg; + DBUG_ENTER("isam_key_length"); + + length= info->s->rec_reflength; + for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++) + length+= keyseg->length; + + DBUG_PRINT("exit",("length: %d",length)); + DBUG_RETURN(length); +} /* key_length */ + + + +static char * record_pos_to_txt(MARIA_HA *info, my_off_t recpos, + char *buff) +{ + if (info->s->data_file_type != BLOCK_RECORD) + llstr(recpos, buff); + else + { + my_off_t page= ma_recordpos_to_page(recpos); + uint row= ma_recordpos_to_dir_entry(recpos); + char *end= longlong10_to_str(page, buff, 10); + *(end++)= ':'; + longlong10_to_str(row, end, 10); + } + return buff; +} + + +/* + Check that keys in records exist in index tree + + SYNOPSIS + check_keys_in_record() + param Check paramenter + info Maria handler + extend Type of check (extended or normal) + start_recpos Position to row + record Record buffer + + NOTES + This function also calculates record checksum & number of rows +*/ + +static int check_keys_in_record(HA_CHECK *param, MARIA_HA *info, int extend, + my_off_t start_recpos, uchar *record) +{ + MARIA_SHARE *share= info->s; + MARIA_KEYDEF 
*keyinfo; + char llbuff[22+4]; + uint keynr; + + param->tmp_record_checksum+= (ha_checksum) start_recpos; + param->records++; + if (param->records % WRITE_COUNT == 0) + { + if (param->testflag & T_WRITE_LOOP) + { + printf("%s\r", llstr(param->records, llbuff)); + fflush(stdout); + } + _ma_report_progress(param, param->records, share->state.state.records); + } + + /* Check if keys match the record */ + for (keynr=0, keyinfo= share->keyinfo; keynr < share->base.keys; + keynr++, keyinfo++) + { + if (maria_is_key_active(share->state.key_map, keynr)) + { + MARIA_KEY key; + if (!(keyinfo->flag & HA_FULLTEXT)) + { + (*keyinfo->make_key)(info, &key, keynr, info->lastkey_buff, record, + start_recpos, 0); + info->last_key.keyinfo= key.keyinfo; + if (extend) + { + /* We don't need to lock the key tree here as we don't allow + concurrent threads when running maria_chk + */ + int search_result= +#ifdef HAVE_RTREE_KEYS + (keyinfo->flag & (HA_SPATIAL | HA_RTREE_INDEX)) ? + maria_rtree_find_first(info, &key, MBR_EQUAL | MBR_DATA) : +#endif + _ma_search(info, &key, SEARCH_SAME, share->state.key_root[keynr]); + if (search_result) + { + _ma_check_print_error(param, + "Record at: %14s " + "Can't find key for index: %2d", + record_pos_to_txt(info, start_recpos, + llbuff), + keynr+1); + if (param->testflag & T_VERBOSE) + _ma_print_key(stdout, &key); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + return -1; + } + } + else + param->tmp_key_crc[keynr]+= + maria_byte_checksum(key.data, key.data_length); + } + } + } + return 0; +} + + +/* + Functions to loop through all rows and check if they are ok + + NOTES + One function for each record format + + RESULT + 0 ok + -1 Interrupted by user + 1 Error +*/ + +static int check_static_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + MARIA_SHARE *share= info->s; + my_off_t start_recpos, pos; + char llbuff[22]; + + pos= 0; + while (pos < share->state.state.data_file_length) + { + if 
(_ma_killed_ptr(param)) + return -1; + if (my_b_read(¶m->read_cache, record, + share->base.pack_reclength)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: " + "%s", + my_errno, llstr(pos, llbuff)); + return 1; + } + start_recpos= pos; + pos+= share->base.pack_reclength; + param->splits++; + if (*record == '\0') + { + param->del_blocks++; + param->del_length+= share->base.pack_reclength; + continue; /* Record removed */ + } + param->glob_crc+= _ma_static_checksum(info,record); + param->used+= share->base.pack_reclength; + if (check_keys_in_record(param, info, extend, start_recpos, record)) + return 1; + } + return 0; +} + + +static int check_dynamic_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + my_off_t UNINIT_VAR(start_recpos), start_block, pos; + uchar *UNINIT_VAR(to); + ulong UNINIT_VAR(left_length); + uint b_type; + char llbuff[22],llbuff2[22],llbuff3[22]; + myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0); + DBUG_ENTER("check_dynamic_record"); + + pos= 0; + while (pos < share->state.state.data_file_length) + { + my_bool got_error= 0; + int flag; + if (_ma_killed_ptr(param)) + DBUG_RETURN(-1); + + flag= block_info.second_read=0; + block_info.next_filepos=pos; + do + { + if (_ma_read_cache(info, ¶m->read_cache, block_info.header, + (start_block=block_info.next_filepos), + sizeof(block_info.header), + (flag ? 
0 : READING_NEXT) | READING_HEADER)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at " + "position: %s", + my_errno, llstr(start_block, llbuff)); + DBUG_RETURN(1); + } + + if (start_block & (MARIA_DYN_ALIGN_SIZE-1)) + { + _ma_check_print_error(param,"Wrong aligned block at %s", + llstr(start_block,llbuff)); + DBUG_RETURN(1); + } + b_type= _ma_get_block_info(info, &block_info,-1,start_block); + if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if (b_type & BLOCK_SYNC_ERROR) + { + if (flag) + { + _ma_check_print_error(param,"Unexpected byte: %d at link: %s", + (int) block_info.header[0], + llstr(start_block,llbuff)); + DBUG_RETURN(1); + } + pos=block_info.filepos+block_info.block_len; + goto next; + } + if (b_type & BLOCK_DELETED) + { + if (block_info.block_len < share->base.min_block_length) + { + _ma_check_print_error(param, + "Deleted block with impossible length %lu " + "at %s", + block_info.block_len,llstr(pos,llbuff)); + DBUG_RETURN(1); + } + if ((block_info.next_filepos != HA_OFFSET_ERROR && + block_info.next_filepos >= share->state.state.data_file_length) || + (block_info.prev_filepos != HA_OFFSET_ERROR && + block_info.prev_filepos >= share->state.state.data_file_length)) + { + _ma_check_print_error(param,"Delete link points outside datafile " + "at %s", + llstr(pos,llbuff)); + DBUG_RETURN(1); + } + param->del_blocks++; + param->del_length+= block_info.block_len; + param->splits++; + pos= block_info.filepos+block_info.block_len; + goto next; + } + _ma_check_print_error(param,"Wrong bytesec: %d-%d-%d at linkstart: %s", + block_info.header[0],block_info.header[1], + block_info.header[2], + llstr(start_block,llbuff)); + DBUG_RETURN(1); + } + if (share->state.state.data_file_length < block_info.filepos+ + block_info.block_len) + { + _ma_check_print_error(param, + "Recordlink that points outside datafile at %s", + llstr(pos,llbuff)); + got_error=1; + break; + } + param->splits++; + if (!flag++) 
/* First block */ + { + start_recpos=pos; + pos=block_info.filepos+block_info.block_len; + if (block_info.rec_len > (uint) share->base.max_pack_length) + { + my_errno= HA_ERR_WRONG_IN_RECORD; + _ma_check_print_error(param,"Found too long record (%lu) at %s", + (ulong) block_info.rec_len, + llstr(start_recpos,llbuff)); + got_error=1; + break; + } + if (share->base.blobs) + { + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + block_info.rec_len + + share->base.extra_rec_buff_size, myflag)) + + { + _ma_check_print_error(param, + "Not enough memory (%lu) for blob at %s", + (ulong) block_info.rec_len, + llstr(start_recpos,llbuff)); + got_error=1; + break; + } + } + to= info->rec_buff; + left_length= block_info.rec_len; + } + if (left_length < block_info.data_len) + { + _ma_check_print_error(param,"Found too long record (%lu) at %s", + (ulong) block_info.data_len, + llstr(start_recpos,llbuff)); + got_error=1; + break; + } + if (_ma_read_cache(info, ¶m->read_cache, to, block_info.filepos, + (uint) block_info.data_len, + flag == 1 ? 
READING_NEXT : 0)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at " + "position: %s", my_errno, + llstr(block_info.filepos, llbuff)); + + DBUG_RETURN(1); + } + to+=block_info.data_len; + param->link_used+= block_info.filepos-start_block; + param->used+= block_info.filepos - start_block + block_info.data_len; + param->empty+= block_info.block_len-block_info.data_len; + left_length-= block_info.data_len; + if (left_length) + { + if (b_type & BLOCK_LAST) + { + _ma_check_print_error(param, + "Wrong record length %s of %s at %s", + llstr(block_info.rec_len-left_length,llbuff), + llstr(block_info.rec_len, llbuff2), + llstr(start_recpos,llbuff3)); + got_error=1; + break; + } + if (share->state.state.data_file_length < block_info.next_filepos) + { + _ma_check_print_error(param, + "Found next-recordlink that points outside " + "datafile at %s", + llstr(block_info.filepos,llbuff)); + got_error=1; + break; + } + } + } while (left_length); + + if (! got_error) + { + if (_ma_rec_unpack(info,record,info->rec_buff,block_info.rec_len) == + MY_FILE_ERROR) + { + _ma_check_print_error(param,"Found wrong record at %s", + llstr(start_recpos,llbuff)); + got_error=1; + } + else + { + ha_checksum checksum= 0; + if (share->calc_checksum) + checksum= (*share->calc_checksum)(info, record); + + if (param->testflag & (T_EXTEND | T_MEDIUM | T_VERBOSE)) + { + if (_ma_rec_check(info,record, info->rec_buff,block_info.rec_len, + MY_TEST(share->calc_checksum), checksum)) + { + _ma_check_print_error(param,"Found wrong packed record at %s", + llstr(start_recpos,llbuff)); + got_error= 1; + } + } + param->glob_crc+= checksum; + } + + if (! 
got_error) + { + if (check_keys_in_record(param, info, extend, start_recpos, record)) + DBUG_RETURN(1); + } + else + { + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + } + else if (!flag) + pos= block_info.filepos+block_info.block_len; +next:; + } + DBUG_RETURN(0); +} + + +static int check_compressed_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + my_off_t start_recpos, pos; + char llbuff[22]; + my_bool got_error= 0; + DBUG_ENTER("check_compressed_record"); + + pos= share->pack.header_length; /* Skip header */ + while (pos < share->state.state.data_file_length) + { + if (_ma_killed_ptr(param)) + DBUG_RETURN(-1); + + if (_ma_read_cache(info, ¶m->read_cache, block_info.header, pos, + share->pack.ref_length, READING_NEXT)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: " + "%s", + my_errno, llstr(pos, llbuff)); + DBUG_RETURN(1); + } + + start_recpos= pos; + param->splits++; + _ma_pack_get_block_info(info, &info->bit_buff, &block_info, + &info->rec_buff, &info->rec_buff_size, -1, + start_recpos); + pos=block_info.filepos+block_info.rec_len; + if (block_info.rec_len < (uint) share->min_pack_length || + block_info.rec_len > (uint) share->max_pack_length) + { + _ma_check_print_error(param, + "Found block with wrong recordlength: %lu at %s", + block_info.rec_len, llstr(start_recpos,llbuff)); + got_error=1; + goto end; + } + if (_ma_read_cache(info, ¶m->read_cache, info->rec_buff, + block_info.filepos, block_info.rec_len, READING_NEXT)) + { + _ma_check_print_error(param, + "got error: %d when reading datafile at position: " + "%s", + my_errno, llstr(block_info.filepos, llbuff)); + DBUG_RETURN(1); + } + info->rec_buff[block_info.rec_len]= 0; /* Keep valgrind happy */ + if (_ma_pack_rec_unpack(info, &info->bit_buff, record, + info->rec_buff, block_info.rec_len)) + { + _ma_check_print_error(param,"Found 
wrong record at %s", + llstr(start_recpos,llbuff)); + got_error=1; + goto end; + } + param->glob_crc+= (*share->calc_checksum)(info,record); + param->link_used+= (block_info.filepos - start_recpos); + param->used+= (pos-start_recpos); + +end: + if (! got_error) + { + if (check_keys_in_record(param, info, extend, start_recpos, record)) + DBUG_RETURN(1); + } + else + { + got_error= 0; /* Reset for next loop */ + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +/* + Check if layout on head or tail page is ok + + NOTES + This is for rows-in-block format. +*/ + +static int check_page_layout(HA_CHECK *param, MARIA_HA *info, + my_off_t page_pos, uchar *page, + uint row_count, uint head_empty, + uint *real_rows_found, uint *free_slots_found) +{ + uint empty, last_row_end, row, first_dir_entry, free_entry, block_size; + uint free_entries, prev_free_entry; + uchar *dir_entry; + char llbuff[22]; + my_bool error_in_free_list= 0; + DBUG_ENTER("check_page_layout"); + + block_size= info->s->block_size; + empty= 0; + last_row_end= PAGE_HEADER_SIZE(info->s); + *real_rows_found= 0; + + /* Check free directory list */ + free_entry= (uint) page[DIR_FREE_OFFSET]; + free_entries= 0; + prev_free_entry= END_OF_DIR_FREE_LIST; + while (free_entry != END_OF_DIR_FREE_LIST) + { + uchar *dir; + if (free_entry > row_count) + { + _ma_check_print_error(param, + "Page %9s: Directory free entry points outside " + "directory", + llstr(page_pos, llbuff)); + error_in_free_list= 1; + break; + } + dir= dir_entry_pos(page, block_size, free_entry); + if (uint2korr(dir) != 0) + { + _ma_check_print_error(param, + "Page %9s: Directory free entry points to " + "not deleted entry", + llstr(page_pos, llbuff)); + error_in_free_list= 1; + break; + } + if (dir[2] != prev_free_entry) + { + _ma_check_print_error(param, + "Page %9s: Directory free list back pointer " + "points to wrong entry", + llstr(page_pos, llbuff)); + error_in_free_list= 1; + 
break; + } + prev_free_entry= free_entry; + free_entry= dir[3]; + free_entries++; + } + *free_slots_found= free_entries; + + /* Check directry */ + dir_entry= page+ block_size - PAGE_SUFFIX_SIZE; + first_dir_entry= (block_size - row_count * DIR_ENTRY_SIZE - + PAGE_SUFFIX_SIZE); + for (row= 0 ; row < row_count ; row++) + { + uint pos, length; + dir_entry-= DIR_ENTRY_SIZE; + pos= uint2korr(dir_entry); + if (!pos) + { + free_entries--; + if (row == row_count -1) + { + _ma_check_print_error(param, + "Page %9s: First entry in directory is 0", + llstr(page_pos, llbuff)); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + continue; /* Deleted row */ + } + (*real_rows_found)++; + length= uint2korr(dir_entry+2); + param->used+= length; + if (pos < last_row_end) + { + _ma_check_print_error(param, + "Page %9s: Row %3u overlapps with previous row", + llstr(page_pos, llbuff), row); + DBUG_RETURN(1); + } + empty+= (pos - last_row_end); + last_row_end= pos + length; + if (last_row_end > first_dir_entry) + { + _ma_check_print_error(param, + "Page %9s: Row %3u overlapps with directory", + llstr(page_pos, llbuff), row); + DBUG_RETURN(1); + } + } + empty+= (first_dir_entry - last_row_end); + + if (empty != head_empty) + { + _ma_check_print_error(param, + "Page %9s: Wrong empty size. Stored: %5u " + "Actual: %5u", + llstr(page_pos, llbuff), head_empty, empty); + param->err_count++; + } + if (free_entries != 0 && !error_in_free_list) + { + _ma_check_print_error(param, + "Page %9s: Directory free link don't include " + "all free entries", + llstr(page_pos, llbuff)); + param->err_count++; + } + DBUG_RETURN(param->err_count && + (param->err_count >= MAXERR || !(param->testflag & T_VERBOSE))); +} + + +/* + Check all rows on head page + + NOTES + This is for rows-in-block format. 
+ + Before this, we have already called check_page_layout(), so + we know the block is logicaly correct (even if the rows may not be that) + + RETURN + 0 ok + 1 error +*/ + + +static my_bool check_head_page(HA_CHECK *param, MARIA_HA *info, uchar *record, + int extend, my_off_t page_pos, uchar *page_buff, + uint row_count) +{ + MARIA_SHARE *share= info->s; + uchar *dir_entry; + uint row; + char llbuff[22], llbuff2[22]; + ulonglong page= page_pos / share->block_size; + DBUG_ENTER("check_head_page"); + + dir_entry= page_buff+ share->block_size - PAGE_SUFFIX_SIZE; + for (row= 0 ; row < row_count ; row++) + { + uint pos, length, flag; + dir_entry-= DIR_ENTRY_SIZE; + pos= uint2korr(dir_entry); + if (!pos) + continue; + length= uint2korr(dir_entry+2); + if (length < share->base.min_block_length) + { + _ma_check_print_error(param, + "Page %9s: Row %3u is too short " + "(%d of min %d bytes)", + llstr(page, llbuff), row, length, + (uint) share->base.min_block_length); + DBUG_RETURN(1); + } + flag= (uint) (uchar) page_buff[pos]; + if (flag & ~(ROW_FLAG_ALL)) + _ma_check_print_error(param, + "Page %9s: Row %3u has wrong flag: %u", + llstr(page, llbuff), row, flag); + + DBUG_PRINT("info", ("rowid: %s page: %lu row: %u", + llstr(ma_recordpos(page, row), llbuff), + (ulong) page, row)); + info->cur_row.trid= 0; + if (_ma_read_block_record2(info, record, page_buff+pos, + page_buff+pos+length)) + { + _ma_check_print_error(param, + "Page %9s: Row %3d is crashed", + llstr(page, llbuff), row); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + continue; + } + set_if_bigger(param->max_found_trid, info->cur_row.trid); + if (info->cur_row.trid > param->max_trid) + _ma_check_print_not_visible_error(param, info->cur_row.trid); + + if (share->calc_checksum) + { + ha_checksum checksum= (*share->calc_checksum)(info, record); + if (info->cur_row.checksum != (checksum & 255)) + _ma_check_print_error(param, "Page %9s: Row %3d has wrong checksum", + 
llstr(page, llbuff), row); + param->glob_crc+= checksum; + } + if (info->cur_row.extents_count) + { + uchar *extents= info->cur_row.extents; + uint i; + /* Check that bitmap has the right marker for the found extents */ + for (i= 0 ; i < info->cur_row.extents_count ; i++) + { + pgcache_page_no_t extent_page; + uint page_count, page_type; + extent_page= uint5korr(extents); + page_count= uint2korr(extents+5) & ~START_EXTENT_BIT; + extents+= ROW_EXTENT_SIZE; + page_type= BLOB_PAGE; + if (page_count & TAIL_BIT) + { + page_count= 1; + page_type= TAIL_PAGE; + } + /* + TODO OPTIMIZE: + Check the whole extent with one test and only do the loop if + something is wrong (for exact error reporting) + */ + for ( ; page_count--; extent_page++) + { + uint bitmap_pattern; + if (_ma_check_if_right_bitmap_type(info, page_type, extent_page, + &bitmap_pattern)) + { + _ma_check_print_error(param, + "Page %9s: Row: %3d has an extent with " + "wrong information in bitmap: " + "Page: %9s Page_type: %d Bitmap: %d", + llstr(page, llbuff), row, + llstr(extent_page, llbuff2), + page_type, bitmap_pattern); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + DBUG_RETURN(1); + } + } + } + } + param->full_page_count+= info->cur_row.full_page_count; + param->tail_count+= info->cur_row.tail_count; + if (check_keys_in_record(param, info, extend, + ma_recordpos(page, row), record)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +/* + Check if rows-in-block data file is consistent +*/ + +static int check_block_record(HA_CHECK *param, MARIA_HA *info, int extend, + uchar *record) +{ + MARIA_SHARE *share= info->s; + my_off_t pos; + pgcache_page_no_t page; + uchar *page_buff, *bitmap_buff, *data; + char llbuff[22], llbuff2[22]; + uint block_size= share->block_size; + ha_rows full_page_count, tail_count; + my_bool UNINIT_VAR(full_dir), now_transactional; + uint offset_page, offset, free_count; + LSN lsn; + + if (_ma_scan_init_block_record(info)) + { + _ma_check_print_error(param, "got 
error %d when initializing scan", + my_errno); + return 1; + } + + now_transactional= info->s->now_transactional; + info->s->now_transactional= 0; /* Don't log changes */ + + bitmap_buff= info->scan.bitmap_buff; + page_buff= info->scan.page_buff; + full_page_count= tail_count= 0; + param->full_page_count= param->tail_count= 0; + param->used= param->link_used= 0; + param->splits= share->state.state.data_file_length / block_size; + + for (pos= 0, page= 0; + pos < share->state.state.data_file_length; + pos+= block_size, page++) + { + uint UNINIT_VAR(row_count), real_row_count, UNINIT_VAR(empty_space), + page_type, bitmap_pattern; + uint bitmap_for_page; + + if (_ma_killed_ptr(param)) + { + _ma_scan_end_block_record(info); + info->s->now_transactional= now_transactional; + return -1; /* Interrupted */ + } + if ((page % share->bitmap.pages_covered) == 0) + { + /* Bitmap page */ + if (pagecache_read(share->pagecache, + &info->s->bitmap.file, + page, 1, + bitmap_buff, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0) + { + _ma_check_print_error(param, + "Page %9s: Got error: %d when reading datafile", + llstr(page, llbuff), my_errno); + goto err; + } + param->used+= block_size; + param->link_used+= block_size; + if (param->verbose > 2) + print_bitmap_description(share, page, bitmap_buff); + continue; + } + /* Skip pages marked as empty in bitmap */ + offset_page= (uint) ((page % share->bitmap.pages_covered) -1) * 3; + offset= offset_page & 7; + data= bitmap_buff + offset_page / 8; + bitmap_pattern= uint2korr(data); + if (!(bitmap_for_page= ((bitmap_pattern >> offset) & 7))) + { + param->empty+= block_size; + param->del_blocks++; + continue; + } + + if (pagecache_read(share->pagecache, + &info->dfile, + page, 1, + page_buff, + share->page_type, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0) + { + _ma_check_print_error(param, + "Page %9s: Got error: %d when reading datafile", + llstr(page, llbuff), my_errno); + goto err; + } + page_type= page_buff[PAGE_TYPE_OFFSET] 
& PAGE_TYPE_MASK; + if (page_type == UNALLOCATED_PAGE || page_type >= MAX_PAGE_TYPE) + { + _ma_check_print_error(param, + "Page: %9s Found wrong page type %d. Bitmap: %d '%s'", + llstr(page, llbuff), page_type, + bitmap_for_page, bits_to_txt[bitmap_for_page]); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + goto err; + continue; + } + switch ((enum en_page_type) page_type) { + case UNALLOCATED_PAGE: + case MAX_PAGE_TYPE: + default: + DBUG_ASSERT(0); /* Impossible */ + break; + case HEAD_PAGE: + row_count= page_buff[DIR_COUNT_OFFSET]; + empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET); + param->used+= block_size - empty_space; + param->link_used+= (PAGE_HEADER_SIZE(info->s) + PAGE_SUFFIX_SIZE + + row_count * DIR_ENTRY_SIZE); + if (empty_space < share->bitmap.sizes[3]) + param->lost+= empty_space; + if (check_page_layout(param, info, pos, page_buff, row_count, + empty_space, &real_row_count, &free_count)) + goto err; + full_dir= (row_count == MAX_ROWS_PER_PAGE && + page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST); + break; + case TAIL_PAGE: + row_count= page_buff[DIR_COUNT_OFFSET]; + empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET); + param->used+= block_size - empty_space; + param->link_used+= (PAGE_HEADER_SIZE(info->s) + PAGE_SUFFIX_SIZE + + row_count * DIR_ENTRY_SIZE); + if (empty_space < share->bitmap.sizes[6]) + param->lost+= empty_space; + if (check_page_layout(param, info, pos, page_buff, row_count, + empty_space, &real_row_count, &free_count)) + goto err; + full_dir= (row_count - free_count >= MAX_ROWS_PER_PAGE - + share->base.blobs); + break; + case BLOB_PAGE: + full_page_count++; + full_dir= 0; + empty_space= block_size; /* for error reporting */ + param->link_used+= FULL_PAGE_HEADER_SIZE(info->s); + param->used+= block_size; + break; + } + if (_ma_check_bitmap_data(info, page_type, + full_dir ? 0 : empty_space, + bitmap_for_page)) + { + _ma_check_print_error(param, + "Page %9s: Wrong data in bitmap. 
Page_type: " + "%d full: %d empty_space: %u Bitmap-bits: %d " + "'%s'", + llstr(page, llbuff), page_type, full_dir, + empty_space, bitmap_for_page, + bits_to_txt[bitmap_for_page]); + if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE)) + goto err; + } + if (share->base.born_transactional) + { + lsn= lsn_korr(page_buff); + if ((ulonglong) lsn > param->max_allowed_lsn) + { + /* Avoid flooding of errors */ + if (param->skip_lsn_error_count++ < MAX_LSN_ERRORS) + { + _ma_check_print_error(param, + "Page %9s: Wrong LSN " LSN_FMT ". Current " + "LSN is " LSN_FMT, + llstr(page, llbuff), + LSN_IN_PARTS(lsn), + LSN_IN_PARTS(param->max_allowed_lsn)); + } + } + } + if ((enum en_page_type) page_type == BLOB_PAGE) + continue; + param->empty+= empty_space; + if ((enum en_page_type) page_type == TAIL_PAGE) + { + tail_count+= real_row_count; + continue; + } + if (check_head_page(param, info, record, extend, pos, page_buff, + row_count)) + goto err; + } + + /* Verify that rest of bitmap is zero */ + + if (page % share->bitmap.pages_covered) + { + /* Not at end of bitmap */ + uint bitmap_pattern; + uint byte_offset; + + offset_page= (uint) ((page % share->bitmap.pages_covered) -1) * 3; + offset= offset_page & 7; + byte_offset= offset_page / 8; + data= bitmap_buff + byte_offset; + bitmap_pattern= uint2korr(data); + if (byte_offset + 1 == share->bitmap.max_total_size) + { + /* On last byte of bitmap; Remove possible checksum */ + bitmap_pattern&= 0xff; + } + if (((bitmap_pattern >> offset)) || + (byte_offset + 2 < share->bitmap.max_total_size && + _ma_check_if_zero(data+2, share->bitmap.max_total_size - + byte_offset - 2))) + { + ulonglong bitmap_page; + bitmap_page= page / share->bitmap.pages_covered; + bitmap_page*= share->bitmap.pages_covered; + + _ma_check_print_error(param, + "Bitmap at page %s has pages reserved outside of " + "data file length", + llstr(bitmap_page, llbuff)); + DBUG_EXECUTE("bitmap", _ma_print_bitmap(&share->bitmap, bitmap_buff, + bitmap_page);); + 
} + } + + _ma_scan_end_block_record(info); + + if (full_page_count != param->full_page_count) + _ma_check_print_error(param, "Full page count read through records was %s " + "but we found %s pages while scanning table", + llstr(param->full_page_count, llbuff), + llstr(full_page_count, llbuff2)); + if (tail_count != param->tail_count) + _ma_check_print_error(param, "Tail count read through records was %s but " + "we found %s tails while scanning table", + llstr(param->tail_count, llbuff), + llstr(tail_count, llbuff2)); + + info->s->now_transactional= now_transactional; + return param->error_printed != 0; + +err: + _ma_scan_end_block_record(info); + info->s->now_transactional= now_transactional; + return 1; +} + + +/* Check that record-link is ok */ + +int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info, my_bool extend) +{ + MARIA_SHARE *share= info->s; + int error; + uchar *record; + char llbuff[22],llbuff2[22],llbuff3[22]; + DBUG_ENTER("maria_chk_data_link"); + + if (!(param->testflag & T_SILENT)) + { + if (extend) + puts("- check records and index references"); + else + puts("- check record links"); + } + + if (!(record= (uchar*) my_malloc(PSI_INSTRUMENT_ME, + share->base.default_rec_buff_size, + MYF(param->malloc_flags)))) + { + _ma_check_print_error(param,"Not enough memory for record"); + DBUG_RETURN(-1); + } + param->records= param->del_blocks= 0; + param->used= param->link_used= param->splits= param->del_length= 0; + param->lost= 0; + param->tmp_record_checksum= param->glob_crc= 0; + param->err_count= 0; + + error= 0; + param->empty= share->pack.header_length; + + bzero((char*) param->tmp_key_crc, + share->base.keys * sizeof(param->tmp_key_crc[0])); + + info->in_check_table= 1; /* Don't assert on checksum errors */ + + switch (share->data_file_type) { + case BLOCK_RECORD: + error= check_block_record(param, info, extend, record); + break; + case STATIC_RECORD: + error= check_static_record(param, info, extend, record); + break; + case DYNAMIC_RECORD: + 
error= check_dynamic_record(param, info, extend, record); + break; + case COMPRESSED_RECORD: + error= check_compressed_record(param, info, extend, record); + break; + case NO_RECORD: + param->records= share->state.state.records; + param->record_checksum= 0; + extend= 1; /* No row checksums */ + /* no data, nothing to do */ + break; + } /* switch */ + + info->in_check_table= 0; + + if (error) + goto err; + + if (param->testflag & T_WRITE_LOOP) + { + fputs(" \r",stdout); + fflush(stdout); + } + if (param->records != share->state.state.records) + { + _ma_check_print_error(param, + "Record-count is not ok; found %-10s Should be: %s", + llstr(param->records,llbuff), + llstr(share->state.state.records,llbuff2)); + error=1; + } + if (param->record_checksum && + param->record_checksum != param->tmp_record_checksum) + { + _ma_check_print_error(param, + "Key pointers and record positions doesn't match"); + error=1; + } + if (param->glob_crc != share->state.state.checksum && + (share->options & + (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))) + { + _ma_check_print_warning(param, + "Record checksum is not the same as checksum " + "stored in the index file"); + error=1; + } + if (!extend) + { + uint key; + for (key=0 ; key < share->base.keys; key++) + { + if (param->tmp_key_crc[key] != param->key_crc[key] && + !(share->keyinfo[key].flag & + (HA_FULLTEXT | HA_SPATIAL | HA_RTREE_INDEX))) + { + _ma_check_print_error(param,"Checksum for key: %2d doesn't match " + "checksum for records", + key+1); + error=1; + } + } + } + + if (param->del_length != share->state.state.empty) + { + _ma_check_print_warning(param, + "Found %s deleted space. 
Should be %s", + llstr(param->del_length,llbuff2), + llstr(share->state.state.empty,llbuff)); + } + /* Skip following checks for BLOCK RECORD as they don't make any sence */ + if (share->data_file_type != BLOCK_RECORD) + { + if (param->used + param->empty + param->del_length != + share->state.state.data_file_length) + { + _ma_check_print_warning(param, + "Found %s record data and %s unused data and %s " + "deleted data", + llstr(param->used, llbuff), + llstr(param->empty,llbuff2), + llstr(param->del_length,llbuff3)); + _ma_check_print_warning(param, + "Total %s Should be: %s", + llstr((param->used+param->empty + + param->del_length), llbuff), + llstr(share->state.state.data_file_length, + llbuff2)); + } + if (param->del_blocks != share->state.state.del) + { + _ma_check_print_warning(param, + "Found %10s deleted blocks. Should be: %s", + llstr(param->del_blocks,llbuff), + llstr(share->state.state.del,llbuff2)); + } + if (param->splits != share->state.split) + { + _ma_check_print_warning(param, + "Found %10s parts. Should be: %s", + llstr(param->splits, llbuff), + llstr(share->state.split,llbuff2)); + } + } + if (param->testflag & T_INFO) + { + if (param->warning_printed || param->error_printed) + puts(""); + if (param->used != 0 && ! param->error_printed) + { + if (param->records) + { + printf("Records:%18s M.recordlength:%9lu Packed:%14.0f%%\n", + llstr(param->records,llbuff), + (long)((param->used - param->link_used)/param->records), + (share->base.blobs ? 0.0 : + (ulonglong2double((ulonglong) share->base.reclength * + param->records)- + my_off_t2double(param->used))/ + ulonglong2double((ulonglong) share->base.reclength * + param->records)*100.0)); + printf("Recordspace used:%9.0f%% Empty space:%12d%% " + "Blocks/Record: %6.2f\n", + (ulonglong2double(param->used - param->link_used)/ + ulonglong2double(param->used-param->link_used+param->empty) * + 100.0), + (!param->records ? 
100 : + (int) (ulonglong2double(param->del_length+param->empty)/ + my_off_t2double(param->used)*100.0)), + ulonglong2double(param->splits - param->del_blocks) / + param->records); + } + else + printf("Records:%18s\n", "0"); + } + printf("Record blocks:%12s Delete blocks:%10s\n", + llstr(param->splits - param->del_blocks, llbuff), + llstr(param->del_blocks, llbuff2)); + printf("Record data: %12s Deleted data: %10s\n", + llstr(param->used - param->link_used,llbuff), + llstr(param->del_length, llbuff2)); + printf("Empty space: %12s Linkdata: %10s\n", + llstr(param->empty, llbuff),llstr(param->link_used, llbuff2)); + if (share->data_file_type == BLOCK_RECORD) + { + printf("Full pages: %12s Tail count: %12s\n", + llstr(param->full_page_count, llbuff), + llstr(param->tail_count, llbuff2)); + printf("Lost space: %12s\n", llstr(param->lost, llbuff)); + if (param->max_found_trid) + { + printf("Max trans. id: %11s\n", + llstr(param->max_found_trid, llbuff)); + } + } + } + my_free(record); + DBUG_RETURN (error); + +err: + my_free(record); + param->testflag|=T_RETRY_WITHOUT_QUICK; + DBUG_RETURN(1); +} /* maria_chk_data_link */ + + +/** + Prepares a table for a repair or index sort: flushes pages, records durably + in the table that it is undergoing the operation (if that op crashes, that + info will serve for Recovery and the user). + + If we start overwriting the index file, and crash then, old REDOs will + be tried and fail. To prevent that, we bump skip_redo_lsn, and thus we have + to flush and sync pages so that old REDOs can be skipped. + If this is not a bulk insert, which Recovery can handle gracefully (by + truncating files, see UNDO_BULK_INSERT) we also mark the table + crashed-on-repair, so that user knows it has to re-repair. If bulk insert we + shouldn't mark it crashed-on-repair, because if we did this, the UNDO phase + would skip the table (UNDO_BULK_INSERT would not be applied), + and maria_chk would not improve that. 
+ If this is an OPTIMIZE which merely sorts index, we need to do the same + too: old REDOs should not apply to the new index file. + Only the flush is needed when in maria_chk which is not crash-safe. + + @param info table + @param param repair parameters + @param discard_index if index pages can be thrown away +*/ + +static my_bool protect_against_repair_crash(MARIA_HA *info, + const HA_CHECK *param, + my_bool discard_index) +{ + MARIA_SHARE *share= info->s; + + /* + There are other than recovery-related reasons to do the writes below: + - the physical size of the data file is sometimes used during repair: we + need to flush to have it exact + - we flush the state because maria_open(HA_OPEN_COPY) will want to read + it from disk. + */ + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_FORCE_WRITE, + discard_index ? FLUSH_IGNORE_CHANGED : + FLUSH_FORCE_WRITE) || + (share->changed && + _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO | + MA_STATE_INFO_WRITE_LOCK))) + return TRUE; + /* In maria_chk this is not needed: */ + if (maria_multi_threaded && share->base.born_transactional) + { + if ((param->testflag & T_NO_CREATE_RENAME_LSN) == 0) + { + /* this can be true only for a transactional table */ + maria_mark_in_repair(info); + if (_ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_LOCK)) + return TRUE; + } + if (translog_status == TRANSLOG_OK && + _ma_update_state_lsns(share, translog_get_horizon(), + share->state.create_trid, FALSE, FALSE)) + return TRUE; + if (_ma_sync_table_files(info)) + return TRUE; + } + return FALSE; +} + + +/** + @brief Initialize variables for repair +*/ + +static int initialize_variables_for_repair(HA_CHECK *param, + MARIA_SORT_INFO *sort_info, + MARIA_SORT_PARAM *sort_param, + MARIA_HA *info, + my_bool rep_quick, + MARIA_SHARE *org_share) +{ + MARIA_SHARE *share= info->s; + size_t tmp; + uint threads; + + /* + We have 
to clear these variables first, as the cleanup-in-case-of-error + handling may touch these. + */ + bzero((char*) sort_info, sizeof(*sort_info)); + bzero((char*) sort_param, sizeof(*sort_param)); + bzero(&info->rec_cache, sizeof(info->rec_cache)); + + if (share->data_file_type == NO_RECORD) + { + _ma_check_print_error(param, + "Can't repair tables with record type NO_DATA"); + return 1; + } + + /* Make a copy to allow us to restore state and check how state changed */ + memcpy(org_share, share, sizeof(*share)); + + /* Repair code relies on share->state.state so we have to update it here */ + if (share->lock.update_status) + (*share->lock.update_status)(info->lock.status_param); + + param->testflag|= T_REP; /* for easy checking */ + if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)) + param->testflag|= T_CALC_CHECKSUM; + param->glob_crc= 0; + if (rep_quick) + param->testflag|= T_QUICK; + else + param->testflag&= ~T_QUICK; + param->org_key_map= share->state.key_map; + + /* + Clear check variables set by repair. This is needed to allow one to run + several repair's in a row with same param + */ + param->retry_repair= 0; + param->warning_printed= 0; + param->error_printed= 0; + param->wrong_trd_printed= 0; + + sort_param->sort_info= sort_info; + sort_param->fix_datafile= ! 
rep_quick; + sort_param->calc_checksum= MY_TEST(param->testflag & T_CALC_CHECKSUM); + sort_info->info= sort_info->new_info= info; + sort_info->param= param; + set_data_file_type(sort_info, info->s); + sort_info->org_data_file_type= share->data_file_type; + + info->rec_cache.file= info->dfile.file; + info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + if (protect_against_repair_crash(info, param, + !MY_TEST(param->testflag & + T_CREATE_MISSING_KEYS))) + return 1; + + /* calculate max_records */ + sort_info->filelength= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)); + + param->max_progress= sort_info->filelength; + if ((param->testflag & T_CREATE_MISSING_KEYS) || + sort_info->org_data_file_type == COMPRESSED_RECORD) + sort_info->max_records= share->state.state.records; + else + { + ulong rec_length; + rec_length= MY_MAX(share->base.min_pack_length, + share->base.min_block_length); + sort_info->max_records= (ha_rows) (sort_info->filelength / rec_length); + } + + /* We don't need a bigger sort buffer than file_length * 8 */ + threads= (param->testflag & T_REP_PARALLEL) ? 
(uint) share->base.keys : 1; + tmp= (size_t) MY_MIN(sort_info->filelength, + (my_off_t) (SIZE_T_MAX/10/threads)); + tmp= MY_MAX(tmp * 8 * threads, (size_t) 65536); /* Some margin */ + param->sort_buffer_length= MY_MIN(param->orig_sort_buffer_length, tmp); + set_if_bigger(param->sort_buffer_length, MARIA_MIN_SORT_MEMORY); + /* Protect against too big sort buffer length */ +#if SIZEOF_SIZE_T >= 8 + set_if_smaller(param->sort_buffer_length, 16LL*1024LL*1024LL*1024LL); +#else + set_if_smaller(param->sort_buffer_length, 1L*1024L*1024L*1024L); +#endif + + /* Set up transaction handler so that we can see all rows */ + if (param->max_trid == 0) + { + if (!ma_control_file_inited()) + param->max_trid= 0; /* Give warning for first trid found */ + else + param->max_trid= max_trid_in_system(); + } + maria_ignore_trids(info); + /* Don't write transid's during repair */ + maria_versioning(info, 0); + /* remember original number of rows */ + *info->state= info->s->state.state; + return 0; +} + + +/* + During initialize_variables_for_repair and related functions we set some + variables to values that makes sence during repair. + This function restores these values to their original values so that we can + use the handler in MariaDB without having to close and open the table. +*/ + +static void restore_table_state_after_repair(MARIA_HA *info, + MARIA_SHARE *org_share) +{ + maria_versioning(info, info->s->have_versioning); + info->s->lock_key_trees= org_share->lock_key_trees; + DBUG_ASSERT(!info->s->have_versioning || info->s->lock_key_trees); +} + + +/** + @brief Drop all indexes + + @param[in] param check parameters + @param[in] info MARIA_HA handle + @param[in] force if to force drop all indexes + + @return status + @retval 0 OK + @retval != 0 Error + + @note + Once allocated, index blocks remain part of the key file forever. + When indexes are disabled, no block is freed. When enabling indexes, + no block is freed either. The new indexes are create from new + blocks. 
(Bug #4692) + + Before recreating formerly disabled indexes, the unused blocks + must be freed. There are two options to do this: + - Follow the tree of disabled indexes, add all blocks to the + deleted blocks chain. Would require a lot of random I/O. + - Drop all blocks by clearing all index root pointers and all + delete chain pointers and resetting key_file_length to the end + of the index file header. This requires recreating all indexes, + even those that may still be intact. + The second method is probably faster in most cases. + + When disabling indexes, MySQL disables either all indexes or all + non-unique indexes. When MySQL [re-]enables disabled indexes + (T_CREATE_MISSING_KEYS), then we either have "lost" blocks in the + index file, or there are no non-unique indexes. In the latter case, + maria_repair*() would not be called as there would be no disabled + indexes. + + If there were more unique indexes than disabled (non-unique) + indexes, we could do the first method. But this is not implemented + yet. For now we drop and recreate all indexes when repair is called. + + However, there is an exception. Sometimes MySQL disables non-unique + indexes when the table is empty (e.g. when copying a table in + mysql_alter_table()). When enabling the non-unique indexes, they + are still empty. So there is no index block that can be lost. This + optimization is implemented in this function. + + Note that in normal repair (T_CREATE_MISSING_KEYS not set) we + recreate all enabled indexes unconditionally. We do not change the + key_map. Otherwise we invert the key map temporarily (outside of + this function) and recreate the then "seemingly" enabled indexes. + When we cannot use the optimization, and drop all indexes, we + pretend that all indexes were disabled. By the inversion, we will + then recreate all indexes.
+*/ + +static int maria_drop_all_indexes(HA_CHECK *param, MARIA_HA *info, + my_bool force) +{ + MARIA_SHARE *share= info->s; + MARIA_STATE_INFO *state= &share->state; + uint i; + DBUG_ENTER("maria_drop_all_indexes"); + + /* + If any of the disabled indexes has a key block assigned, we must + drop and recreate all indexes to avoid losing index blocks. + + If we want to recreate disabled indexes only _and_ all of these + indexes are empty, we don't need to recreate the existing indexes. + */ + if (!force && (param->testflag & T_CREATE_MISSING_KEYS)) + { + DBUG_PRINT("repair", ("creating missing indexes")); + for (i= 0; i < share->base.keys; i++) + { + DBUG_PRINT("repair", ("index #: %u key_root:%lld active: %d", + i, state->key_root[i], + maria_is_key_active(state->key_map, i))); + if ((state->key_root[i] != HA_OFFSET_ERROR) && + !maria_is_key_active(state->key_map, i)) + { + /* + This index has at least one key block and it is disabled. + We would lose its block(s) if would just recreate it. + So we need to drop and recreate all indexes. + */ + DBUG_PRINT("repair", ("nonempty and disabled: recreate all")); + break; + } + } + if (i >= share->base.keys) + goto end; + + /* + We do now drop all indexes and declare them disabled. With the + T_CREATE_MISSING_KEYS flag, maria_repair*() will recreate all + disabled indexes and enable them. + */ + maria_clear_all_keys_active(state->key_map); + DBUG_PRINT("repair", ("declared all indexes disabled")); + } + + /* Flush obsolete index data from key cache */ + _ma_flush_table_files(info, MARIA_FLUSH_INDEX, + FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED); + /* Clear index root block pointers. */ + for (i= 0; i < share->base.keys; i++) + state->key_root[i]= HA_OFFSET_ERROR; + + /* Drop the delete chain. */ + share->state.key_del= HA_OFFSET_ERROR; + + /* Reset index file length to end of index file header. 
*/ + share->state.state.key_file_length= share->base.keystart; + +end: + DBUG_RETURN(0); +} + + +/* + Recover old table by reading each record and writing all keys + + NOTES + Save new datafile-name in temp_filename. + We overwrite the index file as we go (writekeys() for example), so if we + crash during this the table is unusable and user (or Recovery in the + future) must repeat the REPAIR/OPTIMIZE operation. We could use a + temporary index file in the future (drawback: more disk space). + + IMPLEMENTATION (for hard repair with block format) + - Create new, unrelated MARIA_HA of the table + - Create new datafile and associate it with new handler + - Reset all statistic information in new handler + - Copy all data to new handler with normal write operations + - Move state of new handler to old handler + - Close new handler + - Close data file in old handler + - Rename old data file to new data file. + - Reopen data file in old handler +*/ + +int maria_repair(HA_CHECK *param, register MARIA_HA *info, + char *name, my_bool rep_quick) +{ + int error, got_error; + ha_rows start_records,new_header_length; + my_off_t del; + File new_file; + MARIA_SHARE *share= info->s; + char llbuff[22],llbuff2[22]; + MARIA_SORT_INFO sort_info; + MARIA_SORT_PARAM sort_param; + my_bool block_record, scan_inited= 0, reenable_logging= 0; + enum data_file_type org_data_file_type= share->data_file_type; + myf sync_dir= ((share->now_transactional && !share->temporary) ? 
+ MY_SYNC_DIR : 0); + MARIA_SHARE backup_share; + DBUG_ENTER("maria_repair"); + + got_error= 1; + new_file= -1; + start_records= share->state.state.records; + if (!(param->testflag & T_SILENT)) + { + printf("- recovering (with keycache) Aria-table '%s'\n",name); + printf("Data records: %s\n", llstr(start_records, llbuff)); + } + + if (initialize_variables_for_repair(param, &sort_info, &sort_param, info, + rep_quick, &backup_share)) + goto err; + + if ((reenable_logging= share->now_transactional)) + _ma_tmp_disable_logging_for_table(info, 0); + + sort_param.current_filepos= sort_param.filepos= new_header_length= + ((param->testflag & T_UNPACK) ? 0L : share->pack.header_length); + + if (!rep_quick) + { + /* Get real path for data file */ + if ((new_file= mysql_file_create(key_file_tmp, + fn_format(param->temp_filename, + share->data_file_name.str, "", + DATA_TMP_EXT, 2+4), + 0,param->tmpfile_createflag, + MYF(0))) < 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (new_header_length && + maria_filecopy(param, new_file, info->dfile.file, 0L, + new_header_length, "datafile-header")) + goto err; + share->state.dellink= HA_OFFSET_ERROR; + info->rec_cache.file= new_file; /* For sort_delete_record */ + if (share->data_file_type == BLOCK_RECORD || + (param->testflag & T_UNPACK)) + { + if (create_new_data_handle(&sort_param, new_file)) + goto err; + sort_info.new_info->rec_cache.file= new_file; + } + } + + block_record= sort_info.new_info->s->data_file_type == BLOCK_RECORD; + + if (org_data_file_type != BLOCK_RECORD) + { + /* We need a read buffer to read rows in big blocks */ + if (init_io_cache(¶m->read_cache, info->dfile.file, + (uint) param->read_buffer_length, + READ_CACHE, share->pack.header_length, 1, MYF(MY_WME))) + goto err; + } + if (sort_info.new_info->s->data_file_type != BLOCK_RECORD) + { + /* When writing to not block records, we need a write buffer */ + if (!rep_quick) + { + if 
(init_io_cache(&sort_info.new_info->rec_cache, new_file, + (uint) param->write_buffer_length, + WRITE_CACHE, new_header_length, 1, + MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw)) + goto err; + sort_info.new_info->opt_flag|=WRITE_CACHE_USED; + } + } + else if (block_record) + { + scan_inited= 1; + if (maria_scan_init(sort_info.info)) + goto err; + } + + if (!(sort_param.record= + (uchar *) my_malloc(PSI_INSTRUMENT_ME, (uint) + share->base.default_rec_buff_size, + MYF(param->malloc_flags))) || + _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size, + share->base.default_rec_buff_size, + MYF(param->malloc_flags))) + { + _ma_check_print_error(param, "Not enough memory for extra record"); + goto err; + } + + sort_param.read_cache=param->read_cache; + sort_param.pos=sort_param.max_pos=share->pack.header_length; + param->read_cache.end_of_file= sort_info.filelength; + sort_param.master=1; + sort_info.max_records= ~(ha_rows) 0; + + del= share->state.state.del; + share->state.state.records= share->state.state.del= share->state.split= 0; + share->state.state.empty= 0; + + if (param->testflag & T_CREATE_MISSING_KEYS) + maria_set_all_keys_active(share->state.key_map, share->base.keys); + maria_drop_all_indexes(param, info, TRUE); + + maria_lock_memory(param); /* Everything is alloced */ + + sort_param.sort_info->info->in_check_table= 1; + /* Re-create all keys, which are set in key_map. 
*/ + while (!(error=sort_get_next_record(&sort_param))) + { + if (block_record && _ma_sort_write_record(&sort_param)) + goto err; + + if (writekeys(&sort_param)) + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY) + goto err; + DBUG_DUMP("record", sort_param.record, + share->base.default_rec_buff_size); + _ma_check_print_warning(param, + "Duplicate key %2d for record at %10s against " + "new record at %10s", + info->errkey+1, + record_pos_to_txt(info, + sort_param.current_filepos, + llbuff), + record_pos_to_txt(info, + info->dup_key_pos, llbuff2)); + if (param->testflag & T_VERBOSE) + { + MARIA_KEY tmp_key; + MARIA_KEYDEF *keyinfo= share->keyinfo + info->errkey; + (*keyinfo->make_key)(info, &tmp_key, (uint) info->errkey, + info->lastkey_buff, + sort_param.record, 0L, 0); + _ma_print_key(stdout, &tmp_key); + } + sort_info.dupp++; + if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK) + { + param->testflag|=T_RETRY_WITHOUT_QUICK; + param->error_printed++; + goto err; + } + /* purecov: begin tested */ + if (block_record) + { + sort_info.new_info->s->state.state.records--; + if ((*sort_info.new_info->s->write_record_abort)(sort_info.new_info)) + { + _ma_check_print_error(param,"Couldn't delete duplicate row"); + goto err; + } + } + /* purecov: end */ + continue; + } + if (!block_record) + { + if (_ma_sort_write_record(&sort_param)) + goto err; + /* Filepos is pointer to where next row will be stored */ + sort_param.current_filepos= sort_param.filepos; + } + } + if (error > 0 || maria_write_data_suffix(&sort_info, !rep_quick) || + flush_io_cache(&sort_info.new_info->rec_cache) || + param->read_cache.error < 0) + goto err; + + if (param->testflag & T_WRITE_LOOP) + { + fputs(" \r",stdout); fflush(stdout); + } + if (mysql_file_chsize(share->kfile.file, + share->state.state.key_file_length, 0, MYF(0))) + { + _ma_check_print_warning(param, + "Can't change size of indexfile, error: %d", + my_errno); + goto err; + } + + if (rep_quick && del+sort_info.dupp != 
share->state.state.del) + { + _ma_check_print_error(param,"Couldn't fix table with quick recovery: " + "Found wrong number of deleted records"); + _ma_check_print_error(param,"Run recovery again without -q"); + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + goto err; + } + + if (param->testflag & T_SAFE_REPAIR) + { + /* Don't repair if we loosed more than one row */ + if (sort_info.new_info->s->state.state.records+1 < start_records) + { + share->state.state.records= start_records; + goto err; + } + } + + end_io_cache(&sort_info.new_info->rec_cache); + info->opt_flag&= ~WRITE_CACHE_USED; + + /* + As we have read the data file (sort_get_next_record()) we may have + cached, non-changed blocks of it in the page cache. We must throw them + away as we are going to close their descriptor ('new_file'). We also want + to flush any index block, so that it is ready for the upcoming sync. + */ + if (_ma_flush_table_files_before_swap(param, info)) + goto err; + + if (!rep_quick) + { + sort_info.new_info->s->state.state.data_file_length= sort_param.filepos; + if (sort_info.new_info != sort_info.info) + { + MARIA_STATE_INFO save_state= sort_info.new_info->s->state; + if (maria_close(sort_info.new_info)) + { + _ma_check_print_error(param, "Got error %d on close", my_errno); + goto err; + } + copy_data_file_state(&share->state, &save_state); + new_file= -1; + sort_info.new_info= info; + } + share->state.version=(ulong) time((time_t*) 0); /* Force reopen */ + + /* Replace the actual file with the temporary file */ + if (new_file >= 0) + mysql_file_close(new_file, MYF(MY_WME)); + new_file= -1; + change_data_file_descriptor(info, -1); + if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT, + DATA_TMP_EXT, param->backup_time, + (param->testflag & T_BACKUP_DATA ? 
+ MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) | + sync_dir) || + _ma_open_datafile(info, share)) + { + goto err; + } + } + else + { + share->state.state.data_file_length= sort_param.max_pos; + } + if (param->testflag & T_CALC_CHECKSUM) + share->state.state.checksum= param->glob_crc; + + if (!(param->testflag & T_SILENT)) + { + if (start_records != share->state.state.records) + printf("Data records: %s\n", llstr(share->state.state.records,llbuff)); + } + if (sort_info.dupp) + _ma_check_print_warning(param, + "%s records have been removed", + llstr(sort_info.dupp,llbuff)); + + got_error= 0; + /* If invoked by external program that uses thr_lock */ + if (&share->state.state != info->state) + *info->state= *info->state_start= share->state.state; + +err: + if (scan_inited) + maria_scan_end(sort_info.info); + _ma_reset_state(info); + + end_io_cache(¶m->read_cache); + if (sort_info.new_info) + { + end_io_cache(&sort_info.new_info->rec_cache); + sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + } + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + + sort_param.sort_info->info->in_check_table= 0; + /* this below could fail, shouldn't we detect error? */ + if (got_error) + { + if (! 
param->error_printed) + _ma_check_print_error(param,"%d for record at pos %s",my_errno, + llstr(sort_param.start_recpos,llbuff)); + (void)_ma_flush_table_files_before_swap(param, info); + if (sort_info.new_info && sort_info.new_info != sort_info.info) + { + unuse_data_file_descriptor(sort_info.new_info); + maria_close(sort_info.new_info); + } + if (new_file >= 0) + { + mysql_file_close(new_file,MYF(0)); + mysql_file_delete(key_file_tmp, param->temp_filename, MYF(MY_WME)); + } + maria_mark_crashed_on_repair(info); + } + /* If caller had disabled logging it's not up to us to re-enable it */ + if (reenable_logging) + _ma_reenable_logging_for_table(info, FALSE); + restore_table_state_after_repair(info, &backup_share); + + my_free(sort_param.rec_buff); + my_free(sort_param.record); + my_free(sort_info.buff); + if (!got_error && (param->testflag & T_UNPACK)) + restore_data_file_type(share); + share->state.changed|= (STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES | + STATE_NOT_ANALYZED | STATE_NOT_ZEROFILLED); + if (!rep_quick) + share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_MOVABLE); + DBUG_RETURN(got_error); +}


/*
  Update keyfile when doing repair

  Inserts the keys of the current row (sort_param->record, stored at
  sort_param->current_filepos) into every index that is active in
  share->state.key_map.

  If an insert fails with HA_ERR_FOUND_DUPP_KEY, info->errkey is set to the
  number of the failing index and the keys that were already inserted into
  the lower-numbered indexes are removed again.  On any failure the row
  checksum is subtracted from glob_crc when checksums are being calculated.

  RETURN
    0   ok
    -1  error (my_errno is left as set by the failing call)
*/

static int writekeys(MARIA_SORT_PARAM *sort_param)
{
  uint i;
  MARIA_HA *info= sort_param->sort_info->info;
  MARIA_SHARE *share= info->s;
  uchar *record= sort_param->record;
  uchar *key_buff;
  my_off_t filepos= sort_param->current_filepos;
  MARIA_KEY key;
  DBUG_ENTER("writekeys");

  /* Build keys in the second half of the lastkey buffer */
  key_buff= info->lastkey_buff+share->base.max_key_length;

  for (i=0 ; i < share->base.keys ; i++)
  {
    if (maria_is_key_active(share->state.key_map, i))
    {
      if (share->keyinfo[i].flag & HA_FULLTEXT )
      {
        if (_ma_ft_add(info, i, key_buff, record, filepos))
          goto err;
      }
      else
      {
        if (!(*share->keyinfo[i].make_key)(info, &key, i, key_buff, record,
                                           filepos, 0))
          goto err;
        if ((*share->keyinfo[i].ck_insert)(info, &key))
          goto err;
      }
    }
  }
  DBUG_RETURN(0);

 err:
  if (my_errno == HA_ERR_FOUND_DUPP_KEY)
  {
    info->errkey=(int) i;                       /* This key was found */
    /* Undo the inserts done for indexes 0 .. i-1 */
    while ( i-- > 0 )
    {
      if (maria_is_key_active(share->state.key_map, i))
      {
        if (share->keyinfo[i].flag & HA_FULLTEXT)
        {
          if (_ma_ft_del(info,i,key_buff,record,filepos))
            break;
        }
        else
        {
          (*share->keyinfo[i].make_key)(info, &key, i, key_buff, record,
                                        filepos, 0);
          if (_ma_ck_delete(info, &key))
            break;
        }
      }
    }
  }
  /* Remove checksum that was added to glob_crc in sort_get_next_record */
  if (sort_param->calc_checksum)
    sort_param->sort_info->param->glob_crc-= info->cur_row.checksum;
  DBUG_PRINT("error",("errno: %d",my_errno));
  DBUG_RETURN(-1);
} /* writekeys */


/*
  Change all key pointers that point to a record

  For every active index except prot_key, the key built from 'record' at
  'oldpos' is redirected to 'newpos':
  - for HA_NOSAME keys the key is looked up and the record pointer stored in
    the key page is overwritten in place;
  - for other keys the old key is deleted and a new key for 'newpos' is
    written.

  RETURN
    0   ok
    -1  error
*/

int maria_movepoint(register MARIA_HA *info, uchar *record,
                    MARIA_RECORD_POS oldpos, MARIA_RECORD_POS newpos,
                    uint prot_key)
{
  uint i;
  uchar *key_buff;
  MARIA_SHARE *share= info->s;
  MARIA_PAGE page;
  DBUG_ENTER("maria_movepoint");

  key_buff= info->lastkey_buff + share->base.max_key_length;
  for (i=0 ; i < share->base.keys; i++)
  {
    if (i != prot_key && maria_is_key_active(share->state.key_map, i))
    {
      MARIA_KEY key;
      (*share->keyinfo[i].make_key)(info, &key, i, key_buff, record, oldpos,
                                    0);
      if (key.keyinfo->flag & HA_NOSAME)
      {                                         /* Change pointer direct */
        MARIA_KEYDEF *keyinfo;
        keyinfo=share->keyinfo+i;
        if (_ma_search(info, &key, (uint32) (SEARCH_SAME | SEARCH_SAVE_BUFF),
                       share->state.key_root[i]))
          DBUG_RETURN(-1);
        _ma_page_setup(&page, info, keyinfo, info->last_keypage,
                       info->keyread_buff);

        _ma_dpointer(share, info->int_keypos - page.node -
                     share->rec_reflength,newpos);

        if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_UNLOCKED,
                              DFLT_INIT_HITS))
          DBUG_RETURN(-1);
      }
      else
      {                                         /* Change old key to new */
        if (_ma_ck_delete(info, &key))
          DBUG_RETURN(-1);
        (*share->keyinfo[i].make_key)(info, &key, i, key_buff, record, newpos,
                                      0);
        if (_ma_ck_write(info, &key))
          DBUG_RETURN(-1);
      }
    }
  }
  DBUG_RETURN(0);
} /* maria_movepoint */


/*
  Tell system that we want all memory for our cache

  This is a no-op unless SUN_OS is defined.  In that case, if
  param->opt_maria_lock_memory is set, mlockall(MCL_CURRENT) is called and a
  warning is printed when it fails while running with effective uid 0.
*/

void maria_lock_memory(HA_CHECK *param __attribute__((unused)))
{
#ifdef SUN_OS                           /* Key caching thrashes on sun 4.1 */
  if (param->opt_maria_lock_memory)
  {
    int success = mlockall(MCL_CURRENT);        /* or plock(DATLOCK); */
    if (geteuid() == 0 && success != 0)
      _ma_check_print_warning(param,
                              "Failed to lock memory. errno %d",my_errno);
  }
#endif
} /* maria_lock_memory */


/**
   Flush all changed blocks to disk.

   We release blocks as it's unlikely that they would all be needed soon.
   This function needs to be called before swapping data or index files or
   syncing them.

   @param  param           description of the repair operation
   @param  info            table

   @return
     @retval FALSE  ok
     @retval TRUE   flush failed (an error has been printed through param)
*/

static my_bool _ma_flush_table_files_before_swap(HA_CHECK *param,
                                                 MARIA_HA *info)
{
  DBUG_ENTER("_ma_flush_table_files_before_swap");
  if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                            FLUSH_RELEASE, FLUSH_RELEASE))
  {
    _ma_check_print_error(param, "%d when trying to write buffers", my_errno);
    DBUG_RETURN(TRUE);
  }
  DBUG_RETURN(FALSE);
}


/*
  Sort index for more efficient reads

  Writes every active index, page by page in tree order (see
  sort_one_index()), to a temporary index file and then lets that file
  replace the original index file.  Nothing is done (and 0 is returned) if
  the table has any R-tree index.

  RETURN
    0      ok
    1      protect_against_repair_crash() failed
    -1     other error
*/

int maria_sort_index(HA_CHECK *param, register MARIA_HA *info, char *name)
{
  reg2 uint key;
  reg1 MARIA_KEYDEF *keyinfo;
  File new_file;
  my_off_t index_pos[HA_MAX_POSSIBLE_KEY];
  uint r_locks,w_locks;
  int old_lock;
  MARIA_SHARE *share= info->s;
  MARIA_STATE_INFO old_state;
  myf sync_dir= ((share->now_transactional && !share->temporary) ?
                 MY_SYNC_DIR : 0);
  DBUG_ENTER("maria_sort_index");

  /* cannot sort index files with R-tree indexes */
  for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
       key++,keyinfo++)
    if (keyinfo->key_alg == HA_KEY_ALG_RTREE)
      DBUG_RETURN(0);

  if (!(param->testflag & T_SILENT))
    printf("- Sorting index for Aria-table '%s'\n",name);

  if (protect_against_repair_crash(info, param, FALSE))
    DBUG_RETURN(1);

  /* Get real path for index file */
  fn_format(param->temp_filename,name,"", MARIA_NAME_IEXT,2+4+32);
  if ((new_file=mysql_file_create(key_file_kfile,
                                  fn_format(param->temp_filename,
                                            param->temp_filename,
                                            "", INDEX_TMP_EXT,2+4),
                                  0, param->tmpfile_createflag, MYF(0))) < 0)
  {
    _ma_check_print_error(param,"Can't create new tempfile: '%s'",
                          param->temp_filename);
    DBUG_RETURN(-1);
  }
  if (maria_filecopy(param, new_file, share->kfile.file, 0L,
                     (ulong) share->base.keystart, "headerblock"))
    goto err;

  param->new_file_pos=share->base.keystart;
  for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
       key++,keyinfo++)
  {
    if (maria_is_key_active(share->state.key_map, key) &&
        share->state.key_root[key] != HA_OFFSET_ERROR)
    {
      index_pos[key]=param->new_file_pos;       /* Write first block here */
      if (sort_one_index(param,info,keyinfo,share->state.key_root[key],
                         new_file))
        goto err;
    }
    else
      index_pos[key]= HA_OFFSET_ERROR;          /* No blocks */
  }

  /* Flush key cache for this file if we are calling this outside maria_chk */
  flush_pagecache_blocks(share->pagecache, &share->kfile,
                         FLUSH_IGNORE_CHANGED);

  share->state.version=(ulong) time((time_t*) 0);
  old_state= share->state;                      /* save state if not stored */
  r_locks= share->r_locks;
  w_locks= share->w_locks;
  old_lock= info->lock_type;

  /* Put same locks as old file */
  share->r_locks= share->w_locks= share->tot_locks= 0;
  (void) _ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE);
  mysql_mutex_lock(&share->intern_lock);
  mysql_file_close(share->kfile.file, MYF(MY_WME));
  share->kfile.file = -1;
  mysql_mutex_unlock(&share->intern_lock);
  mysql_file_close(new_file, MYF(MY_WME));
  if (maria_change_to_newfile(share->index_file_name.str, MARIA_NAME_IEXT,
                              INDEX_TMP_EXT, 0, sync_dir) ||
      _ma_open_keyfile(share))
    goto err2;
  info->lock_type= F_UNLCK;                     /* Force maria_readinfo to lock */
  _ma_readinfo(info,F_WRLCK,0);                 /* Will lock the table */
  info->lock_type= old_lock;
  share->r_locks= r_locks;
  share->w_locks= w_locks;
  share->tot_locks= r_locks+w_locks;
  share->state= old_state;                      /* Restore old state */

  share->state.state.key_file_length=param->new_file_pos;
  info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
  for (key=0 ; key < share->base.keys ; key++)
    share->state.key_root[key]=index_pos[key];
  share->state.key_del= HA_OFFSET_ERROR;

  share->state.changed&= ~STATE_NOT_SORTED_PAGES;
  DBUG_EXECUTE_IF("maria_flush_whole_log",
                  {
                    DBUG_PRINT("maria_flush_whole_log", ("now"));
                    translog_flush(translog_get_horizon());
                  });
  DBUG_EXECUTE_IF("maria_crash_sort_index",
                  {
                    DBUG_PRINT("maria_crash_sort_index", ("now"));
                    DBUG_SUICIDE();
                  });
  DBUG_RETURN(0);

err:
  mysql_file_close(new_file, MYF(MY_WME));
err2:
  /* new_file is already closed when we arrive here through err2 */
  mysql_file_delete(key_file_tmp, param->temp_filename,MYF(MY_WME));
  DBUG_RETURN(-1);
} /* maria_sort_index */


/**
  @brief write a page directly to index file

  Calls the index file's pre-write hook on the page, writes block_size bytes
  of the (possibly hook-replaced) args.page at offset pos with my_pwrite()
  and then calls the post-write hook with the write result.

  @return result of my_pwrite() cast to int
*/

static int write_page(MARIA_SHARE *share, File file,
                      uchar *buff, uint block_size,
                      my_off_t pos, int myf_rw)
{
  int res;
  PAGECACHE_IO_HOOK_ARGS args;
  args.page= buff;
  args.pageno= (pgcache_page_no_t) (pos / share->block_size);
  args.data= (uchar*) share;
  args.crypt_buf= NULL;
  (* share->kfile.pre_write_hook)(&args);
  res= (int)my_pwrite(file, args.page, block_size, pos, myf_rw);
  (* share->kfile.post_write_hook)(res, &args);
  return res;
}


/*
  Sort index blocks recursive using one index

  Reserves the next position in the new index file for the page at
  'pagepos', rewrites the child pointers in the page to the positions the
  children will get, recurses into each child (and into second-level
  fulltext trees), and finally writes the zero-padded page to new_file.
  param->new_file_pos is advanced by keyinfo->block_length per written page.

  RETURN
    0        ok
    1 or -1  error
*/

static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
                          MARIA_KEYDEF *keyinfo,
                          my_off_t pagepos, File new_file)
{
  uint length,nod_flag;
  uchar *buff,*keypos,*endpos;
  my_off_t new_page_pos,next_page;
  MARIA_SHARE *share= info->s;
  MARIA_KEY key;
  MARIA_PAGE page;
  my_bool buff_alloced;
  DBUG_ENTER("sort_one_index");

  /* cannot walk over R-tree indices */
  DBUG_ASSERT(keyinfo->key_alg != HA_KEY_ALG_RTREE);
  new_page_pos=param->new_file_pos;
  param->new_file_pos+=keyinfo->block_length;
  key.keyinfo= keyinfo;

  /* One buffer holds the page followed by room for one unpacked key */
  alloc_on_stack(*param->stack_end_ptr, buff, buff_alloced,
                 keyinfo->block_length + keyinfo->max_store_length);
  if (!buff)
  {
    _ma_check_print_error(param,"Not enough memory for keyblock");
    DBUG_RETURN(-1);
  }
  key.data= buff + keyinfo->block_length;

  if (_ma_fetch_keypage(&page, info, keyinfo, pagepos,
                        PAGECACHE_LOCK_LEFT_UNLOCKED,
                        DFLT_INIT_HITS, buff, 0))
  {
    report_keypage_fault(param, info, pagepos);
    goto err;
  }

  if ((nod_flag= page.node) || keyinfo->flag & HA_FULLTEXT)
  {
    keypos= page.buff + share->keypage_header + nod_flag;
    endpos= page.buff + page.size;

    for ( ;; )
    {
      if (nod_flag)
      {
        next_page= _ma_kpos(nod_flag,keypos);
        /* Save new pos */
        _ma_kpointer(info,keypos-nod_flag,param->new_file_pos);
        if (sort_one_index(param,info,keyinfo,next_page,new_file))
        {
          DBUG_PRINT("error",
                     ("From page: %ld, keyoffset: %lu used_length: %d",
                      (ulong) pagepos, (ulong) (keypos - buff),
                      (int) page.size));
          DBUG_DUMP("buff", page.buff, page.size);
          goto err;
        }
      }
      if (keypos >= endpos ||
          !(*keyinfo->get_key)(&key, page.flag, nod_flag, &keypos))
        break;
      DBUG_ASSERT(keypos <= endpos);
      if (keyinfo->flag & HA_FULLTEXT)
      {
        uint off;
        int subkeys;
        get_key_full_length_rdonly(off, key.data);
        subkeys= ft_sintXkorr(key.data + off);
        if (subkeys < 0)
        {
          /* Negative count: the row position is the root of a sub-tree */
          next_page= _ma_row_pos_from_key(&key);
          _ma_dpointer(share, keypos - nod_flag - share->rec_reflength,
                       param->new_file_pos);    /* Save new pos */
          if (sort_one_index(param,info,&share->ft2_keyinfo,
                             next_page,new_file))
            goto err;
        }
      }
    }
  }

  /* Fill block with zero and write it to the new index file */
  length= page.size;
  bzero(buff+length,keyinfo->block_length-length);
  if (write_page(share, new_file, buff, keyinfo->block_length,
                 new_page_pos, MYF(MY_NABP | MY_WAIT_IF_FULL)))
  {
    _ma_check_print_error(param,"Can't write indexblock, error: %d",my_errno);
    goto err;
  }
  stack_alloc_free(buff, buff_alloced);
  DBUG_RETURN(0);
err:
  stack_alloc_free(buff, buff_alloced);
  DBUG_RETURN(1);
} /* sort_one_index */


/**
  @brief Fill empty space in index file with zeroes

  Walks all pages of the index file from share->base.keystart up to
  key_file_length.  For each page the LSN is optionally zeroed, pages of
  born-transactional tables that belong to a known key are compacted, and
  the part of the page after its used length is zero-filled.  Each page is
  read write-locked through the page cache and is unlocked again on every
  exit path, including the error paths.

  @return
  @retval 0  Ok
  @retval 1  Error
*/

static my_bool maria_zerofill_index(HA_CHECK *param, MARIA_HA *info,
                                    const char *name)
{
  MARIA_SHARE *share= info->s;
  MARIA_PINNED_PAGE page_link;
  char llbuff[21];
  uchar *buff;
  pgcache_page_no_t page;
  my_off_t pos;
  my_off_t key_file_length= share->state.state.key_file_length;
  uint block_size= share->block_size;
  my_bool zero_lsn= (share->base.born_transactional &&
                     !(param->testflag & T_ZEROFILL_KEEP_LSN));
  int error= 1;
  enum pagecache_page_type page_type= (share->base.born_transactional ?
                                       PAGECACHE_LSN_PAGE :
                                       PAGECACHE_PLAIN_PAGE);
  DBUG_ENTER("maria_zerofill_index");

  if (!(param->testflag & T_SILENT))
    printf("- Zerofilling index for Aria-table '%s'\n",name);

  /* Go through the index file */
  for (pos= share->base.keystart, page= (ulonglong) (pos / block_size);
       pos < key_file_length;
       pos+= block_size, page++)
  {
    uint length;
    if (!(buff= pagecache_read(share->pagecache,
                               &share->kfile, page,
                               DFLT_INIT_HITS, 0,
                               page_type, PAGECACHE_LOCK_WRITE,
                               &page_link.link)))
    {
      pagecache_unlock_by_link(share->pagecache, page_link.link,
                               PAGECACHE_LOCK_WRITE_UNLOCK,
                               PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                               LSN_IMPOSSIBLE, 0, FALSE);
      _ma_check_print_error(param,
                            "Page %9s: Got error %d when reading index file",
                            llstr(pos, llbuff), my_errno);
      goto end;
    }
    if (zero_lsn)
      bzero(buff, LSN_SIZE);

    if (share->base.born_transactional)
    {
      uint keynr= _ma_get_keynr(share, buff);
      if (keynr < share->base.keys)
      {
        MARIA_PAGE key_page;
        DBUG_ASSERT(keynr < share->base.keys);

        _ma_page_setup(&key_page, info, share->keyinfo + keynr, pos, buff);
        if (_ma_compact_keypage(&key_page, ~(TrID) 0))
        {
          /*
            Release the write lock and pin taken by pagecache_read() before
            giving up; otherwise the page would stay locked when we flush
            the file below.
          */
          pagecache_unlock_by_link(share->pagecache, page_link.link,
                                   PAGECACHE_LOCK_WRITE_UNLOCK,
                                   PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                   LSN_IMPOSSIBLE, 0, FALSE);
          _ma_check_print_error(param,
                                "Page %9s: Got error %d when reading index "
                                "file",
                                llstr(pos, llbuff), my_errno);
          goto end;
        }
      }
    }

    length= _ma_get_page_used(share, buff);
    DBUG_ASSERT(length <= block_size);
    if (length < block_size)
      bzero(buff + length, block_size - length);
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 1, FALSE);
  }
  error= 0;                                     /* ok */

end:
  if (flush_pagecache_blocks(share->pagecache, &share->kfile,
                             FLUSH_FORCE_WRITE))
    DBUG_RETURN(1);
  DBUG_RETURN(error);
}


/**
  @brief Fill empty space in data file with zeroes

  Only does something for BLOCK_RECORD tables.  Every non-bitmap page from
  page 1 up to data_file_length is read write-locked; depending on its page
  type it is fully zeroed (unallocated pages), has its LSN zeroed, or is
  compacted with the unused area between the row data and the directory
  zero-filled.  The bitmap is updated for head and tail pages.

  @todo
  Zerofill all pages marked in bitmap as empty and change them to
  be of type UNALLOCATED_PAGE

  @return
  @retval 0  Ok
  @retval 1  Error
*/

static my_bool maria_zerofill_data(HA_CHECK *param, MARIA_HA *info,
                                   const char *name)
{
  MARIA_SHARE *share= info->s;
  MARIA_PINNED_PAGE page_link;
  char llbuff[21];
  my_off_t pos;
  pgcache_page_no_t page;
  uint block_size= share->block_size;
  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
  my_bool zero_lsn= !(param->testflag & T_ZEROFILL_KEEP_LSN), error;
  enum pagecache_page_type read_page_type= (share->base.born_transactional ?
                                            PAGECACHE_LSN_PAGE :
                                            PAGECACHE_PLAIN_PAGE);
  DBUG_ENTER("maria_zerofill_data");

  /* This works only with BLOCK_RECORD files */
  if (share->data_file_type != BLOCK_RECORD)
    DBUG_RETURN(0);

  if (!(param->testflag & T_SILENT))
    printf("- Zerofilling data for Aria-table '%s'\n",name);

  /* Go through the record file */
  for (page= 1, pos= block_size;
       pos < share->state.state.data_file_length;
       pos+= block_size, page++)
  {
    uchar *buff;
    enum en_page_type page_type;

    /* Ignore bitmap pages */
    if ((page % share->bitmap.pages_covered) == 0)
      continue;
    if (!(buff= pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 1, 0,
                               read_page_type, PAGECACHE_LOCK_WRITE,
                               &page_link.link)))
    {
      _ma_check_print_error(param,
                            "Page %9s: Got error: %d when reading datafile",
                            llstr(pos, llbuff), my_errno);
      goto err;
    }
    page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
    switch (page_type) {
    case UNALLOCATED_PAGE:
      if (zero_lsn)
        bzero(buff, block_size);
      else
        bzero(buff + LSN_SIZE, block_size - LSN_SIZE);
      break;
    case BLOB_PAGE:
      if (_ma_bitmap_get_page_bits(info, bitmap, page) == 0)
      {
        /* Unallocated page */
        if (zero_lsn)
          bzero(buff, block_size);
        else
          bzero(buff + LSN_SIZE, block_size - LSN_SIZE);
      }
      else
        if (zero_lsn)
          bzero(buff, LSN_SIZE);
      break;
    case HEAD_PAGE:
    case TAIL_PAGE:
    {
      uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
      uint offset, dir_start, empty_space;
      uchar *dir;

      if (zero_lsn)
        bzero(buff, LSN_SIZE);
      if (max_entry != 0)
      {
        my_bool is_head_page= (page_type == HEAD_PAGE);
        dir= dir_entry_pos(buff, block_size, max_entry - 1);
        _ma_compact_block_page(share,
                               buff, max_entry -1, 0,
                               is_head_page ? ~(TrID) 0 : 0,
                               is_head_page ?
                               share->base.min_block_length : 0);

        /* compaction may have increased free space */
        empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
        if (!enough_free_entries_on_page(share, buff))
          empty_space= 0;                       /* Page is full */
        if (_ma_bitmap_set(info, page, is_head_page,
                           empty_space))
          goto err;

        /* Zerofill the not used part */
        offset= uint2korr(dir) + uint2korr(dir+2);
        dir_start= (uint) (dir - buff);
        DBUG_ASSERT(dir_start >= offset);
        if (dir_start > offset)
          bzero(buff + offset, dir_start - offset);
      }
      break;
    }
    default:
      _ma_check_print_error(param,
                            "Page %9s: Found unrecognizable block of type %d",
                            llstr(pos, llbuff), page_type);
      goto err;
    }
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 1, FALSE);
  }
  error= _ma_bitmap_flush(share);
  if (flush_pagecache_blocks(share->pagecache, &info->dfile,
                             FLUSH_FORCE_WRITE))
    error= 1;
  DBUG_RETURN(error);

err:
  /* Release the page that was being processed when the error occurred */
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           PAGECACHE_LOCK_WRITE_UNLOCK,
                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 0, FALSE);
  /* flush what was changed so far */
  (void) _ma_bitmap_flush(share);
  (void) flush_pagecache_blocks(share->pagecache, &info->dfile,
                                FLUSH_FORCE_WRITE);

  DBUG_RETURN(1);
}


/**
  @brief Fill empty space in index and data files with zeroes

  Logging is temporarily disabled for the table while this runs if the table
  is currently transactional.  On success the table state is updated to
  reflect the zerofill (and, if LSNs were zeroed, that the table is movable
  and needs new state LSNs).

  @return
  @retval 0  Ok
  @retval 1  Error
*/

int maria_zerofill(HA_CHECK *param, MARIA_HA *info, const char *name)
{
  my_bool error, reenable_logging,
    zero_lsn= !(param->testflag & T_ZEROFILL_KEEP_LSN);
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("maria_zerofill");
  if ((reenable_logging= share->now_transactional))
    _ma_tmp_disable_logging_for_table(info, 0);
  if (!(error= (maria_zerofill_index(param, info, name) ||
                maria_zerofill_data(param, info, name) ||
                _ma_set_uuid(info->s, 0))))
  {
    /*
      Mark that we have done zerofill of data and index. If we zeroed pages'
      LSN, table is movable.
    */
    share->state.changed&= ~STATE_NOT_ZEROFILLED;
    if (zero_lsn)
    {
      share->state.changed&= ~(STATE_NOT_MOVABLE | STATE_MOVED);
      /* Table should get new LSNs */
      share->state.create_rename_lsn= share->state.is_of_horizon=
        share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
    }
    /* Ensure state is later flushed to disk, if within maria_chk */
    info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);

    /*
      Reset create_trid to make file comparable and to ensure that new
      trid's in the file starts from 0.
    */
    share->state.create_trid= 0;
  }
  if (reenable_logging)
    _ma_reenable_logging_for_table(info, FALSE);
  DBUG_RETURN(error);
}


/*
  Let temporary file replace old file.
  This assumes that the new file was created in the same
  directory as given by realpath(filename).
  This will ensure that any symlinks that are used will still work.
  Copy stats from old file to new file, deletes original and
  changes new file name to old file name

  RETURN
    Result of my_redel()
*/

int maria_change_to_newfile(const char * filename, const char * old_ext,
                            const char * new_ext, time_t backup_time,
                            myf MyFlags)
{
  char old_filename[FN_REFLEN],new_filename[FN_REFLEN];
  /* Get real path to filename */
  (void) fn_format(old_filename,filename,"",old_ext,2+4+32);
  return my_redel(old_filename,
                  fn_format(new_filename,old_filename,"",new_ext,2+4),
                  backup_time,
                  MYF(MY_WME | MY_LINK_WARNING | MyFlags));
} /* maria_change_to_newfile */


/*
  Copy a block between two files

  Copies 'length' bytes, starting at offset 'start' in 'from', to the
  current position of 'to'.  A buffer of at most param->write_buffer_length
  bytes is allocated for the copy; if that allocation fails a stack buffer
  of IO_SIZE bytes is used instead.  'type' is only used in the error
  message.

  RETURN
    0   ok
    1   error (seek, read or write failed; an error has been printed)
*/

int maria_filecopy(HA_CHECK *param, File to,File from,my_off_t start,
                   my_off_t length, const char *type)
{
  uchar tmp_buff[IO_SIZE], *buff;
  ulong buff_length;
  DBUG_ENTER("maria_filecopy");

  buff_length=(ulong) MY_MIN(param->write_buffer_length,length);
  if (!(buff=my_malloc(PSI_INSTRUMENT_ME, buff_length,
                       MYF(param->malloc_flags))))
  {
    buff=tmp_buff; buff_length=IO_SIZE;
  }

  /* A failed seek would make us copy from the wrong offset */
  if (mysql_file_seek(from, start, MY_SEEK_SET,MYF(0)) == MY_FILEPOS_ERROR)
    goto err;
  while (length > buff_length)
  {
    if (mysql_file_read(from, buff, buff_length, MYF(MY_NABP)) ||
        mysql_file_write(to, buff, buff_length, param->myf_rw))
      goto err;
    length-= buff_length;
  }
  if (mysql_file_read(from, buff, (size_t) length,MYF(MY_NABP)) ||
      mysql_file_write(to, buff, (size_t) length,param->myf_rw))
    goto err;
  if (buff != tmp_buff)
    my_free(buff);
  DBUG_RETURN(0);
err:
  if (buff != tmp_buff)
    my_free(buff);
  _ma_check_print_error(param,"Can't copy %s to tempfile, error %d",
                        type,my_errno);
  DBUG_RETURN(1);
}


/* + Repair table or given index using sorting + + SYNOPSIS + maria_repair_by_sort() + param Repair parameters + info MARIA handler to repair + name Name of table (for warnings) + rep_quick set to <> 0 if we should not change data file + + RESULT + 0 ok + <>0 Error +*/ + +int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info, + const char * name, my_bool 
rep_quick) +{ + int got_error; + uint i, keys_to_repair; + ha_rows start_records; + my_off_t new_header_length, org_header_length, del; + File new_file; + MARIA_SORT_PARAM sort_param; + MARIA_SHARE *share= info->s; + HA_KEYSEG *keyseg; + double *rec_per_key_part; + char llbuff[22]; + MARIA_SORT_INFO sort_info; + ulonglong UNINIT_VAR(key_map); + myf sync_dir= ((share->now_transactional && !share->temporary) ? + MY_SYNC_DIR : 0); + my_bool scan_inited= 0, reenable_logging= 0; + MARIA_SHARE backup_share; + DBUG_ENTER("maria_repair_by_sort"); + + got_error= 1; + new_file= -1; + start_records= share->state.state.records; + if (!(param->testflag & T_SILENT)) + { + printf("- recovering (with sort) Aria-table '%s'\n",name); + printf("Data records: %s\n", llstr(start_records,llbuff)); + } + + if (initialize_variables_for_repair(param, &sort_info, &sort_param, info, + rep_quick, &backup_share)) + goto err; + + if ((reenable_logging= share->now_transactional)) + _ma_tmp_disable_logging_for_table(info, 0); + + org_header_length= share->pack.header_length; + new_header_length= (param->testflag & T_UNPACK) ? 
0 : org_header_length; + sort_param.filepos= new_header_length; + + if (!rep_quick) + { + /* Get real path for data file */ + if ((new_file=mysql_file_create(key_file_tmp, + fn_format(param->temp_filename, + share->data_file_name.str, "", + DATA_TMP_EXT, 2+4), + 0,param->tmpfile_createflag, + MYF(0))) < 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (new_header_length && + maria_filecopy(param, new_file, info->dfile.file, 0L, + new_header_length, "datafile-header")) + goto err; + + share->state.dellink= HA_OFFSET_ERROR; + info->rec_cache.file= new_file; /* For sort_delete_record */ + if (share->data_file_type == BLOCK_RECORD || + (param->testflag & T_UNPACK)) + { + if (create_new_data_handle(&sort_param, new_file)) + goto err; + sort_info.new_info->rec_cache.file= new_file; + } + } + + if (!(sort_info.key_block= + alloc_key_blocks(param, + (uint) param->sort_key_blocks, + share->base.max_key_block_length))) + goto err; + sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks; + + if (share->data_file_type != BLOCK_RECORD) + { + /* We need a read buffer to read rows in big blocks */ + if (init_io_cache(¶m->read_cache, info->dfile.file, + (uint) param->read_buffer_length, + READ_CACHE, org_header_length, 1, MYF(MY_WME))) + goto err; + } + if (sort_info.new_info->s->data_file_type != BLOCK_RECORD) + { + /* When writing to not block records, we need a write buffer */ + if (!rep_quick) + { + if (init_io_cache(&sort_info.new_info->rec_cache, new_file, + (uint) param->write_buffer_length, + WRITE_CACHE, new_header_length, 1, + MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw)) + goto err; + sort_info.new_info->opt_flag|= WRITE_CACHE_USED; + } + } + + if (!(sort_param.record= + (uchar*) my_malloc(PSI_INSTRUMENT_ME, + (size_t) share->base.default_rec_buff_size, + MYF(param->malloc_flags))) || + _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size, + share->base.default_rec_buff_size, 
+ MYF(param->malloc_flags))) + { + _ma_check_print_error(param, "Not enough memory for extra record"); + goto err; + } + + /* Optionally drop indexes and optionally modify the key_map */ + maria_drop_all_indexes(param, info, FALSE); + key_map= share->state.key_map; + if (param->testflag & T_CREATE_MISSING_KEYS) + { + /* Invert the copied key_map to recreate all disabled indexes. */ + key_map= ~key_map; + } + + param->read_cache.end_of_file= sort_info.filelength; + sort_param.wordlist=NULL; + init_alloc_root(PSI_INSTRUMENT_ME, &sort_param.wordroot, + FTPARSER_MEMROOT_ALLOC_SIZE, 0, + MYF(param->malloc_flags)); + + sort_param.key_cmp=sort_key_cmp; + sort_param.lock_in_memory=maria_lock_memory; + sort_param.tmpdir=param->tmpdir; + sort_param.master =1; + + del=share->state.state.del; + + /* Calculate number of keys to repair */ + keys_to_repair= 0; + for (sort_param.key=0 ; sort_param.key < share->base.keys ; + sort_param.key++) + { + if (maria_is_key_active(key_map, sort_param.key)) + keys_to_repair++; + } + /* For each key we scan and merge sort the keys */ + param->max_stage= keys_to_repair*2; + + rec_per_key_part= param->new_rec_per_key_part; + for (sort_param.key=0 ; sort_param.key < share->base.keys ; + rec_per_key_part+=sort_param.keyinfo->keysegs, sort_param.key++) + { + sort_param.keyinfo=share->keyinfo+sort_param.key; + /* + Skip this index if it is marked disabled in the copied + (and possibly inverted) key_map. + */ + if (! 
maria_is_key_active(key_map, sort_param.key)) + { + /* Remember old statistics for key */ + memcpy((char*) rec_per_key_part, + (char*) (share->state.rec_per_key_part + + (uint) (rec_per_key_part - param->new_rec_per_key_part)), + sort_param.keyinfo->keysegs*sizeof(*rec_per_key_part)); + DBUG_PRINT("repair", ("skipping seemingly disabled index #: %u", + sort_param.key)); + continue; + } + + if ((!(param->testflag & T_SILENT))) + printf ("- Fixing index %d\n",sort_param.key+1); + + sort_param.read_cache=param->read_cache; + sort_param.seg=sort_param.keyinfo->seg; + sort_param.max_pos= sort_param.pos= org_header_length; + keyseg=sort_param.seg; + bzero((char*) sort_param.unique,sizeof(sort_param.unique)); + sort_param.key_length=share->rec_reflength; + for (i=0 ; keyseg[i].type != HA_KEYTYPE_END; i++) + { + sort_param.key_length+=keyseg[i].length; + if (keyseg[i].flag & HA_SPACE_PACK) + sort_param.key_length+=get_pack_length(keyseg[i].length); + if (keyseg[i].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) + sort_param.key_length+= 2 + MY_TEST(keyseg[i].length >= 127); + if (keyseg[i].flag & HA_NULL_PART) + sort_param.key_length++; + } + share->state.state.records=share->state.state.del=share->state.split=0; + share->state.state.empty=0; + + if (sort_param.keyinfo->flag & HA_FULLTEXT) + { + uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT* + sort_param.keyinfo->seg->charset->mbmaxlen; + sort_param.key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN; + /* + fulltext indexes may have much more entries than the + number of rows in the table. We estimate the number here. + + Note, built-in parser is always nr. 
0 - see ftparser_call_initializer() + */ + if (sort_param.keyinfo->ftkey_nr == 0) + { + /* + for built-in parser the number of generated index entries + cannot be larger than the size of the data file divided + by the minimal word's length + */ + sort_info.max_records= + (ha_rows) (sort_info.filelength/ft_min_word_len+1); + } + else + { + /* + for external plugin parser we cannot tell anything at all :( + so, we'll use all the sort memory and start from ~10 buffpeks. + (see _ma_create_index_by_sort) + */ + sort_info.max_records= + 10*param->sort_buffer_length/sort_param.key_length; + } + + sort_param.key_read= sort_maria_ft_key_read; + sort_param.key_write= sort_maria_ft_key_write; + } + else + { + sort_param.key_read= sort_key_read; + sort_param.key_write= sort_key_write; + } + + if (sort_info.new_info->s->data_file_type == BLOCK_RECORD) + { + scan_inited= 1; + if (maria_scan_init(sort_info.info)) + goto err; + } + if (_ma_create_index_by_sort(&sort_param, + (my_bool) (!(param->testflag & T_VERBOSE)), + (size_t) param->sort_buffer_length)) + { + if ((param->testflag & T_CREATE_UNIQUE_BY_SORT) && sort_param.sort_info->dupp) + share->state.dupp_key= sort_param.key; + else + param->retry_repair= 1; + _ma_check_print_error(param, "Create index by sort failed"); + goto err; + } + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash_create_index_by_sort", + { + DBUG_PRINT("maria_crash_create_index_by_sort", ("now")); + DBUG_SUICIDE(); + }); + if (scan_inited) + { + scan_inited= 0; + maria_scan_end(sort_info.info); + } + + /* No need to calculate checksum again. 
*/ + sort_param.calc_checksum= 0; + free_root(&sort_param.wordroot, MYF(0)); + + /* Set for next loop */ + sort_info.max_records= (ha_rows) sort_info.new_info->s->state.state.records; + param->stage++; /* Next stage */ + param->progress= 0; + + if (param->testflag & T_STATISTICS) + maria_update_key_parts(sort_param.keyinfo, rec_per_key_part, + sort_param.unique, + (param->stats_method == + MI_STATS_METHOD_IGNORE_NULLS ? + sort_param.notnull : NULL), + (ulonglong) share->state.state.records); + maria_set_key_active(share->state.key_map, sort_param.key); + DBUG_PRINT("repair", ("set enabled index #: %u", sort_param.key)); + + if (_ma_flush_table_files_before_swap(param, info)) + goto err; + + if (sort_param.fix_datafile) + { + param->read_cache.end_of_file=sort_param.filepos; + if (maria_write_data_suffix(&sort_info,1) || + end_io_cache(&sort_info.new_info->rec_cache)) + { + _ma_check_print_error(param, "Got error when flushing row cache"); + goto err; + } + sort_info.new_info->opt_flag&= ~WRITE_CACHE_USED; + + if (param->testflag & T_SAFE_REPAIR) + { + /* Don't repair if we loosed more than one row */ + if (sort_info.new_info->s->state.state.records+1 < start_records) + { + _ma_check_print_error(param, + "Rows lost (Found %lu of %lu); Aborting " + "because safe repair was requested", + (ulong) sort_info.new_info->s-> + state.state.records, + (ulong) start_records); + share->state.state.records=start_records; + goto err; + } + } + + sort_info.new_info->s->state.state.data_file_length= sort_param.filepos; + if (sort_info.new_info != sort_info.info) + { + MARIA_STATE_INFO save_state= sort_info.new_info->s->state; + if (maria_close(sort_info.new_info)) + { + _ma_check_print_error(param, "Got error %d on close", my_errno); + goto err; + } + copy_data_file_state(&share->state, &save_state); + new_file= -1; + sort_info.new_info= info; + info->rec_cache.file= info->dfile.file; + } + + share->state.version=(ulong) time((time_t*) 0); /* Force reopen */ + + /* Replace the 
actual file with the temporary file */ + if (new_file >= 0) + { + mysql_file_close(new_file, MYF(MY_WME)); + new_file= -1; + } + change_data_file_descriptor(info, -1); + if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT, + DATA_TMP_EXT, param->backup_time, + (param->testflag & T_BACKUP_DATA ? + MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) | + sync_dir) || + _ma_open_datafile(info, share)) + { + _ma_check_print_error(param, "Couldn't change to new data file"); + goto err; + } + if (param->testflag & T_UNPACK) + restore_data_file_type(share); + + org_header_length= share->pack.header_length; + sort_info.org_data_file_type= share->data_file_type; + sort_info.filelength= share->state.state.data_file_length; + sort_param.fix_datafile=0; + + /* Offsets are now in proportion to the new file length */ + param->max_progress= sort_info.filelength; + + } + else + share->state.state.data_file_length=sort_param.max_pos; + + param->read_cache.file= info->dfile.file; /* re-init read cache */ + if (share->data_file_type != BLOCK_RECORD) + reinit_io_cache(¶m->read_cache, READ_CACHE, + share->pack.header_length, 1, 1); + } + + if (param->testflag & T_WRITE_LOOP) + { + fputs(" \r",stdout); + fflush(stdout); + } + + if (rep_quick && del+sort_info.dupp != share->state.state.del) + { + _ma_check_print_error(param,"Couldn't fix table with quick recovery: " + "Found wrong number of deleted records"); + _ma_check_print_error(param,"Run recovery again without -q"); + got_error=1; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + goto err; + } + + if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS)) + { + my_off_t skr= share->state.state.data_file_length + + ((sort_info.org_data_file_type == COMPRESSED_RECORD) ? 
+ MEMMAP_EXTRA_MARGIN : 0); +#ifdef USE_RELOC + if (sort_info.org_data_file_type == STATIC_RECORD && + skr < share->base.reloc*share->base.min_pack_length) + skr=share->base.reloc*share->base.min_pack_length; +#endif + if (skr != sort_info.filelength) + if (mysql_file_chsize(info->dfile.file, skr, 0, MYF(0))) + _ma_check_print_warning(param, + "Can't change size of datafile, error: %d", + my_errno); + } + + if (param->testflag & T_CALC_CHECKSUM) + share->state.state.checksum=param->glob_crc; + + if (mysql_file_chsize(share->kfile.file, + share->state.state.key_file_length, 0, MYF(0))) + _ma_check_print_warning(param, + "Can't change size of indexfile, error: %d", + my_errno); + + if (!(param->testflag & T_SILENT)) + { + if (start_records != share->state.state.records) + printf("Data records: %s\n", llstr(share->state.state.records,llbuff)); + } + if (sort_info.dupp) + _ma_check_print_warning(param, + "%s records have been removed", + llstr(sort_info.dupp,llbuff)); + got_error=0; + /* If invoked by external program that uses thr_lock */ + if (&share->state.state != info->state) + *info->state= *info->state_start= share->state.state; + +err: + if (scan_inited) + maria_scan_end(sort_info.info); + _ma_reset_state(info); + + if (sort_info.new_info) + { + end_io_cache(&sort_info.new_info->rec_cache); + sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + } + end_io_cache(¶m->read_cache); + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + if (got_error) + { + if (! 
param->error_printed) + _ma_check_print_error(param,"%d when fixing table",my_errno); + (void)_ma_flush_table_files_before_swap(param, info); + if (sort_info.new_info && sort_info.new_info != sort_info.info) + { + unuse_data_file_descriptor(sort_info.new_info); + maria_close(sort_info.new_info); + } + if (new_file >= 0) + { + mysql_file_close(new_file, MYF(0)); + mysql_file_delete(key_file_tmp, param->temp_filename, MYF(MY_WME)); + } + maria_mark_crashed_on_repair(info); + } + else + { + if (key_map == share->state.key_map) + share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS; + /* + Now that we have flushed and forced everything, we can bump + create_rename_lsn: + */ + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash_repair", + { + DBUG_PRINT("maria_crash_repair", ("now")); + DBUG_SUICIDE(); + }); + } + share->state.changed|= STATE_NOT_SORTED_PAGES; + if (!rep_quick) + share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + /* If caller had disabled logging it's not up to us to re-enable it */ + if (reenable_logging) + _ma_reenable_logging_for_table(info, FALSE); + restore_table_state_after_repair(info, &backup_share); + + my_free(sort_param.rec_buff); + my_free(sort_param.record); + my_free(sort_info.key_block); + my_free(sort_info.ft_buf); + my_free(sort_info.buff); + DBUG_RETURN(got_error); +} + + +/* + Threaded repair of table using sorting + + SYNOPSIS + maria_repair_parallel() + param Repair parameters + info MARIA handler to repair + name Name of table (for warnings) + rep_quick set to <> 0 if we should not change data file + + DESCRIPTION + Same as maria_repair_by_sort but do it multithreaded + Each key is handled by a separate thread. + TODO: make a number of threads a parameter + + In parallel repair we use one thread per index. 
There are two modes: + + Quick + + Only the indexes are rebuilt. All threads share a read buffer. + Every thread that needs fresh data in the buffer enters the shared + cache lock. The last thread joining the lock reads the buffer from + the data file and wakes all other threads. + + Non-quick + + The data file is rebuilt and all indexes are rebuilt to point to + the new record positions. One thread is the master thread. It + reads from the old data file and writes to the new data file. It + also creates one of the indexes. The other threads read from a + buffer which is filled by the master. If they need fresh data, + they enter the shared cache lock. If the masters write buffer is + full, it flushes it to the new data file and enters the shared + cache lock too. When all threads joined in the lock, the master + copies its write buffer to the read buffer for the other threads + and wakes them. + + RESULT + 0 ok + <>0 Error +*/ + +int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info, + const char * name, my_bool rep_quick) +{ + int got_error; + uint i,key, istep; + ha_rows start_records; + my_off_t new_header_length,del; + File new_file; + MARIA_SORT_PARAM *sort_param=0, tmp_sort_param; + MARIA_SHARE *share= info->s; + double *rec_per_key_part; + HA_KEYSEG *keyseg; + char llbuff[22]; + IO_CACHE new_data_cache; /* For non-quick repair. */ + IO_CACHE_SHARE io_share; + MARIA_SORT_INFO sort_info; + MARIA_SHARE backup_share; + ulonglong UNINIT_VAR(key_map); + pthread_attr_t thr_attr; + myf sync_dir= ((share->now_transactional && !share->temporary) ? 
+ MY_SYNC_DIR : 0); + my_bool reenable_logging= 0; + DBUG_ENTER("maria_repair_parallel"); + + got_error= 1; + new_file= -1; + start_records= share->state.state.records; + if (!(param->testflag & T_SILENT)) + { + printf("- parallel recovering (with sort) Aria-table '%s'\n",name); + printf("Data records: %s\n", llstr(start_records, llbuff)); + } + + bzero(&new_data_cache, sizeof(new_data_cache)); + if (initialize_variables_for_repair(param, &sort_info, &tmp_sort_param, info, + rep_quick, &backup_share)) + goto err; + + if ((reenable_logging= share->now_transactional)) + _ma_tmp_disable_logging_for_table(info, 0); + + new_header_length= ((param->testflag & T_UNPACK) ? 0 : + share->pack.header_length); + + /* + Quick repair (not touching data file, rebuilding indexes): + { + Read cache is (HA_CHECK *param)->read_cache using info->dfile.file. + } + + Non-quick repair (rebuilding data file and indexes): + { + Master thread: + + Read cache is (HA_CHECK *param)->read_cache using info->dfile.file. + Write cache is (MARIA_INFO *info)->rec_cache using new_file. + + Slave threads: + + Read cache is new_data_cache synced to master rec_cache. + + The final assignment of the filedescriptor for rec_cache is done + after the cache creation. + + Don't check file size on new_data_cache, as the resulting file size + is not known yet. + + As rec_cache and new_data_cache are synced, write_buffer_length is + used for the read cache 'new_data_cache'. Both start at the same + position 'new_header_length'. + } + */ + DBUG_PRINT("info", ("is quick repair: %d", (int) rep_quick)); + if (!rep_quick) + my_b_clear(&new_data_cache); + + /* Initialize pthread structures before goto err. 
*/ + mysql_mutex_init(key_SORT_INFO_mutex, &sort_info.mutex, MY_MUTEX_INIT_FAST); + mysql_cond_init(key_SORT_INFO_cond, &sort_info.cond, 0); + + if (!(sort_info.key_block= + alloc_key_blocks(param, (uint) param->sort_key_blocks, + share->base.max_key_block_length))) + goto err; + + if (init_io_cache(¶m->read_cache, info->dfile.file, + (uint) param->read_buffer_length, + READ_CACHE, share->pack.header_length, 1, MYF(MY_WME))) + goto err; + + sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks; + info->opt_flag|=WRITE_CACHE_USED; + info->rec_cache.file= info->dfile.file; /* for sort_delete_record */ + + if (!rep_quick) + { + /* Get real path for data file */ + if ((new_file= mysql_file_create(key_file_tmp, + fn_format(param->temp_filename, + share->data_file_name.str, "", + DATA_TMP_EXT, + 2+4), + 0,param->tmpfile_createflag, + MYF(0))) < 0) + { + _ma_check_print_error(param,"Can't create new tempfile: '%s'", + param->temp_filename); + goto err; + } + if (new_header_length && + maria_filecopy(param, new_file, info->dfile.file,0L,new_header_length, + "datafile-header")) + goto err; + if (param->testflag & T_UNPACK) + restore_data_file_type(share); + share->state.dellink= HA_OFFSET_ERROR; + + if (init_io_cache(&new_data_cache, -1, + (uint) param->write_buffer_length, + READ_CACHE, new_header_length, 1, + MYF(MY_WME | MY_DONT_CHECK_FILESIZE))) + goto err; + + if (init_io_cache(&info->rec_cache, new_file, + (uint) param->write_buffer_length, + WRITE_CACHE, new_header_length, 1, + MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw)) + goto err; + + } + + /* Optionally drop indexes and optionally modify the key_map. */ + maria_drop_all_indexes(param, info, FALSE); + key_map= share->state.key_map; + if (param->testflag & T_CREATE_MISSING_KEYS) + { + /* Invert the copied key_map to recreate all disabled indexes. */ + key_map= ~key_map; + } + + param->read_cache.end_of_file= sort_info.filelength; + + /* + +1 below is required hack for parallel repair mode. 
+ The share->state.state.records value, that is compared later + to sort_info.max_records and cannot exceed it, is + increased in sort_key_write. In maria_repair_by_sort, sort_key_write + is called after sort_key_read, where the comparison is performed, + but in parallel mode master thread can call sort_key_write + before some other repair thread calls sort_key_read. + Furthermore I'm not even sure +1 would be enough. + May be sort_info.max_records shold be always set to max value in + parallel mode. + */ + sort_info.max_records++; + + del=share->state.state.del; + + if (!(sort_param=(MARIA_SORT_PARAM *) + my_malloc(PSI_INSTRUMENT_ME, (uint) share->base.keys * + (sizeof(MARIA_SORT_PARAM) + share->base.pack_reclength), + MYF(MY_ZEROFILL | param->malloc_flags)))) + { + _ma_check_print_error(param,"Not enough memory for key!"); + goto err; + } +#ifdef USING_SECOND_APPROACH + uint total_key_length=0; +#endif + rec_per_key_part= param->new_rec_per_key_part; + share->state.state.records=share->state.state.del=share->state.split=0; + share->state.state.empty=0; + + for (i=key=0, istep=1 ; key < share->base.keys ; + rec_per_key_part+=sort_param[i].keyinfo->keysegs, i+=istep, key++) + { + sort_param[i].key=key; + sort_param[i].keyinfo=share->keyinfo+key; + sort_param[i].seg=sort_param[i].keyinfo->seg; + /* + Skip this index if it is marked disabled in the copied + (and possibly inverted) key_map. + */ + if (! 
maria_is_key_active(key_map, key)) + { + /* Remember old statistics for key */ + memcpy((char*) rec_per_key_part, + (char*) (share->state.rec_per_key_part+ + (uint) (rec_per_key_part - param->new_rec_per_key_part)), + sort_param[i].keyinfo->keysegs*sizeof(*rec_per_key_part)); + istep=0; + continue; + } + istep=1; + if ((!(param->testflag & T_SILENT))) + printf ("- Fixing index %d\n",key+1); + if (sort_param[i].keyinfo->flag & HA_FULLTEXT) + { + sort_param[i].key_read=sort_maria_ft_key_read; + sort_param[i].key_write=sort_maria_ft_key_write; + } + else + { + sort_param[i].key_read=sort_key_read; + sort_param[i].key_write=sort_key_write; + } + sort_param[i].key_cmp=sort_key_cmp; + sort_param[i].lock_in_memory=maria_lock_memory; + sort_param[i].tmpdir=param->tmpdir; + sort_param[i].sort_info=&sort_info; + sort_param[i].master=0; + sort_param[i].fix_datafile=0; + sort_param[i].calc_checksum= 0; + + sort_param[i].filepos=new_header_length; + sort_param[i].max_pos=sort_param[i].pos=share->pack.header_length; + + sort_param[i].record= (((uchar *)(sort_param+share->base.keys))+ + (share->base.pack_reclength * i)); + if (_ma_alloc_buffer(&sort_param[i].rec_buff, &sort_param[i].rec_buff_size, + share->base.default_rec_buff_size, + MYF(param->malloc_flags))) + { + _ma_check_print_error(param,"Not enough memory!"); + goto err; + } + sort_param[i].key_length=share->rec_reflength; + for (keyseg=sort_param[i].seg; keyseg->type != HA_KEYTYPE_END; + keyseg++) + { + sort_param[i].key_length+=keyseg->length; + if (keyseg->flag & HA_SPACE_PACK) + sort_param[i].key_length+=get_pack_length(keyseg->length); + if (keyseg->flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) + sort_param[i].key_length+= 2 + MY_TEST(keyseg->length >= 127); + if (keyseg->flag & HA_NULL_PART) + sort_param[i].key_length++; + } +#ifdef USING_SECOND_APPROACH + total_key_length+=sort_param[i].key_length; +#endif + + if (sort_param[i].keyinfo->flag & HA_FULLTEXT) + { + uint ft_max_word_len_for_sort= + 
(FT_MAX_WORD_LEN_FOR_SORT * + sort_param[i].keyinfo->seg->charset->mbmaxlen); + sort_param[i].key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN; + init_alloc_root(PSI_INSTRUMENT_ME, &sort_param[i].wordroot, + FTPARSER_MEMROOT_ALLOC_SIZE, 0, + MYF(param->malloc_flags)); + } + } + sort_info.total_keys=i; + sort_param[0].master= 1; + sort_param[0].fix_datafile= ! rep_quick; + sort_param[0].calc_checksum= MY_TEST(param->testflag & T_CALC_CHECKSUM); + + if (!maria_ftparser_alloc_param(info)) + goto err; + + sort_info.got_error=0; + mysql_mutex_lock(&sort_info.mutex); + + /* + Initialize the I/O cache share for use with the read caches and, in + case of non-quick repair, the write cache. When all threads join on + the cache lock, the writer copies the write cache contents to the + read caches. + */ + if (i > 1) + { + if (rep_quick) + init_io_cache_share(¶m->read_cache, &io_share, NULL, i); + else + init_io_cache_share(&new_data_cache, &io_share, &info->rec_cache, i); + } + else + io_share.total_threads= 0; /* share not used */ + + (void) pthread_attr_init(&thr_attr); + (void) pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED); + (void) my_setstacksize(&thr_attr, (size_t)my_thread_stack_size); + + /* + We cannot mark future memory allocations as thread specific when + doing parallel repair as we don't have a THD for each thread. Sharing the + same THD this would requre mutex locks around mallocs/reallocs to ensure + that two threads does not use the same THD at once. + */ + param->malloc_flags= 0; + for (i=0 ; i < sort_info.total_keys ; i++) + { + /* + Copy the properly initialized IO_CACHE structure so that every + thread has its own copy. In quick mode param->read_cache is shared + for use by all threads. In non-quick mode all threads but the + first copy the shared new_data_cache, which is synchronized to the + write cache of the first thread. The first thread copies + param->read_cache, which is not shared. 
+ */ + sort_param[i].read_cache= ((rep_quick || !i) ? param->read_cache : + new_data_cache); + DBUG_PRINT("io_cache_share", ("thread: %u read_cache: %p", + i, &sort_param[i].read_cache)); + + /* + two approaches: the same amount of memory for each thread + or the memory for the same number of keys for each thread... + In the second one all the threads will fill their sort_buffers + (and call write_keys) at the same time, putting more stress on i/o. + */ + sort_param[i].sortbuff_size= +#ifndef USING_SECOND_APPROACH + param->sort_buffer_length/sort_info.total_keys; +#else + param->sort_buffer_length*sort_param[i].key_length/total_key_length; +#endif + set_if_bigger(sort_param[i].sortbuff_size, MARIA_MIN_SORT_MEMORY); + + if (mysql_thread_create(key_thread_find_all_keys, + &sort_param[i].thr, &thr_attr, + _ma_thr_find_all_keys, (void *) (sort_param+i))) + { + _ma_check_print_error(param,"Cannot start a repair thread"); + /* Cleanup: Detach from the share. Avoid others to be blocked. */ + if (io_share.total_threads) + remove_io_thread(&sort_param[i].read_cache); + DBUG_PRINT("error", ("Cannot start a repair thread")); + sort_info.got_error=1; + } + else + sort_info.threads_running++; + } + (void) pthread_attr_destroy(&thr_attr); + + /* waiting for all threads to finish */ + while (sort_info.threads_running) + mysql_cond_wait(&sort_info.cond, &sort_info.mutex); + mysql_mutex_unlock(&sort_info.mutex); + + if ((got_error= _ma_thr_write_keys(sort_param))) + { + param->retry_repair=1; + goto err; + } + got_error=1; /* Assume the following may go wrong */ + + if (_ma_flush_table_files_before_swap(param, info)) + goto err; + + if (sort_param[0].fix_datafile) + { + /* + Append some nulls to the end of a memory mapped file. Destroy the + write cache. The master thread did already detach from the share + by remove_io_thread() in sort.c:thr_find_all_keys(). 
+ */ + if (maria_write_data_suffix(&sort_info,1) || + end_io_cache(&info->rec_cache)) + goto err; + if (param->testflag & T_SAFE_REPAIR) + { + /* Don't repair if we loosed more than one row */ + if (sort_info.new_info->s->state.state.records+1 < start_records) + { + _ma_check_print_error(param, + "Rows lost (Found %lu of %lu); Aborting " + "because safe repair was requested", + (ulong) share->state.state.records, + (ulong) start_records); + share->state.state.records=start_records; + goto err; + } + } + share->state.state.data_file_length= sort_param->filepos; + /* Only whole records */ + share->state.version= (ulong) time((time_t*) 0); + /* + Exchange the data file descriptor of the table, so that we use the + new file from now on. + */ + mysql_file_close(info->dfile.file, MYF(0)); + info->dfile.file= new_file; + share->pack.header_length=(ulong) new_header_length; + } + else + share->state.state.data_file_length=sort_param->max_pos; + + if (rep_quick && del+sort_info.dupp != share->state.state.del) + { + _ma_check_print_error(param,"Couldn't fix table with quick recovery: " + "Found wrong number of deleted records"); + _ma_check_print_error(param,"Run recovery again without -q"); + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + goto err; + } + + if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS)) + { + my_off_t skr= share->state.state.data_file_length + + ((sort_info.org_data_file_type == COMPRESSED_RECORD) ? 
+ MEMMAP_EXTRA_MARGIN : 0); +#ifdef USE_RELOC + if (sort_info.org_data_file_type == STATIC_RECORD && + skr < share->base.reloc*share->base.min_pack_length) + skr=share->base.reloc*share->base.min_pack_length; +#endif + if (skr != sort_info.filelength) + if (mysql_file_chsize(info->dfile.file, skr, 0, MYF(0))) + _ma_check_print_warning(param, + "Can't change size of datafile, error: %d", + my_errno); + } + if (param->testflag & T_CALC_CHECKSUM) + share->state.state.checksum=param->glob_crc; + + if (mysql_file_chsize(share->kfile.file, + share->state.state.key_file_length, 0, MYF(0))) + _ma_check_print_warning(param, + "Can't change size of indexfile, error: %d", + my_errno); + + if (!(param->testflag & T_SILENT)) + { + if (start_records != share->state.state.records) + printf("Data records: %s\n", llstr(share->state.state.records,llbuff)); + } + if (sort_info.dupp) + _ma_check_print_warning(param, + "%s records have been removed", + llstr(sort_info.dupp,llbuff)); + got_error=0; + /* If invoked by external program that uses thr_lock */ + if (&share->state.state != info->state) + *info->state= *info->state_start= share->state.state; + +err: + _ma_reset_state(info); + + /* + Destroy the write cache. The master thread did already detach from + the share by remove_io_thread() or it was not yet started (if the + error happend before creating the thread). + */ + if (sort_info.new_info) + { + end_io_cache(&sort_info.new_info->rec_cache); + sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + } + end_io_cache(¶m->read_cache); + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + /* + Destroy the new data cache in case of non-quick repair. All slave + threads did either detach from the share by remove_io_thread() + already or they were not yet started (if the error happend before + creating the threads). 
+ */ + if (!rep_quick && my_b_inited(&new_data_cache)) + end_io_cache(&new_data_cache); + if (!got_error) + { + /* Replace the actual file with the temporary file */ + if (new_file >= 0) + { + mysql_file_close(new_file,MYF(0)); + info->dfile.file= new_file= -1; + if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT, + DATA_TMP_EXT, param->backup_time, + MYF((param->testflag & T_BACKUP_DATA ? + MY_REDEL_MAKE_BACKUP : 0) | + sync_dir)) || + _ma_open_datafile(info,share)) + got_error=1; + } + } + if (got_error) + { + if (! param->error_printed) + _ma_check_print_error(param,"%d when fixing table",my_errno); + (void)_ma_flush_table_files_before_swap(param, info); + if (new_file >= 0) + { + mysql_file_close(new_file,MYF(0)); + mysql_file_delete(key_file_tmp, param->temp_filename, MYF(MY_WME)); + if (info->dfile.file == new_file) + info->dfile.file= -1; + } + maria_mark_crashed_on_repair(info); + } + else if (key_map == share->state.key_map) + share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS; + share->state.changed|= STATE_NOT_SORTED_PAGES; + if (!rep_quick) + share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + mysql_cond_destroy (&sort_info.cond); + mysql_mutex_destroy(&sort_info.mutex); + + /* If caller had disabled logging it's not up to us to re-enable it */ + if (reenable_logging) + _ma_reenable_logging_for_table(info, FALSE); + restore_table_state_after_repair(info, &backup_share); + + my_free(sort_info.ft_buf); + my_free(sort_info.key_block); + my_free(sort_param); + my_free(sort_info.buff); + if (!got_error && (param->testflag & T_UNPACK)) + restore_data_file_type(share); + DBUG_RETURN(got_error); +} + + /* Read next record and return next key */ + +static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key) +{ + int error; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + MARIA_HA *info= sort_info->info; + MARIA_KEY int_key; + DBUG_ENTER("sort_key_read"); + + if 
((error=sort_get_next_record(sort_param))) + DBUG_RETURN(error); + if (info->s->state.state.records == sort_info->max_records) + { + _ma_check_print_error(sort_info->param, + "Key %d - Found too many records; Can't continue", + sort_param->key+1); + DBUG_RETURN(1); + } + if (_ma_sort_write_record(sort_param)) + DBUG_RETURN(1); + + (*info->s->keyinfo[sort_param->key].make_key)(info, &int_key, + sort_param->key, key, + sort_param->record, + sort_param->current_filepos, + 0); + sort_param->real_key_length= int_key.data_length + int_key.ref_length; +#ifdef HAVE_valgrind + bzero(key+sort_param->real_key_length, + (sort_param->key_length-sort_param->real_key_length)); +#endif + DBUG_RETURN(0); +} /* sort_key_read */ + + +static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key) +{ + int error; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + MARIA_HA *info=sort_info->info; + FT_WORD *wptr=0; + MARIA_KEY int_key; + DBUG_ENTER("sort_maria_ft_key_read"); + + if (!sort_param->wordlist) + { + for (;;) + { + free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE)); + if ((error=sort_get_next_record(sort_param))) + DBUG_RETURN(error); + if ((error= _ma_sort_write_record(sort_param))) + DBUG_RETURN(error); + if (!(wptr= _ma_ft_parserecord(info,sort_param->key,sort_param->record, + &sort_param->wordroot))) + + DBUG_RETURN(1); + if (wptr->pos) + break; + } + sort_param->wordptr=sort_param->wordlist=wptr; + } + else + { + error=0; + wptr=(FT_WORD*)(sort_param->wordptr); + } + + _ma_ft_make_key(info, &int_key, sort_param->key, key, wptr++, + sort_param->current_filepos); + sort_param->real_key_length= int_key.data_length + int_key.ref_length; + +#ifdef HAVE_valgrind + if (sort_param->key_length > sort_param->real_key_length) + bzero(key+sort_param->real_key_length, + (sort_param->key_length-sort_param->real_key_length)); +#endif + if (!wptr->pos) + { + free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE)); + sort_param->wordlist=0; + } + else + 
sort_param->wordptr=(void*)wptr; + + DBUG_RETURN(error); +} /* sort_maria_ft_key_read */ + + +/* + Read next record from file using parameters in sort_info. + + SYNOPSIS + sort_get_next_record() + sort_param Information about and for the sort process + + NOTES + Dynamic Records With Non-Quick Parallel Repair + + For non-quick parallel repair we use a synchronized read/write + cache. This means that one thread is the master who fixes the data + file by reading each record from the old data file and writing it + to the new data file. By doing this the records in the new data + file are written contiguously. Whenever the write buffer is full, + it is copied to the read buffer. The slaves read from the read + buffer, which is not associated with a file. Thus read_cache.file + is -1. When using _mi_read_cache(), the slaves must always set + flag to READING_NEXT so that the function never tries to read from + file. This is safe because the records are contiguous. There is no + need to read outside the cache. This condition is evaluated in the + variable 'parallel_flag' for quick reference. read_cache.file must + be >= 0 in every other case. + + RETURN + -1 end of file + 0 ok + sort_param->current_filepos points to record position. 
+ sort_param->record contains record + sort_param->max_pos contains position to last byte read + > 0 error +*/ + +static int sort_get_next_record(MARIA_SORT_PARAM *sort_param) +{ + int searching; + int parallel_flag; + uint found_record,b_type,left_length; + my_off_t pos; + MARIA_BLOCK_INFO block_info; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; + MARIA_HA *info=sort_info->info; + MARIA_SHARE *share= info->s; + char llbuff[22],llbuff2[22]; + DBUG_ENTER("sort_get_next_record"); + + if (_ma_killed_ptr(param)) + DBUG_RETURN(1); + if (param->progress_counter++ >= WRITE_COUNT) + { + param->progress_counter= 0; + _ma_report_progress(param, param->progress, param->max_progress); + } + + switch (sort_info->org_data_file_type) { + case BLOCK_RECORD: + { + for (;;) + { + int flag; + /* + Assume table is transactional and it had LSN pages in the + cache. Repair has flushed them, left data pages stay in + cache, and disabled transactionality (so share's current page + type is PLAIN); page cache would assert if it finds a cached LSN page + while _ma_scan_block_record() requested a PLAIN page. So we use + UNKNOWN. + */ + enum pagecache_page_type save_page_type= share->page_type; + share->page_type= PAGECACHE_READ_UNKNOWN_PAGE; + if (info != sort_info->new_info) + { + /* Safe scanning */ + flag= _ma_safe_scan_block_record(sort_info, info, + sort_param->record); + } + else + { + /* + Scan on clean table. + It requires a reliable data_file_length so we set it. 
+ */ + share->state.state.data_file_length= sort_info->filelength; + info->cur_row.trid= 0; + flag= _ma_scan_block_record(info, sort_param->record, + info->cur_row.nextpos, 1); + set_if_bigger(param->max_found_trid, info->cur_row.trid); + if (info->cur_row.trid > param->max_trid) + { + _ma_check_print_not_visible_error(param, info->cur_row.trid); + flag= HA_ERR_ROW_NOT_VISIBLE; + } + } + param->progress= (ma_recordpos_to_page(info->cur_row.lastpos)* + share->block_size); + + share->page_type= save_page_type; + if (!flag) + { + if (sort_param->calc_checksum) + { + ha_checksum checksum; + checksum= (*share->calc_check_checksum)(info, sort_param->record); + if (share->calc_checksum && + info->cur_row.checksum != (checksum & 255)) + { + if (param->testflag & T_VERBOSE) + { + _ma_check_print_info(param, + "Found record with wrong checksum at %s", + record_pos_to_txt(info, + info->cur_row.lastpos, + llbuff)); + + } + continue; + } + info->cur_row.checksum= checksum; + param->glob_crc+= checksum; + } + sort_param->start_recpos= sort_param->current_filepos= + info->cur_row.lastpos; + DBUG_RETURN(0); + } + if (flag == HA_ERR_END_OF_FILE) + { + sort_param->max_pos= share->state.state.data_file_length; + DBUG_RETURN(-1); + } + /* Retry only if wrong record, not if disk error */ + if (flag != HA_ERR_WRONG_IN_RECORD && flag != HA_ERR_WRONG_CRC && + flag != HA_ERR_DECRYPTION_FAILED) + { + retry_if_quick(sort_param, flag); + DBUG_RETURN(flag); + } + } + break; /* Impossible */ + } + case STATIC_RECORD: + for (;;) + { + if (my_b_read(&sort_param->read_cache,sort_param->record, + share->base.pack_reclength)) + { + if (sort_param->read_cache.error) + param->out_flag |= O_DATA_LOST; + retry_if_quick(sort_param, my_errno); + DBUG_RETURN(-1); + } + sort_param->start_recpos=sort_param->pos; + param->progress= sort_param->pos; + if (!sort_param->fix_datafile) + { + sort_param->current_filepos= sort_param->pos; + if (sort_param->master) + share->state.split++; + } + 
sort_param->max_pos=(sort_param->pos+=share->base.pack_reclength); + if (*sort_param->record) + { + if (sort_param->calc_checksum) + param->glob_crc+= (info->cur_row.checksum= + _ma_static_checksum(info,sort_param->record)); + DBUG_RETURN(0); + } + if (!sort_param->fix_datafile && sort_param->master) + { + share->state.state.del++; + share->state.state.empty+=share->base.pack_reclength; + } + } + case DYNAMIC_RECORD: + { + uchar *UNINIT_VAR(to); + ha_checksum checksum= 0; + + pos=sort_param->pos; + param->progress= pos; + searching=(sort_param->fix_datafile && (param->testflag & T_EXTEND)); + parallel_flag= (sort_param->read_cache.file < 0) ? READING_NEXT : 0; + for (;;) + { + found_record=block_info.second_read= 0; + left_length=1; + if (searching) + { + pos=MY_ALIGN(pos,MARIA_DYN_ALIGN_SIZE); + param->testflag|=T_RETRY_WITHOUT_QUICK; + sort_param->start_recpos=pos; + } + do + { + if (pos > sort_param->max_pos) + sort_param->max_pos=pos; + if (pos & (MARIA_DYN_ALIGN_SIZE-1)) + { + if ((param->testflag & T_VERBOSE) || searching == 0) + _ma_check_print_info(param,"Wrong aligned block at %s", + llstr(pos,llbuff)); + if (searching) + goto try_next; + } + if (found_record && pos == param->search_after_block) + _ma_check_print_info(param,"Block: %s used by record at %s", + llstr(param->search_after_block,llbuff), + llstr(sort_param->start_recpos,llbuff2)); + if (_ma_read_cache(info, &sort_param->read_cache, + block_info.header, pos, + MARIA_BLOCK_INFO_HEADER_LENGTH, + (! found_record ? READING_NEXT : 0) | + parallel_flag | READING_HEADER)) + { + if (found_record) + { + _ma_check_print_info(param, + "Can't read whole record at %s (errno: %d)", + llstr(sort_param->start_recpos,llbuff),errno); + goto try_next; + } + DBUG_RETURN(-1); + } + if (searching && ! 
sort_param->fix_datafile) + { + param->error_printed++; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + my_errno= HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(1); /* Something wrong with data */ + } + b_type= _ma_get_block_info(info, &block_info,-1,pos); + if ((b_type & (BLOCK_ERROR | BLOCK_FATAL_ERROR)) || + ((b_type & BLOCK_FIRST) && + (block_info.rec_len < (uint) share->base.min_pack_length || + block_info.rec_len > (uint) share->base.max_pack_length))) + { + uint i; + if (param->testflag & T_VERBOSE || searching == 0) + _ma_check_print_info(param, + "Wrong bytesec: %3d-%3d-%3d at %10s; Skipped", + block_info.header[0],block_info.header[1], + block_info.header[2],llstr(pos,llbuff)); + if (found_record) + goto try_next; + block_info.second_read=0; + searching=1; + /* Search after block in read header string */ + for (i=MARIA_DYN_ALIGN_SIZE ; + i < MARIA_BLOCK_INFO_HEADER_LENGTH ; + i+= MARIA_DYN_ALIGN_SIZE) + if (block_info.header[i] >= 1 && + block_info.header[i] <= MARIA_MAX_DYN_HEADER_BYTE) + break; + pos+=(ulong) i; + sort_param->start_recpos=pos; + continue; + } + if (b_type & BLOCK_DELETED) + { + my_bool error=0; + if (block_info.block_len+ (uint) (block_info.filepos-pos) < + share->base.min_block_length) + { + if (!searching) + _ma_check_print_info(param, + "Deleted block with impossible length %lu " + "at %s", + block_info.block_len,llstr(pos,llbuff)); + error=1; + } + else + { + if ((block_info.next_filepos != HA_OFFSET_ERROR && + block_info.next_filepos >= + share->state.state.data_file_length) || + (block_info.prev_filepos != HA_OFFSET_ERROR && + block_info.prev_filepos >= + share->state.state.data_file_length)) + { + if (!searching) + _ma_check_print_info(param, + "Delete link points outside datafile at " + "%s", + llstr(pos,llbuff)); + error=1; + } + } + if (error) + { + if (found_record) + goto try_next; + searching=1; + pos+= MARIA_DYN_ALIGN_SIZE; + sort_param->start_recpos=pos; + block_info.second_read=0; + continue; + } + } + else + 
{ + if (block_info.block_len+ (uint) (block_info.filepos-pos) < + share->base.min_block_length || + block_info.block_len > (uint) share->base.max_pack_length+ + MARIA_SPLIT_LENGTH) + { + if (!searching) + _ma_check_print_info(param, + "Found block with impossible length %lu " + "at %s; Skipped", + block_info.block_len+ + (uint) (block_info.filepos-pos), + llstr(pos,llbuff)); + if (found_record) + goto try_next; + searching=1; + pos+= MARIA_DYN_ALIGN_SIZE; + sort_param->start_recpos=pos; + block_info.second_read=0; + continue; + } + } + if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + { + if (!sort_param->fix_datafile && sort_param->master && + (b_type & BLOCK_DELETED)) + { + share->state.state.empty+=block_info.block_len; + share->state.state.del++; + share->state.split++; + } + if (found_record) + goto try_next; + if (searching) + { + pos+=MARIA_DYN_ALIGN_SIZE; + sort_param->start_recpos=pos; + } + else + pos=block_info.filepos+block_info.block_len; + block_info.second_read=0; + continue; + } + + if (!sort_param->fix_datafile && sort_param->master) + share->state.split++; + if (! 
found_record++) + { + sort_param->find_length=left_length=block_info.rec_len; + sort_param->start_recpos=pos; + if (!sort_param->fix_datafile) + sort_param->current_filepos= sort_param->start_recpos; + if (sort_param->fix_datafile && (param->testflag & T_EXTEND)) + sort_param->pos=block_info.filepos+1; + else + sort_param->pos=block_info.filepos+block_info.block_len; + if (share->base.blobs) + { + if (_ma_alloc_buffer(&sort_param->rec_buff, + &sort_param->rec_buff_size, + block_info.rec_len + + share->base.extra_rec_buff_size, + MYF(param->malloc_flags))) + + { + if (param->max_record_length >= block_info.rec_len) + { + _ma_check_print_error(param,"Not enough memory for blob at %s " + "(need %lu)", + llstr(sort_param->start_recpos,llbuff), + (ulong) block_info.rec_len); + DBUG_RETURN(1); + } + else + { + _ma_check_print_info(param,"Not enough memory for blob at %s " + "(need %lu); Row skipped", + llstr(sort_param->start_recpos,llbuff), + (ulong) block_info.rec_len); + goto try_next; + } + } + } + to= sort_param->rec_buff; + } + if (left_length < block_info.data_len || ! block_info.data_len) + { + _ma_check_print_info(param, + "Found block with too small length at %s; " + "Skipped", + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + if (block_info.filepos + block_info.data_len > + sort_param->read_cache.end_of_file) + { + _ma_check_print_info(param, + "Found block that points outside data file " + "at %s", + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + /* + Copy information that is already read. Avoid accessing data + below the cache start. This could happen if the header + streched over the end of the previous buffer contents. 
+ */ + { + uint header_len= (uint) (block_info.filepos - pos); + uint prefetch_len= (MARIA_BLOCK_INFO_HEADER_LENGTH - header_len); + + if (prefetch_len > block_info.data_len) + prefetch_len= block_info.data_len; + if (prefetch_len) + { + memcpy(to, block_info.header + header_len, prefetch_len); + block_info.filepos+= prefetch_len; + block_info.data_len-= prefetch_len; + left_length-= prefetch_len; + to+= prefetch_len; + } + } + if (block_info.data_len && + _ma_read_cache(info, &sort_param->read_cache,to,block_info.filepos, + block_info.data_len, + (found_record == 1 ? READING_NEXT : 0) | + parallel_flag)) + { + _ma_check_print_info(param, + "Read error for block at: %s (error: %d); " + "Skipped", + llstr(block_info.filepos,llbuff),my_errno); + goto try_next; + } + left_length-=block_info.data_len; + to+=block_info.data_len; + pos=block_info.next_filepos; + if (pos == HA_OFFSET_ERROR && left_length) + { + _ma_check_print_info(param, + "Wrong block with wrong total length " + "starting at %s", + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + if (pos + MARIA_BLOCK_INFO_HEADER_LENGTH > + sort_param->read_cache.end_of_file) + { + _ma_check_print_info(param, + "Found link that points at %s (outside data " + "file) at %s", + llstr(pos,llbuff2), + llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + } while (left_length); + + if (_ma_rec_unpack(info,sort_param->record,sort_param->rec_buff, + sort_param->find_length) != MY_FILE_ERROR) + { + if (sort_param->read_cache.error < 0) + DBUG_RETURN(1); + if (sort_param->calc_checksum) + checksum= (share->calc_check_checksum)(info, sort_param->record); + if ((param->testflag & (T_EXTEND | T_REP)) || searching) + { + if (_ma_rec_check(info, sort_param->record, sort_param->rec_buff, + sort_param->find_length, + (param->testflag & T_QUICK) && + sort_param->calc_checksum && + MY_TEST(share->calc_checksum), checksum)) + { + _ma_check_print_info(param,"Found wrong packed record at %s", + 
llstr(sort_param->start_recpos,llbuff)); + goto try_next; + } + } + if (sort_param->calc_checksum) + param->glob_crc+= checksum; + DBUG_RETURN(0); + } + if (!searching) + _ma_check_print_info(param,"Key %d - Found wrong stored record at %s", + sort_param->key+1, + llstr(sort_param->start_recpos,llbuff)); + try_next: + pos=(sort_param->start_recpos+=MARIA_DYN_ALIGN_SIZE); + searching=1; + } + } + case COMPRESSED_RECORD: + param->progress= sort_param->pos; + for (searching=0 ;; searching=1, sort_param->pos++) + { + if (_ma_read_cache(info, &sort_param->read_cache, block_info.header, + sort_param->pos, + share->pack.ref_length,READING_NEXT)) + DBUG_RETURN(-1); + if (searching && ! sort_param->fix_datafile) + { + param->error_printed++; + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + my_errno= HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(1); /* Something wrong with data */ + } + sort_param->start_recpos=sort_param->pos; + if (_ma_pack_get_block_info(info, &sort_param->bit_buff, &block_info, + &sort_param->rec_buff, + &sort_param->rec_buff_size, -1, + sort_param->pos)) + DBUG_RETURN(-1); + if (!block_info.rec_len && + sort_param->pos + MEMMAP_EXTRA_MARGIN == + sort_param->read_cache.end_of_file) + DBUG_RETURN(-1); + if (block_info.rec_len < (uint) share->min_pack_length || + block_info.rec_len > (uint) share->max_pack_length) + { + if (! searching) + _ma_check_print_info(param, + "Found block with wrong recordlength: %lu " + "at %s\n", + block_info.rec_len, + llstr(sort_param->pos,llbuff)); + continue; + } + if (_ma_read_cache(info, &sort_param->read_cache, sort_param->rec_buff, + block_info.filepos, block_info.rec_len, + READING_NEXT)) + { + if (! 
searching) + _ma_check_print_info(param,"Couldn't read whole record from %s", + llstr(sort_param->pos,llbuff)); + continue; + } + sort_param->rec_buff[block_info.rec_len]= 0; /* Keep valgrind happy */ + if (_ma_pack_rec_unpack(info, &sort_param->bit_buff, sort_param->record, + sort_param->rec_buff, block_info.rec_len)) + { + if (! searching) + _ma_check_print_info(param,"Found wrong record at %s", + llstr(sort_param->pos,llbuff)); + continue; + } + if (!sort_param->fix_datafile) + { + sort_param->current_filepos= sort_param->pos; + if (sort_param->master) + share->state.split++; + } + sort_param->max_pos= (sort_param->pos=block_info.filepos+ + block_info.rec_len); + info->packed_length=block_info.rec_len; + + if (sort_param->calc_checksum) + { + info->cur_row.checksum= (*share->calc_check_checksum)(info, + sort_param-> + record); + param->glob_crc+= info->cur_row.checksum; + } + DBUG_RETURN(0); + } + case NO_RECORD: + DBUG_RETURN(1); /* Impossible */ + } + DBUG_RETURN(1); /* Impossible */ +} + + +/** + @brief Write record to new file. + + @fn _ma_sort_write_record() + @param sort_param Sort parameters. + + @note + This is only called by a master thread if parallel repair is used. + + @return + @retval 0 OK + sort_param->current_filepos points to inserted record for + block_records and to the place for the next record for + other row types. 
+ sort_param->filepos points to end of file + @retval 1 Error +*/ + +int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param) +{ + int flag; + uint length; + ulong block_length,reclength; + uchar *from; + uchar block_buff[8]; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param= sort_info->param; + MARIA_HA *info= sort_info->new_info; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_sort_write_record"); + + if (sort_param->fix_datafile) + { + sort_param->current_filepos= sort_param->filepos; + switch (sort_info->new_data_file_type) { + case BLOCK_RECORD: + if ((sort_param->current_filepos= + (*share->write_record_init)(info, sort_param->record)) == + HA_OFFSET_ERROR) + { + _ma_check_print_error(param, "%d when writing to datafile", my_errno); + DBUG_RETURN(1); + } + /* Pointer to end of file */ + sort_param->filepos= share->state.state.data_file_length; + break; + case STATIC_RECORD: + if (my_b_write(&info->rec_cache,sort_param->record, + share->base.pack_reclength)) + { + _ma_check_print_error(param,"%d when writing to datafile",my_errno); + DBUG_RETURN(1); + } + sort_param->filepos+=share->base.pack_reclength; + share->state.split++; + break; + case DYNAMIC_RECORD: + if (! 
info->blobs) + from=sort_param->rec_buff; + else + { + /* must be sure that local buffer is big enough */ + reclength=share->base.pack_reclength+ + _ma_calc_total_blob_length(info,sort_param->record)+ + ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+ + MARIA_DYN_DELETE_BLOCK_HEADER; + if (sort_info->buff_length < reclength) + { + if (!(sort_info->buff= my_realloc(PSI_INSTRUMENT_ME, sort_info->buff, + (uint) reclength, + MYF(MY_FREE_ON_ERROR | + MY_ALLOW_ZERO_PTR | + param->malloc_flags)))) + DBUG_RETURN(1); + sort_info->buff_length=reclength; + } + from= (uchar *) sort_info->buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER); + } + /* We can use info->checksum here as only one thread calls this */ + info->cur_row.checksum= (*share->calc_check_checksum)(info, + sort_param-> + record); + if (!(reclength= _ma_rec_pack(info,from,sort_param->record))) + { + _ma_check_print_error(param,"Got error %d when packing record", + my_errno); + DBUG_RETURN(1); + } + flag=0; + + do + { + block_length= reclength + 3 + MY_TEST(reclength >= (65520 - 3)); + if (block_length < share->base.min_block_length) + block_length=share->base.min_block_length; + info->update|=HA_STATE_WRITE_AT_END; + block_length=MY_ALIGN(block_length,MARIA_DYN_ALIGN_SIZE); + if (block_length > MARIA_MAX_BLOCK_LENGTH) + block_length=MARIA_MAX_BLOCK_LENGTH; + if (_ma_write_part_record(info,0L,block_length, + sort_param->filepos+block_length, + &from,&reclength,&flag)) + { + _ma_check_print_error(param,"%d when writing to datafile",my_errno); + DBUG_RETURN(1); + } + sort_param->filepos+=block_length; + share->state.split++; + } while (reclength); + break; + case COMPRESSED_RECORD: + reclength=info->packed_length; + length= _ma_save_pack_length((uint) share->pack.version, block_buff, + reclength); + if (share->base.blobs) + length+= _ma_save_pack_length((uint) share->pack.version, + block_buff + length, info->blob_length); + if (my_b_write(&info->rec_cache,block_buff,length) || + my_b_write(&info->rec_cache, 
sort_param->rec_buff, reclength)) + { + _ma_check_print_error(param,"%d when writing to datafile",my_errno); + DBUG_RETURN(1); + } + sort_param->filepos+=reclength+length; + share->state.split++; + break; + case NO_RECORD: + DBUG_RETURN(1); /* Impossible */ + } + } + if (sort_param->master) + { + share->state.state.records++; + if ((param->testflag & T_WRITE_LOOP) && + (share->state.state.records % WRITE_COUNT) == 0) + { + char llbuff[22]; + printf("%s\r", llstr(share->state.state.records,llbuff)); + fflush(stdout); + } + } + DBUG_RETURN(0); +} /* _ma_sort_write_record */
+
+
+/*
+  Compare two keys from _ma_create_index_by_sort
+
+  'a' and 'b' each point at a key pointer (uchar *); the keys they refer
+  to are compared as whole keys (USE_WHOLE_KEY, SEARCH_SAME) over
+  sort_param->seg. The difference position reported by ha_key_cmp() is
+  not used.
+
+  Returns the value of ha_key_cmp().
+*/
+
+static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a,
+                        const void *b)
+{
+  uint ignored_diff_pos[2];
+  uchar *first_key=  *((uchar* const *) a);
+  uchar *second_key= *((uchar* const *) b);
+
+  return ha_key_cmp(sort_param->seg, first_key, second_key,
+                    USE_WHOLE_KEY, SEARCH_SAME, ignored_diff_pos);
+} /* sort_key_cmp */
+ + +static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a) +{ + uint diff_pos[2]; + char llbuff[22],llbuff2[22]; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param= sort_info->param; + MARIA_HA *info= sort_info->info; + int cmp; + + if (sort_info->key_block->inited) + { + cmp= ha_key_cmp(sort_param->seg, sort_info->key_block->lastkey, + a, USE_WHOLE_KEY, + SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT, + diff_pos); + if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL) + ha_key_cmp(sort_param->seg, sort_info->key_block->lastkey, + a, USE_WHOLE_KEY, + SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diff_pos); + else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + { + diff_pos[0]= maria_collect_stats_nonulls_next(sort_param->seg, + sort_param->notnull, + sort_info->key_block->lastkey, + a); + } + sort_param->unique[diff_pos[0]-1]++; + } + else + { + cmp= -1; + if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS) + maria_collect_stats_nonulls_first(sort_param->seg, sort_param->notnull, + a); + } + if ((sort_param->keyinfo->flag & 
HA_NOSAME) && cmp == 0) + { + DBUG_EXECUTE("key", _ma_print_keydata(DBUG_FILE, sort_param->seg, a, + USE_WHOLE_KEY);); + sort_info->dupp++; + sort_info->info->cur_row.lastpos= get_record_for_key(sort_param->keyinfo, + a); + if ((param->testflag & (T_CREATE_UNIQUE_BY_SORT | T_SUPPRESS_ERR_HANDLING)) + == T_CREATE_UNIQUE_BY_SORT) + param->testflag|= T_SUPPRESS_ERR_HANDLING; + _ma_check_print_warning(param, + "Duplicate key %2u for record at %10s against " + "record at %10s", + sort_param->key + 1, + record_pos_to_txt(info, + sort_info->info->cur_row.lastpos, + llbuff), + record_pos_to_txt(info, + get_record_for_key(sort_param-> + keyinfo, + sort_info->key_block->lastkey), + llbuff2)); + param->testflag|=T_RETRY_WITHOUT_QUICK; + if (sort_info->param->testflag & T_VERBOSE) + _ma_print_keydata(stdout,sort_param->seg, a, USE_WHOLE_KEY); + return (sort_delete_record(sort_param)); + } +#ifndef DBUG_OFF + if (cmp > 0) + { + _ma_check_print_error(param, + "Internal error: Keys are not in order from sort"); + return(1); + } +#endif + return (sort_insert_key(sort_param, sort_info->key_block, + a, HA_OFFSET_ERROR)); +} /* sort_key_write */ + + +int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param) +{ + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + MA_SORT_KEY_BLOCKS *key_block=sort_info->key_block; + MARIA_SHARE *share=sort_info->info->s; + uint val_off, val_len; + int error; + MA_SORT_FT_BUF *maria_ft_buf=sort_info->ft_buf; + uchar *from, *to; + + val_len=share->ft2_keyinfo.keylength; + get_key_full_length_rdonly(val_off, maria_ft_buf->lastkey); + to= maria_ft_buf->lastkey+val_off; + + if (maria_ft_buf->buf) + { + /* flushing first-level tree */ + error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey, + HA_OFFSET_ERROR); + for (from=to+val_len; + !error && from < maria_ft_buf->buf; + from+= val_len) + { + memcpy(to, from, val_len); + error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey, + HA_OFFSET_ERROR); + } + return error; + } + /* flushing 
second-level tree keyblocks */ + error=_ma_flush_pending_blocks(sort_param); + /* updating lastkey with second-level tree info */ + ft_intXstore(maria_ft_buf->lastkey+val_off, -maria_ft_buf->count); + _ma_dpointer(sort_info->info->s, maria_ft_buf->lastkey+val_off+HA_FT_WLEN, + share->state.key_root[sort_param->key]); + /* restoring first level tree data in sort_info/sort_param */ + sort_info->key_block=sort_info->key_block_end- sort_info->param->sort_key_blocks; + sort_param->keyinfo=share->keyinfo+sort_param->key; + share->state.key_root[sort_param->key]=HA_OFFSET_ERROR; + /* writing lastkey in first-level tree */ + return error ? error : + sort_insert_key(sort_param,sort_info->key_block, + maria_ft_buf->lastkey,HA_OFFSET_ERROR); +} + + +static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param, + const uchar *a) +{ + uint a_len, val_off, val_len, error; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + MA_SORT_FT_BUF *ft_buf= sort_info->ft_buf; + MA_SORT_KEY_BLOCKS *key_block= sort_info->key_block; + MARIA_SHARE *share= sort_info->info->s; + + val_len=HA_FT_WLEN+share->rec_reflength; + get_key_full_length_rdonly(a_len, a); + + if (!ft_buf) + { + /* + use two-level tree only if key_reflength fits in rec_reflength place + and row format is NOT static - for _ma_dpointer not to garble offsets + */ + if ((share->base.key_reflength <= + share->rec_reflength) && + (share->options & + (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD))) + ft_buf= (MA_SORT_FT_BUF *)my_malloc(PSI_INSTRUMENT_ME, + sort_param->keyinfo->block_length + + sizeof(MA_SORT_FT_BUF), + MYF(MY_WME | + sort_param->sort_info->param-> + malloc_flags)); + + if (!ft_buf) + { + sort_param->key_write=sort_key_write; + return sort_key_write(sort_param, a); + } + sort_info->ft_buf= ft_buf; + goto word_init_ft_buf; /* no need to duplicate the code */ + } + get_key_full_length_rdonly(val_off, ft_buf->lastkey); + + if (ha_compare_word(sort_param->seg->charset, + a + 1, a_len - 1, + ft_buf->lastkey + 
1, val_off - 1) == 0) + { + uchar *p; + if (!ft_buf->buf) /* store in second-level tree */ + { + ft_buf->count++; + return sort_insert_key(sort_param,key_block, + a + a_len, HA_OFFSET_ERROR); + } + + /* storing the key in the buffer. */ + memcpy (ft_buf->buf, (const char *)a+a_len, val_len); + ft_buf->buf+=val_len; + if (ft_buf->buf < ft_buf->end) + return 0; + + /* converting to two-level tree */ + p=ft_buf->lastkey+val_off; + + while (key_block->inited) + key_block++; + sort_info->key_block=key_block; + sort_param->keyinfo= &share->ft2_keyinfo; + ft_buf->count=(uint)(ft_buf->buf - p)/val_len; + + /* flushing buffer to second-level tree */ + for (error=0; !error && p < ft_buf->buf; p+= val_len) + error=sort_insert_key(sort_param,key_block,p,HA_OFFSET_ERROR); + ft_buf->buf=0; + return error; + } + + /* flushing buffer */ + if ((error=_ma_sort_ft_buf_flush(sort_param))) + return error; + +word_init_ft_buf: + a_len+=val_len; + memcpy(ft_buf->lastkey, a, a_len); + ft_buf->buf=ft_buf->lastkey+a_len; + /* + 32 is just a safety margin here + (at least MY_MAX(val_len, sizeof(nod_flag)) should be there). + May be better performance could be achieved if we'd put + (sort_info->keyinfo->block_length-32)/XXX + instead. + TODO: benchmark the best value for XXX. 
+ */ + ft_buf->end= ft_buf->lastkey+ (sort_param->keyinfo->block_length-32); + return 0; +} /* sort_maria_ft_key_write */
+
+
+/*
+  Get the record position that a stored key refers to
+
+  Builds a temporary MARIA_KEY over 'key_data' and hands it to
+  _ma_row_pos_from_key(). Only keyinfo, data and data_length are filled
+  in; data_length is the full key length minus the share's rec_reflength
+  (presumably the trailing row reference that _ma_row_pos_from_key()
+  decodes - confirm against its definition).
+*/
+
+static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo,
+                                   const uchar *key_data)
+{
+  MARIA_KEY row_key;
+
+  row_key.data_length= _ma_keylength(keyinfo, key_data) -
+                       keyinfo->share->rec_reflength;
+  row_key.data= (uchar*) key_data;
+  row_key.keyinfo= keyinfo;
+  return _ma_row_pos_from_key(&row_key);
+} /* get_record_for_key */
+ + +/* Insert a key in sort-key-blocks */ + +static int sort_insert_key(MARIA_SORT_PARAM *sort_param, + register MA_SORT_KEY_BLOCKS *key_block, + const uchar *key, + my_off_t prev_block) +{ + uint a_length,t_length,nod_flag; + my_off_t filepos; + uchar *anc_buff,*lastkey; + MARIA_KEY_PARAM s_temp; + MARIA_KEYDEF *keyinfo=sort_param->keyinfo; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + HA_CHECK *param=sort_info->param; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_KEY tmp_key; + MARIA_HA *info= sort_info->info; + MARIA_SHARE *share= info->s; + DBUG_ENTER("sort_insert_key"); + + anc_buff= key_block->buff; + lastkey=key_block->lastkey; + nod_flag= (key_block == sort_info->key_block ? 
0 : + share->base.key_reflength); + + if (!key_block->inited) + { + key_block->inited=1; + if (key_block == sort_info->key_block_end) + { + _ma_check_print_error(param, + "To many key-block-levels; " + "Try increasing sort_key_blocks"); + DBUG_RETURN(1); + } + a_length= share->keypage_header + nod_flag; + key_block->end_pos= anc_buff + share->keypage_header; + bzero(anc_buff, share->keypage_header); + _ma_store_keynr(share, anc_buff, sort_param->keyinfo->key_nr); + lastkey=0; /* No previous key in block */ + } + else + a_length= _ma_get_page_used(share, anc_buff); + + /* Save pointer to previous block */ + if (nod_flag) + { + _ma_store_keypage_flag(share, anc_buff, KEYPAGE_FLAG_ISNOD); + _ma_kpointer(info,key_block->end_pos,prev_block); + } + + tmp_key.keyinfo= keyinfo; + tmp_key.data= (uchar*) key; + tmp_key.data_length= _ma_keylength(keyinfo, key) - share->rec_reflength; + tmp_key.ref_length= share->rec_reflength; + + t_length= (*keyinfo->pack_key)(&tmp_key, nod_flag, + (uchar*) 0, lastkey, lastkey, &s_temp); + (*keyinfo->store_key)(keyinfo, key_block->end_pos+nod_flag,&s_temp); + a_length+=t_length; + _ma_store_page_used(share, anc_buff, a_length); + key_block->end_pos+=t_length; + if (a_length <= share->max_index_block_size) + { + MARIA_KEY tmp_key2; + tmp_key2.data= key_block->lastkey; + _ma_copy_key(&tmp_key2, &tmp_key); + key_block->last_length=a_length-t_length; + DBUG_RETURN(0); + } + + /* Fill block with end-zero and write filled block */ + _ma_store_page_used(share, anc_buff, key_block->last_length); + bzero(anc_buff+key_block->last_length, + keyinfo->block_length- key_block->last_length); + if ((filepos= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR) + DBUG_RETURN(1); + _ma_fast_unlock_key_del(info); + + /* If we read the page from the key cache, we have to write it back to it */ + if (page_link->changed) + { + MARIA_PAGE page; + pop_dynamic(&info->pinned_pages); + _ma_page_setup(&page, info, keyinfo, filepos, anc_buff); + if 
(_ma_write_keypage(&page, PAGECACHE_LOCK_WRITE_UNLOCK, DFLT_INIT_HITS)) + DBUG_RETURN(1); + } + else + { + if (write_page(share, share->kfile.file, anc_buff, + keyinfo->block_length, filepos, param->myf_rw)) + DBUG_RETURN(1); + } + DBUG_DUMP("buff", anc_buff, _ma_get_page_used(share, anc_buff)); + + /* Write separator-key to block in next level */ + if (sort_insert_key(sort_param,key_block+1,key_block->lastkey,filepos)) + DBUG_RETURN(1); + + /* clear old block and write new key in it */ + key_block->inited=0; + DBUG_RETURN(sort_insert_key(sort_param, key_block,key,prev_block)); +} /* sort_insert_key */ + + +/* Delete record when we found a duplicated key */ + +static int sort_delete_record(MARIA_SORT_PARAM *sort_param) +{ + uint i; + int old_file,error; + uchar *key; + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; + MARIA_HA *row_info= sort_info->new_info, *key_info= sort_info->info; + DBUG_ENTER("sort_delete_record"); + + if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK) + { + _ma_check_print_error(param, + "Quick-recover aborted; Run recovery without switch " + "-q or with switch -qq"); + DBUG_RETURN(1); + } + if (key_info->s->options & HA_OPTION_COMPRESS_RECORD) + { + _ma_check_print_error(param, + "Recover aborted; Can't run standard recovery on " + "compressed tables with errors in data-file. 
" + "Use 'aria_chk --safe-recover' to fix it"); + DBUG_RETURN(1); + } + + old_file= row_info->dfile.file; + /* This only affects static and dynamic row formats */ + row_info->dfile.file= row_info->rec_cache.file; + if (flush_io_cache(&row_info->rec_cache)) + DBUG_RETURN(1); + + key= key_info->lastkey_buff + key_info->s->base.max_key_length; + if ((error=(*row_info->s->read_record)(row_info, sort_param->record, + key_info->cur_row.lastpos)) && + error != HA_ERR_RECORD_DELETED) + { + _ma_check_print_error(param,"Can't read record to be removed"); + row_info->dfile.file= old_file; + DBUG_RETURN(1); + } + row_info->cur_row.lastpos= key_info->cur_row.lastpos; + + for (i=0 ; i < sort_info->current_key ; i++) + { + MARIA_KEY tmp_key; + (*key_info->s->keyinfo[i].make_key)(key_info, &tmp_key, i, key, + sort_param->record, + key_info->cur_row.lastpos, 0); + if (_ma_ck_delete(key_info, &tmp_key)) + { + _ma_check_print_error(param, + "Can't delete key %d from record to be removed", + i+1); + row_info->dfile.file= old_file; + DBUG_RETURN(1); + } + } + if (sort_param->calc_checksum) + param->glob_crc-=(*key_info->s->calc_check_checksum)(key_info, + sort_param->record); + error= (*row_info->s->delete_record)(row_info, sort_param->record); + if (error) + _ma_check_print_error(param,"Got error %d when deleting record", + my_errno); + row_info->dfile.file= old_file; /* restore actual value */ + row_info->s->state.state.records--; + DBUG_RETURN(error); +} /* sort_delete_record */ + + +/* Fix all pending blocks and flush everything to disk */ + +int _ma_flush_pending_blocks(MARIA_SORT_PARAM *sort_param) +{ + uint nod_flag,length; + my_off_t filepos; + MA_SORT_KEY_BLOCKS *key_block; + MARIA_SORT_INFO *sort_info= sort_param->sort_info; + myf myf_rw=sort_info->param->myf_rw; + MARIA_HA *info=sort_info->info; + MARIA_KEYDEF *keyinfo=sort_param->keyinfo; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + DBUG_ENTER("_ma_flush_pending_blocks"); + + filepos= HA_OFFSET_ERROR; /* 
if empty file */ + nod_flag=0; + for (key_block=sort_info->key_block ; key_block->inited ; key_block++) + { + key_block->inited=0; + length= _ma_get_page_used(info->s, key_block->buff); + if (nod_flag) + _ma_kpointer(info,key_block->end_pos,filepos); + bzero(key_block->buff+length, keyinfo->block_length-length); + if ((filepos= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) + goto err; + + /* If we read the page from the key cache, we have to write it back */ + if (page_link->changed) + { + MARIA_PAGE page; + pop_dynamic(&info->pinned_pages); + + _ma_page_setup(&page, info, keyinfo, filepos, key_block->buff); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_WRITE_UNLOCK, + DFLT_INIT_HITS)) + goto err; + } + else + { + if (write_page(info->s, info->s->kfile.file, key_block->buff, + keyinfo->block_length, filepos, myf_rw)) + goto err; + } + DBUG_DUMP("buff",key_block->buff,length); + nod_flag=1; + } + info->s->state.key_root[sort_param->key]=filepos; /* Last is root for tree */ + _ma_fast_unlock_key_del(info); + DBUG_RETURN(0); + +err: + _ma_fast_unlock_key_del(info); + DBUG_RETURN(1); +} /* _ma_flush_pending_blocks */
+
+/**
+  Allocate space and pointers for the sort key blocks.
+
+  A single my_malloc() area holds 'blocks' MA_SORT_KEY_BLOCKS descriptors
+  followed by 'blocks' buffers of (buffer_length + IO_SIZE) bytes each.
+  Every descriptor gets inited=0 and buff pointing at its own buffer.
+
+  @param param          Check context; supplies malloc_flags and receives
+                        the error message on failure
+  @param blocks         Number of key blocks to allocate
+  @param buffer_length  Size of each buffer, before the IO_SIZE extra bytes
+
+  @return Pointer to the first descriptor, or 0 if the allocation failed
+          (the error has then been reported through 'param').
+*/
+
+static MA_SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
+                                            uint buffer_length)
+{
+  reg1 uint i;
+  MA_SORT_KEY_BLOCKS *block;
+  DBUG_ENTER("alloc_key_blocks");
+
+  if (!(block= (MA_SORT_KEY_BLOCKS*)
+        my_malloc(PSI_INSTRUMENT_ME,
+                  (sizeof(MA_SORT_KEY_BLOCKS)+buffer_length+IO_SIZE)*blocks,
+                  MYF(param->malloc_flags))))
+  {
+    _ma_check_print_error(param,"Not enough memory for sort-key-blocks");
+    /* Must leave through DBUG_RETURN to balance the DBUG_ENTER above */
+    DBUG_RETURN(0);
+  }
+  for (i=0 ; i < blocks ; i++)
+  {
+    block[i].inited=0;
+    /* Buffers are laid out back to back right after the descriptor array */
+    block[i].buff= (uchar*) (block+blocks)+(buffer_length+IO_SIZE)*i;
+  }
+  DBUG_RETURN(block);
+} /* alloc_key_blocks */
+ + + /* Check if file is almost full */ + +int maria_test_if_almost_full(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + + if (share->options & HA_OPTION_COMPRESS_RECORD) + return 0; + 
return mysql_file_seek(share->kfile.file, 0L, MY_SEEK_END, + MYF(MY_THREADSAFE))/10*9 > + (my_off_t) share->base.max_key_file_length || + mysql_file_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) / 10 * 9 > + (my_off_t) share->base.max_data_file_length; +} + + +/* Recreate table with bigger more alloced record-data */ + +int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename) +{ + int error; + MARIA_HA info; + MARIA_SHARE share; + MARIA_KEYDEF *keyinfo,*key,*key_end; + HA_KEYSEG *keysegs,*keyseg; + MARIA_COLUMNDEF *columndef,*column,*end; + MARIA_UNIQUEDEF *uniquedef,*u_ptr,*u_end; + MARIA_STATUS_INFO status_info; + uint unpack,key_parts; + ha_rows max_records; + ulonglong file_length,tmp_length; + MARIA_CREATE_INFO create_info; + DBUG_ENTER("maria_recreate_table"); + + if ((!(param->testflag & T_SILENT))) + printf("Recreating table '%s'\n", param->isam_file_name); + + error=1; /* Default error */ + info= **org_info; + status_info= (*org_info)->state[0]; + info.state= &status_info; + share= *(*org_info)->s; + unpack= ((share.data_file_type == COMPRESSED_RECORD) && + (param->testflag & T_UNPACK)); + if (!(keyinfo=(MARIA_KEYDEF*) my_alloca(sizeof(MARIA_KEYDEF) * + share.base.keys))) + DBUG_RETURN(0); + memcpy((uchar*) keyinfo,(uchar*) share.keyinfo, + (size_t) (sizeof(MARIA_KEYDEF)*share.base.keys)); + + key_parts= share.base.all_key_parts; + if (!(keysegs=(HA_KEYSEG*) my_alloca(sizeof(HA_KEYSEG)* + (key_parts+share.base.keys)))) + { + my_afree(keyinfo); + DBUG_RETURN(1); + } + if (!(columndef=(MARIA_COLUMNDEF*) + my_alloca(sizeof(MARIA_COLUMNDEF)*(share.base.fields+1)))) + { + my_afree(keyinfo); + my_afree(keysegs); + DBUG_RETURN(1); + } + if (!(uniquedef=(MARIA_UNIQUEDEF*) + my_alloca(sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques+1)))) + { + my_afree(columndef); + my_afree(keyinfo); + my_afree(keysegs); + DBUG_RETURN(1); + } + + /* Copy the column definitions in their original order */ + for (column= share.columndef, end= 
share.columndef+share.base.fields; + column != end ; + column++) + columndef[column->column_nr]= *column; + + /* Change the new key to point at the saved key segments */ + memcpy((uchar*) keysegs,(uchar*) share.keyparts, + (size_t) (sizeof(HA_KEYSEG)*(key_parts+share.base.keys+ + share.state.header.uniques))); + keyseg=keysegs; + for (key=keyinfo,key_end=keyinfo+share.base.keys; key != key_end ; key++) + { + key->seg=keyseg; + for (; keyseg->type ; keyseg++) + { + if (param->language) + keyseg->language=param->language; /* change language */ + } + keyseg++; /* Skip end pointer */ + } + + /* + Copy the unique definitions and change them to point at the new key + segments + */ + memcpy((uchar*) uniquedef,(uchar*) share.uniqueinfo, + (size_t) (sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques))); + for (u_ptr=uniquedef,u_end=uniquedef+share.state.header.uniques; + u_ptr != u_end ; u_ptr++) + { + u_ptr->seg=keyseg; + keyseg+=u_ptr->keysegs+1; + } + + file_length=(ulonglong) mysql_file_seek(info.dfile.file, 0L, MY_SEEK_END, MYF(0)); + if (share.options & HA_OPTION_COMPRESS_RECORD) + share.base.records=max_records=info.state->records; + else if (share.base.min_pack_length) + max_records=(ha_rows) (file_length / share.base.min_pack_length); + else + max_records=0; + share.options&= ~HA_OPTION_TEMP_COMPRESS_RECORD; + + tmp_length= file_length+file_length/10; + set_if_bigger(file_length,param->max_data_file_length); + set_if_bigger(file_length,tmp_length); + set_if_bigger(file_length,(ulonglong) share.base.max_data_file_length); + + maria_close(*org_info); + + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=MY_MAX(max_records,share.base.records); + create_info.reloc_rows=share.base.reloc; + create_info.old_options=(share.options | + (unpack ? HA_OPTION_TEMP_COMPRESS_RECORD : 0)); + + create_info.data_file_length=file_length; + create_info.auto_increment=share.state.auto_increment; + create_info.language = (param->language ? 
param->language : + share.base.language); + create_info.key_file_length= status_info.key_file_length; + create_info.org_data_file_type= ((enum data_file_type) + share.state.header.org_data_file_type); + + /* + Allow for creating an auto_increment key. This has an effect only if + an auto_increment key exists in the original table. + */ + create_info.with_auto_increment= TRUE; + create_info.null_bytes= share.base.null_bytes; + create_info.transactional= share.base.born_transactional; + + /* + We don't have to handle symlinks here because we are using + HA_DONT_TOUCH_DATA + */ + if (maria_create(filename, share.data_file_type, + share.base.keys - share.state.header.uniques, + keyinfo, share.base.fields, columndef, + share.state.header.uniques, uniquedef, + &create_info, + HA_DONT_TOUCH_DATA)) + { + _ma_check_print_error(param, + "Got error %d when trying to recreate indexfile", + my_errno); + goto end; + } + *org_info= maria_open(filename,O_RDWR, + (HA_OPEN_FOR_REPAIR | + ((param->testflag & T_WAIT_FOREVER) ? + HA_OPEN_WAIT_IF_LOCKED : + (param->testflag & T_DESCRIPT) ? 
+ HA_OPEN_IGNORE_IF_LOCKED : + HA_OPEN_ABORT_IF_LOCKED)), 0); + if (!*org_info) + { + _ma_check_print_error(param, + "Got error %d when trying to open re-created " + "indexfile", my_errno); + goto end; + } + /* We are modifing */ + (*org_info)->s->options&= ~HA_OPTION_READ_ONLY_DATA; + _ma_readinfo(*org_info,F_WRLCK,0); + (*org_info)->s->state.state.records= info.state->records; + if (share.state.create_time) + (*org_info)->s->state.create_time=share.state.create_time; +#ifdef MARIA_EXTERNAL_LOCKING + (*org_info)->s->state.unique= (*org_info)->this_unique= share.state.unique; +#endif + (*org_info)->s->state.state.checksum= info.state->checksum; + (*org_info)->s->state.state.del= info.state->del; + (*org_info)->s->state.dellink= share.state.dellink; + (*org_info)->s->state.state.empty= info.state->empty; + (*org_info)->s->state.state.data_file_length= info.state->data_file_length; + *(*org_info)->state= (*org_info)->s->state.state; + if (maria_update_state_info(param,*org_info,UPDATE_TIME | UPDATE_STAT | + UPDATE_OPEN_COUNT)) + goto end; + error=0; +end: + my_afree(uniquedef); + my_afree(keyinfo); + my_afree(columndef); + my_afree(keysegs); + DBUG_RETURN(error); +} + + +/* Write suffix to data file if needed */ + +int maria_write_data_suffix(MARIA_SORT_INFO *sort_info, my_bool fix_datafile) +{ + MARIA_HA *info=sort_info->new_info; + + if (info->s->data_file_type == COMPRESSED_RECORD && fix_datafile) + { + uchar buff[MEMMAP_EXTRA_MARGIN]; + bzero(buff,sizeof(buff)); + if (my_b_write(&info->rec_cache,buff,sizeof(buff))) + { + _ma_check_print_error(sort_info->param, + "%d when writing to datafile",my_errno); + return 1; + } + sort_info->param->read_cache.end_of_file+=sizeof(buff); + } + return 0; +} + + +/* Update state and maria_chk time of indexfile */ + +int maria_update_state_info(HA_CHECK *param, MARIA_HA *info,uint update) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_update_state_info"); + + if (info->s->no_status_updates) + DBUG_RETURN(0); /* S3 
readonly table */ + + if (update & UPDATE_OPEN_COUNT) + { + share->state.open_count=0; + share->global_changed=0; + share->changed= 1; + } + if (update & UPDATE_STAT) + { + uint i, key_parts= mi_uint2korr(share->state.header.key_parts); + share->state.records_at_analyze= share->state.state.records; + share->state.changed&= ~STATE_NOT_ANALYZED; + if (share->state.state.records) + { + for (i=0; i<key_parts; i++) + { + if (!(share->state.rec_per_key_part[i]=param->new_rec_per_key_part[i])) + share->state.changed|= STATE_NOT_ANALYZED; + } + } + } + if (update & (UPDATE_STAT | UPDATE_SORT | UPDATE_TIME | UPDATE_AUTO_INC)) + { + if (update & UPDATE_TIME) + { + share->state.check_time= time((time_t*) 0); + if (!share->state.create_time) + share->state.create_time= share->state.check_time; + } + if (_ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO)) + goto err; + } + { /* Force update of status */ + int error; + uint r_locks=share->r_locks,w_locks=share->w_locks; + share->r_locks= share->w_locks= share->tot_locks= 0; + error= _ma_writeinfo(info,WRITEINFO_NO_UNLOCK); + share->r_locks=r_locks; + share->w_locks=w_locks; + share->tot_locks=r_locks+w_locks; + if (!error) + DBUG_RETURN(0); + } +err: + _ma_check_print_error(param,"%d when updating keyfile",my_errno); + DBUG_RETURN(1); +} + +/* + Update auto increment value for a table + When setting the 'repair_only' flag we only want to change the + old auto_increment value if its wrong (smaller than some given key). + The reason is that we shouldn't change the auto_increment value + for a table without good reason when only doing a repair; If the + user have inserted and deleted rows, the auto_increment value + may be bigger than the biggest current row and this is ok. + + If repair_only is not set, we will update the flag to the value in + param->auto_increment is bigger than the biggest key. 
+*/ + +void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info, + my_bool repair_only) +{ + MARIA_SHARE *share= info->s; + uchar *record; + DBUG_ENTER("update_auto_increment_key"); + + if (!share->base.auto_key || + ! maria_is_key_active(share->state.key_map, share->base.auto_key - 1)) + { + if (!(param->testflag & T_VERY_SILENT)) + _ma_check_print_info(param, + "Table: %s doesn't have an auto increment key\n", + param->isam_file_name); + DBUG_VOID_RETURN; + } + if (!(param->testflag & T_SILENT) && + !(param->testflag & T_REP)) + printf("Updating Aria file: %s\n", param->isam_file_name); + /* + We have to use an allocated buffer instead of info->rec_buff as + _ma_put_key_in_record() may use info->rec_buff + */ + if (!(record= (uchar*) my_malloc(PSI_INSTRUMENT_ME, + (size_t) share->base.default_rec_buff_size, + MYF(param->malloc_flags)))) + { + _ma_check_print_error(param,"Not enough memory for extra record"); + DBUG_VOID_RETURN; + } + + maria_extra(info,HA_EXTRA_KEYREAD,0); + if (maria_rlast(info, record, share->base.auto_key-1)) + { + if (my_errno != HA_ERR_END_OF_FILE) + { + maria_extra(info,HA_EXTRA_NO_KEYREAD,0); + my_free(record); + _ma_check_print_error(param,"%d when reading last record",my_errno); + DBUG_VOID_RETURN; + } + if (!repair_only) + share->state.auto_increment=param->auto_increment_value; + } + else + { + const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg; + ulonglong auto_increment= + ma_retrieve_auto_increment(record + keyseg->start, keyseg->type); + set_if_bigger(share->state.auto_increment,auto_increment); + if (!repair_only) + set_if_bigger(share->state.auto_increment, param->auto_increment_value); + } + maria_extra(info,HA_EXTRA_NO_KEYREAD,0); + my_free(record); + maria_update_state_info(param, info, UPDATE_AUTO_INC); + DBUG_VOID_RETURN; +} + + +/* + Update statistics for each part of an index + + SYNOPSIS + maria_update_key_parts() + keyinfo IN Index information (only key->keysegs used) + rec_per_key_part OUT 
Store statistics here + unique IN Array of (#distinct tuples) + notnull_tuples IN Array of (#tuples), or NULL + records Number of records in the table + + DESCRIPTION + This function is called to produce index statistics values from the unique + and notnull_tuples arrays after these arrays were produced with a sequential + index scan (the scan is done in two places: chk_index() and + sort_key_write()). + + This function handles all 3 index statistics collection methods. + + Unique is an array: + unique[0]= (#different values of {keypart1}) - 1 + unique[1]= (#different values of {keypart1,keypart2} tuple)-unique[0]-1 + ... + + For MI_STATS_METHOD_IGNORE_NULLS method, notnull_tuples is an array too: + notnull_tuples[0]= (#of {keypart1} tuples such that keypart1 is not NULL) + notnull_tuples[1]= (#of {keypart1,keypart2} tuples such that all + keypart{i} are not NULL) + ... + For all other statistics collection methods notnull_tuples==NULL. + + Output is an array: + rec_per_key_part[k] = + = E(#records in the table such that keypart_1=c_1 AND ... AND + keypart_k=c_k for arbitrary constants c_1 ... c_k) + + = {assuming that values have uniform distribution and index contains all + tuples from the domain (or that the {c_1, ..., c_k} tuple is chosen from + the index tuples)} + + = #tuples-in-the-index / #distinct-tuples-in-the-index. + + The #tuples-in-the-index and #distinct-tuples-in-the-index have different + meaning depending on which statistics collection method is used: + + MI_STATS_METHOD_* how are nulls compared? which tuples are counted?
+ NULLS_EQUAL NULL == NULL all tuples in table + NULLS_NOT_EQUAL NULL != NULL all tuples in table + IGNORE_NULLS n/a tuples that don't have NULLs +*/ + +void maria_update_key_parts(MARIA_KEYDEF *keyinfo, double *rec_per_key_part, + ulonglong *unique, ulonglong *notnull, + ulonglong records) +{ + ulonglong count=0, unique_tuples; + ulonglong tuples= records; + uint parts; + double tmp; + for (parts=0 ; parts < keyinfo->keysegs ; parts++) + { + count+=unique[parts]; + unique_tuples= count + 1; + if (notnull) + { + tuples= notnull[parts]; + /* + #(unique_tuples not counting tuples with NULLs) = + #(unique_tuples counting tuples with NULLs as different) - + #(tuples with NULLs) + */ + unique_tuples -= (records - notnull[parts]); + } + + if (unique_tuples == 0) + tmp= 1; + else if (count == 0) + tmp= ulonglong2double(tuples); /* 1 unique tuple */ + else + tmp= ulonglong2double(tuples) / ulonglong2double(unique_tuples); + + /* + for some weird keys (e.g. FULLTEXT) tmp can be <1 here. + let's ensure it is not + */ + set_if_bigger(tmp,1); + + *rec_per_key_part++= tmp; + } +} + + +static ha_checksum maria_byte_checksum(const uchar *buf, uint length) +{ + ha_checksum crc; + const uchar *end=buf+length; + for (crc=0; buf != end; buf++) + crc=((crc << 1) + *buf) + + MY_TEST(crc & (((ha_checksum) 1) << (8 * sizeof(ha_checksum) - 1))); + return crc; +} + +my_bool maria_too_big_key_for_sort(MARIA_KEYDEF *key, ha_rows rows) +{ + uint key_maxlength=key->maxlength; + if (key->flag & HA_FULLTEXT) + { + uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT* + key->seg->charset->mbmaxlen; + key_maxlength+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN; + } + return (key->flag & HA_SPATIAL) || + (key->flag & (HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY | HA_FULLTEXT) && + ((ulonglong) rows * key_maxlength > + (ulonglong) maria_max_temp_length)); +} + +/* + Return TRUE if we can use repair by sorting + One can set the force argument to force to use sorting + even if the temporary file would be 
quite big! +*/ + +my_bool maria_test_if_sort_rep(MARIA_HA *info, ha_rows rows, + ulonglong key_map, my_bool force) +{ + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *key=share->keyinfo; + uint i; + + /* + maria_repair_by_sort only works if we have at least one key. If we don't + have any keys, we should use the normal repair. + */ + if (! maria_is_any_key_active(key_map)) + return FALSE; /* Can't use sort */ + for (i=0 ; i < share->base.keys ; i++,key++) + { + if (!force && maria_too_big_key_for_sort(key,rows)) + return FALSE; + } + return TRUE; +} + + +/** + @brief Create a new handle for manipulation the new record file + + @note + It's ok for Recovery to have two MARIA_SHARE on the same index file + because the one we create here is not transactional +*/ + +static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file) +{ + + MARIA_SORT_INFO *sort_info= param->sort_info; + MARIA_HA *info= sort_info->info; + MARIA_HA *new_info; + DBUG_ENTER("create_new_data_handle"); + + if (!(sort_info->new_info= maria_open(info->s->open_file_name.str, O_RDWR, + HA_OPEN_COPY | HA_OPEN_FOR_REPAIR | + HA_OPEN_INTERNAL_TABLE, 0))) + DBUG_RETURN(1); + + new_info= sort_info->new_info; + _ma_bitmap_set_pagecache_callbacks(&new_info->s->bitmap.file, + new_info->s); + _ma_set_data_pagecache_callbacks(&new_info->dfile, new_info->s); + change_data_file_descriptor(new_info, new_file); + maria_lock_database(new_info, F_EXTRA_LCK); + if ((sort_info->param->testflag & T_UNPACK) && + info->s->data_file_type == COMPRESSED_RECORD) + { + (*new_info->s->once_end)(new_info->s); + (*new_info->s->end)(new_info); + restore_data_file_type(new_info->s); + _ma_setup_functions(new_info->s); + if ((*new_info->s->once_init)(new_info->s, new_file) || + (*new_info->s->init)(new_info)) + DBUG_RETURN(1); + } + _ma_reset_status(new_info); + if (_ma_initialize_data_file(new_info->s, new_file)) + DBUG_RETURN(1); + + /* Take into account any bitmap page created above: */ + param->filepos= 
new_info->s->state.state.data_file_length; + + /* Use new virtual functions for key generation */ + info->s->keypos_to_recpos= new_info->s->keypos_to_recpos; + info->s->recpos_to_keypos= new_info->s->recpos_to_keypos; + DBUG_RETURN(0); +} + + +static void +set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share) +{ + if ((sort_info->new_data_file_type=share->data_file_type) == + COMPRESSED_RECORD && sort_info->param->testflag & T_UNPACK) + { + MARIA_SHARE tmp; + sort_info->new_data_file_type= share->state.header.org_data_file_type; + /* Set delete_function for sort_delete_record() */ + tmp= *share; + tmp.state.header.data_file_type= tmp.state.header.org_data_file_type; + tmp.options= ~HA_OPTION_COMPRESS_RECORD; + _ma_setup_functions(&tmp); + share->delete_record=tmp.delete_record; + } +} + +static void restore_data_file_type(MARIA_SHARE *share) +{ + MARIA_SHARE tmp_share; + share->options&= ~HA_OPTION_COMPRESS_RECORD; + mi_int2store(share->state.header.options,share->options); + share->state.header.data_file_type= + share->state.header.org_data_file_type; + share->data_file_type= share->state.header.data_file_type; + share->pack.header_length= 0; + + /* Use new virtual functions for key generation */ + tmp_share= *share; + _ma_setup_functions(&tmp_share); + share->keypos_to_recpos= tmp_share.keypos_to_recpos; + share->recpos_to_keypos= tmp_share.recpos_to_keypos; +} + + +static void change_data_file_descriptor(MARIA_HA *info, File new_file) +{ + mysql_file_close(info->dfile.file, MYF(MY_WME)); + info->dfile.file= info->s->bitmap.file.file= new_file; + _ma_bitmap_reset_cache(info->s); +} + + +/** + @brief Mark the data file to not be used + + @note + This is used in repair when we want to ensure the handler will not + write anything to the data file anymore +*/ + +static void unuse_data_file_descriptor(MARIA_HA *info) +{ + (void) flush_pagecache_blocks(info->s->pagecache, + &info->s->bitmap.file, + FLUSH_IGNORE_CHANGED); + info->dfile.file= 
info->s->bitmap.file.file= -1; + _ma_bitmap_reset_cache(info->s); +} + + +/* + Copy all states that has to do with the data file + + NOTES + This is done to copy the state from the data file generated from + repair to the original handler +*/ + +static void copy_data_file_state(MARIA_STATE_INFO *to, + MARIA_STATE_INFO *from) +{ + to->state.records= from->state.records; + to->state.del= from->state.del; + to->state.empty= from->state.empty; + to->state.data_file_length= from->state.data_file_length; + to->split= from->split; + to->dellink= from->dellink; + to->first_bitmap_with_space= from->first_bitmap_with_space; +} + + +/* + Read 'safely' next record while scanning table. + + SYNOPSIS + _ma_safe_scan_block_record() + info Maria handler + record Store found here + + NOTES + - One must have called mi_scan() before this + + Differences compared to _ma_scan_block_records() are: + - We read all blocks, not only blocks marked by the bitmap to be safe + - In case of errors, next read will read next record. 
+ - More sanity checks + + RETURN + 0 ok + HA_ERR_END_OF_FILE End of file + # error number +*/ + + +static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info, + MARIA_HA *info, uchar *record) +{ + MARIA_SHARE *share= info->s; + MARIA_RECORD_POS record_pos= info->cur_row.nextpos; + pgcache_page_no_t page= sort_info->page; + DBUG_ENTER("_ma_safe_scan_block_record"); + + for (;;) + { + /* Find next row in current page */ + if (likely(record_pos < info->scan.number_of_rows)) + { + uint length, offset; + uchar *data, *end_of_data; + char llbuff[22]; + + while (!(offset= uint2korr(info->scan.dir))) + { + info->scan.dir-= DIR_ENTRY_SIZE; + record_pos++; + if (info->scan.dir < info->scan.dir_end) + { + _ma_check_print_info(sort_info->param, + "Wrong directory on page %s", + llstr(page, llbuff)); + goto read_next_page; + } + } + /* found row */ + info->cur_row.lastpos= info->scan.row_base_page + record_pos; + info->cur_row.nextpos= record_pos + 1; + data= info->scan.page_buff + offset; + length= uint2korr(info->scan.dir + 2); + end_of_data= data + length; + info->scan.dir-= DIR_ENTRY_SIZE; /* Point to previous row */ + + if (end_of_data > info->scan.dir_end || + offset < PAGE_HEADER_SIZE(info->s) || + length < share->base.min_block_length) + { + _ma_check_print_info(sort_info->param, + "Wrong directory entry %3u at page %s", + (uint) record_pos, llstr(page, llbuff)); + record_pos++; + continue; + } + else + { + DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos)); + DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data)); + } + } + +read_next_page: + /* Read until we find next head page */ + for (;;) + { + uint page_type; + char llbuff[22]; + + sort_info->page++; /* In case of errors */ + page++; + if (!(page % share->bitmap.pages_covered)) + { + /* Skip bitmap */ + page++; + sort_info->page++; + } + if ((my_off_t) (page + 1) * share->block_size > sort_info->filelength) + DBUG_RETURN(HA_ERR_END_OF_FILE); + if 
(!(pagecache_read(share->pagecache, + &info->dfile, + page, 0, info->scan.page_buff, + PAGECACHE_READ_UNKNOWN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) + { + if (my_errno == HA_ERR_WRONG_CRC || + my_errno == HA_ERR_DECRYPTION_FAILED) + { + /* + Don't give errors for zero filled blocks. These can + sometimes be found at end of a bitmap when we wrote a big + record last that was moved to the next bitmap. + */ + if (_ma_check_bitmap_data(info, UNALLOCATED_PAGE, 0, + _ma_bitmap_get_page_bits(info, + &share->bitmap, + page))) + { + _ma_check_print_info(sort_info->param, + "Wrong CRC on datapage at %s", + llstr(page, llbuff)); + } + continue; + } + DBUG_RETURN(my_errno); + } + page_type= (info->scan.page_buff[PAGE_TYPE_OFFSET] & + PAGE_TYPE_MASK); + if (page_type == HEAD_PAGE) + { + if ((info->scan.number_of_rows= + (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) != 0) + break; + _ma_check_print_info(sort_info->param, + "Wrong head page at page %s", + llstr(page, llbuff)); + } + else if (page_type >= MAX_PAGE_TYPE) + { + _ma_check_print_info(sort_info->param, + "Found wrong page type: %d at page %s", + page_type, llstr(page, llbuff)); + } + } + + /* New head page */ + info->scan.dir= (info->scan.page_buff + share->block_size - + PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE); + info->scan.dir_end= (info->scan.dir - + (info->scan.number_of_rows - 1) * + DIR_ENTRY_SIZE); + info->scan.row_base_page= ma_recordpos(page, 0); + record_pos= 0; + } +} + + +/** + @brief Writes a LOGREC_REPAIR_TABLE record and updates create_rename_lsn + if needed (so that maria_read_log does not redo the repair). + + @param param description of the REPAIR operation + @param info table + + @return Operation status + @retval 0 ok + @retval 1 error (disk problem) +*/ + +my_bool write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + /* in case this is maria_chk or recovery... 
*/ + if (translog_status == TRANSLOG_OK && !maria_in_recovery && + share->base.born_transactional) + { + my_bool save_now_transactional= share->now_transactional; + + /* + For now this record is only informative. It could serve when applying + logs to a backup, but that needs more thought. Assume table became + corrupted. It is repaired, then some writes happen to it. + Later we restore an old backup, and want to apply this REDO_REPAIR_TABLE + record. For it to give the same result as originally, the table should + be corrupted the same way, so applying previous REDOs should produce the + same corruption; that's really not guaranteed (different execution paths + in execution of REDOs vs runtime code so not same bugs hit, temporary + hardware issues not repeatable etc). Corruption may not be repeatable. + A reasonable solution is to execute the REDO_REPAIR_TABLE record and + check if the checksum of the resulting table matches what it was at the + end of the original repair (should be stored in log record); or execute + the REDO_REPAIR_TABLE if the checksum of the table-before-repair matches + was it was at the start of the original repair (should be stored in log + record). + */ + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[FILEID_STORE_SIZE + 8 + 8]; + LSN lsn; + + /* + testflag gives an idea of what REPAIR did (in particular T_QUICK + or not: did it touch the data file or not?). 
+ */ + int8store(log_data + FILEID_STORE_SIZE, param->testflag); + /* org_key_map is used when recreating index after a load data infile */ + int8store(log_data + FILEID_STORE_SIZE + 8, param->org_key_map); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + share->now_transactional= 1; + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_REPAIR_TABLE, + &dummy_transaction_object, info, + (translog_size_t) sizeof(log_data), + sizeof(log_array)/sizeof(log_array[0]), + log_array, log_data, NULL) || + translog_flush(lsn))) + return TRUE; + /* + The table's existence was made durable earlier (MY_SYNC_DIR passed to + maria_change_to_newfile()). All pages have been flushed, state too, we + need to force it to disk. Old REDOs should not be applied to the table, + which is already enforced as skip_redos_lsn was increased in + protect_against_repair_crash(). But if this is an explicit repair, + even UNDO phase should ignore this table: create_rename_lsn should be + increased, and this also serves for the REDO_REPAIR to be ignored by + maria_read_log. + The fully correct order would be: sync data and index file, remove crash + mark and update LSNs then write state and sync index file. But at this + point state (without crash mark) is already written. + */ + if ((!(param->testflag & T_NO_CREATE_RENAME_LSN) && + _ma_update_state_lsns(share, lsn, share->state.create_trid, FALSE, + FALSE)) || + _ma_sync_table_files(info)) + return TRUE; + share->now_transactional= save_now_transactional; + } + return FALSE; +} + + +/** + Writes an UNDO record which if executed in UNDO phase, will empty the + table. Such record is thus logged only in certain cases of bulk insert + (table needs to be empty etc). 
+*/ +my_bool write_log_record_for_bulk_insert(MARIA_HA *info) +{ + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE]; + LSN lsn; + lsn_store(log_data, info->trn->undo_lsn); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + return translog_write_record(&lsn, LOGREC_UNDO_BULK_INSERT, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data + LSN_STORE_SIZE, NULL) || + translog_flush(lsn); /* WAL */ +} + + +/* Give error message why reading of key page failed; 'position' is a byte offset that is reported as a page number */ + +static void report_keypage_fault(HA_CHECK *param, MARIA_HA *info, + my_off_t position) +{ + char buff[22]; /* sized like the other llstr() buffers here; 11 bytes is too small for a large page number */ + uint32 block_size= info->s->block_size; + + if (my_errno == HA_ERR_CRASHED) + _ma_check_print_error(param, + "Wrong base information on indexpage at page: %s", + llstr(position / block_size, buff)); + else + _ma_check_print_error(param, + "Can't read indexpage from page: %s, " + "error: %d", + llstr(position / block_size, buff), my_errno); +} + + +static void _ma_check_print_not_visible_error(HA_CHECK *param, TrID used_trid) +{ + char buff[22], buff2[22]; + if (!param->not_visible_rows_found++) + { + if (!ma_control_file_inited()) + { + _ma_check_print_warning(param, + "Found row with transaction id %s but no " + "aria_control_file was used or specified. " + "The table may be corrupted", + llstr(used_trid, buff)); + } + else + { + _ma_check_print_error(param, + "Found row with transaction id %s when max " + "transaction id according to aria_control_file " + "is %s", + llstr(used_trid, buff), + llstr(param->max_trid, buff2)); + } + } +} + + +/** + Mark that we can retry normal repair if we used quick repair + + We shouldn't do this in case of disk error as in this case we are likely + to lose much more than expected.
+*/ + +void retry_if_quick(MARIA_SORT_PARAM *sort_param, int error) +{ + HA_CHECK *param=sort_param->sort_info->param; + + if (!sort_param->fix_datafile && error >= HA_ERR_FIRST) + { + param->retry_repair=1; + param->testflag|=T_RETRY_WITHOUT_QUICK; + } +} + +/* Print information about bitmap page */ + +static void print_bitmap_description(MARIA_SHARE *share, + pgcache_page_no_t page, + uchar *bitmap_data) +{ + char *tmp= my_malloc(PSI_INSTRUMENT_ME, MAX_BITMAP_INFO_LENGTH, + MYF(MY_WME | MY_THREADSAFE)); + if (!tmp) + return; + _ma_get_bitmap_description(&share->bitmap, bitmap_data, page, tmp); + printf("Bitmap page %lu\n%s", (ulong) page, tmp); + my_free(tmp); +} diff --git a/storage/maria/ma_check.h b/storage/maria/ma_check.h new file mode 100644 index 00000000..fa78ada6 --- /dev/null +++ b/storage/maria/ma_check.h @@ -0,0 +1,36 @@ +/* Copyright (C) 2019, 2022, MariaDB Corporation AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + MA 02110-1335 USA +*/ + +/* + Types that are different in Aria from those used by MyISAM check tables + in myisamchk.h +*/ + +struct st_sort_key_blocks /* Used when sorting */ +{ + uchar *buff, *end_pos; + uchar lastkey[MARIA_MAX_POSSIBLE_KEY_BUFF]; + uint last_length; + int inited; +}; + +struct st_sort_ftbuf +{ + uchar *buf, *end; + int count; + uchar lastkey[MARIA_MAX_KEY_BUFF]; +}; diff --git a/storage/maria/ma_check_standalone.h b/storage/maria/ma_check_standalone.h new file mode 100644 index 00000000..9442800a --- /dev/null +++ b/storage/maria/ma_check_standalone.h @@ -0,0 +1,163 @@ +/* Copyright (C) 2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include <my_check_opt.h> + +/* almost every standalone maria program will need it */ +void _mi_report_crashed(void *file __attribute__((unused)), + const char *message __attribute__((unused)), + const char *sfile __attribute__((unused)), + uint sline __attribute__((unused))) +{ +} + +static unsigned int no_key(unsigned int not_used __attribute__((unused))) +{ + return ENCRYPTION_KEY_VERSION_INVALID; +} + +struct encryption_service_st encryption_handler= +{ + no_key, 0, 0, 0, 0, 0, 0 +}; + +int encryption_scheme_encrypt(const unsigned char* src __attribute__((unused)), + unsigned int slen __attribute__((unused)), + unsigned char* dst __attribute__((unused)), + unsigned int* dlen __attribute__((unused)), + struct st_encryption_scheme *scheme __attribute__((unused)), + unsigned int key_version __attribute__((unused)), + unsigned int i32_1 __attribute__((unused)), + unsigned int i32_2 __attribute__((unused)), + unsigned long long i64 __attribute__((unused))) +{ + return -1; +} + + +int encryption_scheme_decrypt(const unsigned char* src __attribute__((unused)), + unsigned int slen __attribute__((unused)), + unsigned char* dst __attribute__((unused)), + unsigned int* dlen __attribute__((unused)), + struct st_encryption_scheme *scheme __attribute__((unused)), + unsigned int key_version __attribute__((unused)), + unsigned int i32_1 __attribute__((unused)), + unsigned int i32_2 __attribute__((unused)), + unsigned long long i64 __attribute__((unused))) +{ + return -1; +} + +/* only those that included myisamchk.h may need and can use the below */ +#ifdef _myisamchk_h +/* + All standalone programs which need to use functions from ma_check.c + (like maria_repair()) must define their version of _ma_killed_ptr() + and _ma_check_print_info|warning|error(). 
Indeed, linking with ma_check.o + brings in the dependencies of ma_check.o which are definitions of the above + functions; if the program does not define them then the ones of + ha_maria.o are used i.e. ha_maria.o is linked into the program, and this + brings dependencies of ha_maria.o on mysqld.o into the program's linking + which thus fails, as the program is not linked with mysqld.o. + This file contains the versions of these functions used by maria_chk and + maria_read_log. +*/ + +/* + Check if check/repair operation was killed by a signal +*/ + +int _ma_killed_ptr(HA_CHECK *param __attribute__((unused))) +{ + return 0; +} + + +void _ma_report_progress(HA_CHECK *param __attribute__((unused)), + ulonglong progress __attribute__((unused)), + ulonglong max_progress __attribute__((unused))) +{ +} + + /* print warnings and errors */ + /* VARARGS */ + +void _ma_check_print_info(HA_CHECK *param __attribute__((unused)), + const char *fmt,...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_info"); + DBUG_PRINT("enter", ("format: %s", fmt)); + + va_start(args,fmt); + vfprintf(stdout, fmt, args); + fputc('\n',stdout); + va_end(args); + DBUG_VOID_RETURN; +} + +/* VARARGS */ + +void _ma_check_print_warning(HA_CHECK *param, const char *fmt,...) +{ + va_list args; + DBUG_ENTER("_ma_check_print_warning"); + DBUG_PRINT("enter", ("format: %s", fmt)); + + fflush(stdout); + if (!param->warning_printed && !param->error_printed) + { + if (param->testflag & T_SILENT) + fprintf(stderr,"%s: Aria file %s\n",my_progname_short, + param->isam_file_name); + param->out_flag|= O_DATA_LOST; + } + param->warning_printed++; + va_start(args,fmt); + fprintf(stderr,"%s: warning: ",my_progname_short); + vfprintf(stderr, fmt, args); + fputc('\n',stderr); + fflush(stderr); + va_end(args); + DBUG_VOID_RETURN; +} + +/* VARARGS */ + +void _ma_check_print_error(HA_CHECK *param, const char *fmt,...) 
+{ + va_list args; + DBUG_ENTER("_ma_check_print_error"); + DBUG_PRINT("enter", ("format: %s", fmt)); + + fflush(stdout); + if (!param->warning_printed && !param->error_printed) + { + if (param->testflag & T_SILENT) + fprintf(stderr,"%s: Aria file %s\n",my_progname_short,param->isam_file_name); + param->out_flag|= O_DATA_LOST; + } + param->error_printed++; + va_start(args,fmt); + fprintf(stderr,"%s: error: ",my_progname_short); + vfprintf(stderr, fmt, args); + fputc('\n',stderr); + fflush(stderr); + va_end(args); + DBUG_VOID_RETURN; +} + +#endif + diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c new file mode 100644 index 00000000..2741f54d --- /dev/null +++ b/storage/maria/ma_checkpoint.c @@ -0,0 +1,1254 @@ +/* Copyright (C) 2006,2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + WL#3071 Maria checkpoint + First version written by Guilhem Bichot on 2006-04-27. +*/ + +/* Here is the implementation of this module */ + +/** @todo RECOVERY BUG this is unreviewed code */ +/* + Summary: + checkpoints are done either by a background thread (checkpoint every Nth + second) or by a client. + In ha_maria, it's not made available to clients, and will soon be done by a + background thread (periodically taking checkpoints and flushing dirty + pages). 
+*/ + +#include "maria_def.h" +#include "ma_pagecache.h" +#include "ma_blockrec.h" +#include "ma_checkpoint.h" +#include "ma_loghandler_lsn.h" +#include "ma_servicethread.h" +#include "ma_crypt.h" + +/** @brief type of checkpoint currently running */ +static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE; +/** @brief protects checkpoint_in_progress */ +static mysql_mutex_t LOCK_checkpoint; +/** @brief for killing the background checkpoint thread */ +static mysql_cond_t COND_checkpoint; +/** @brief control structure for checkpoint background thread */ +static MA_SERVICE_THREAD_CONTROL checkpoint_control= + {0, FALSE, FALSE, &LOCK_checkpoint, &COND_checkpoint}; +/* is ulong like pagecache->blocks_changed */ +static uint pages_to_flush_before_next_checkpoint; +static PAGECACHE_FILE *dfiles, /**< data files to flush in background */ + *dfiles_end; /**< list of data files ends here */ +static PAGECACHE_FILE *kfiles, /**< index files to flush in background */ + *kfiles_end; /**< list of index files ends here */ +/* those two statistics below could serve in SHOW GLOBAL STATUS */ +static uint checkpoints_total= 0, /**< all checkpoint requests made */ + checkpoints_ok_total= 0; /**< all checkpoints which succeeded */ + +struct st_filter_param +{ + LSN up_to_lsn; /**< only pages with rec_lsn < this LSN */ + uint max_pages; /**< stop after flushing this number pages */ +}; /**< information to determine which dirty pages should be flushed */ + +static enum pagecache_flush_filter_result +filter_flush_file_medium(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); +static enum pagecache_flush_filter_result +filter_flush_file_full(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); +static enum pagecache_flush_filter_result +filter_flush_file_evenly(enum pagecache_page_type type, + pgcache_page_no_t pageno, + LSN rec_lsn, void *arg); +static int really_execute_checkpoint(void); +pthread_handler_t 
ma_checkpoint_background(void *arg); +static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon); + +/** + @brief Does a checkpoint + + @param level what level of checkpoint to do + @param no_wait if another checkpoint of same or stronger level + is already running, consider our job done + + @note In ha_maria, there can never be two threads trying a checkpoint at + the same time. + + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait) +{ + int result= 0; + DBUG_ENTER("ma_checkpoint_execute"); + + if (!checkpoint_control.inited) + { + /* + If ha_maria failed to start, maria_panic_hton is called, we come here. + */ + DBUG_RETURN(0); + } + DBUG_ASSERT(level > CHECKPOINT_NONE); + + /* look for already running checkpoints */ + mysql_mutex_lock(&LOCK_checkpoint); + while (checkpoint_in_progress != CHECKPOINT_NONE) + { + if (no_wait && (checkpoint_in_progress >= level)) + { + /* + If we are the checkpoint background thread, we don't wait (it's + smarter to flush pages instead of waiting here while the other thread + finishes its checkpoint). + */ + mysql_mutex_unlock(&LOCK_checkpoint); + goto end; + } + mysql_cond_wait(&COND_checkpoint, &LOCK_checkpoint); + } + + checkpoint_in_progress= level; + mysql_mutex_unlock(&LOCK_checkpoint); + /* from then on, we are sure to be and stay the only checkpointer */ + + result= really_execute_checkpoint(); + DBUG_EXECUTE_IF("maria_crash_after_checkpoint", + { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); }); + + mysql_cond_broadcast(&COND_checkpoint); +end: + DBUG_RETURN(result); +} + + +/** + @brief Does a checkpoint, really; expects no other checkpoints + running. + + Checkpoint level requested is read from checkpoint_in_progress. 
+ + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +static int really_execute_checkpoint(void) +{ + uint i, error= 0; + int error_errno= 0; + /** @brief checkpoint_start_log_horizon will be stored there */ + char *ptr; + const char *error_place= 0; + LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */ + LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn; + TRANSLOG_ADDRESS checkpoint_start_log_horizon; + char checkpoint_start_log_horizon_char[LSN_STORE_SIZE]; + DBUG_ENTER("really_execute_checkpoint"); + DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress)); + bzero(&record_pieces, sizeof(record_pieces)); + + /* + STEP 1: record current end-of-log position using log's lock. It is + critical for the correctness of Checkpoint (related to memory visibility + rules, the log's lock is a mutex). + "Horizon" is a lower bound of the LSN of the next log record. + */ + checkpoint_start_log_horizon= translog_get_horizon(); + DBUG_PRINT("info",("checkpoint_start_log_horizon " LSN_FMT "", + LSN_IN_PARTS(checkpoint_start_log_horizon))); + lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon); + + /* + STEP 2: fetch information about transactions. + We must fetch transactions before dirty pages. Indeed, a transaction + first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn + to 0. If we fetched pages first, we may see no dirty page yet, then we + fetch transactions but the transaction has already reset its rec_lsn to 0 + so we miss rec_lsn again. + For a similar reason (over-allocated bitmap pages) we have to fetch + transactions before flushing bitmap pages. + + min_trn_rec_lsn will serve to lower the starting point of the REDO phase + (down from checkpoint_start_log_horizon). 
+ */ + if (unlikely(trnman_collect_transactions(&record_pieces[0], + &record_pieces[1], + &min_trn_rec_lsn, + &min_first_undo_lsn))) + { + error_place= "trnman_collect_transaction"; + goto err; + } + + + /* STEP 3: fetch information about table files */ + if (unlikely(collect_tables(&record_pieces[2], + checkpoint_start_log_horizon))) + { + error_place= "collect_tables"; + goto err; + } + + + /* STEP 4: fetch information about dirty pages */ + /* + It's better to do it _after_ having flushed some data pages (which + collect_tables() may have done), because those are now non-dirty and so we + have a more up-to-date dirty pages list to put into the checkpoint record, + and thus we will have less work at Recovery. + */ + /* Using default pagecache for now */ + if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache, + &record_pieces[3], + &min_page_rec_lsn))) + { + error_place= "collect_pages"; + goto err; + } + + + /* LAST STEP: now write the checkpoint log record */ + { + LSN lsn; + translog_size_t total_rec_length; + /* + the log handler is allowed to modify "str" and "length" (but not "*str") + of its argument, so we must not pass it record_pieces directly, + otherwise we would later not know what memory pieces to my_free(). 
+ */ + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5]; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= + (uchar*) checkpoint_start_log_horizon_char; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length= + sizeof(checkpoint_start_log_horizon_char); + for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++) + { + log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].str= (uchar*)record_pieces[i].str; + log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].length= record_pieces[i].length; + total_rec_length+= (translog_size_t) record_pieces[i].length; + } + if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT, + &dummy_transaction_object, NULL, + total_rec_length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) || + translog_flush(lsn))) + { + error_place= "translog_write_record"; + goto err; + } + translog_lock(); + /* + This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because + such hook would be called before translog_flush (and we must be sure + that log was flushed before we write to the control file). + */ + if (unlikely(ma_control_file_write_and_force(lsn, last_logno, + max_trid_in_control_file, + recovery_failures))) + { + error_place= "ma_control_file_write"; + error_errno= my_errno; + translog_unlock(); + goto err; + } + translog_unlock(); + } + + /* + Note that we should not alter memory structures until we have successfully + written the checkpoint record and control file. + */ + /* checkpoint succeeded */ + ptr= record_pieces[3].str; + pages_to_flush_before_next_checkpoint= uint4korr(ptr); + DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint", + pages_to_flush_before_next_checkpoint)); + + /* compute log's low-water mark */ + { + TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn; + set_if_smaller(log_low_water_mark, min_trn_rec_lsn); + set_if_smaller(log_low_water_mark, min_first_undo_lsn); + set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon); + /** + Now purge unneeded logs. 
+ As some systems have an unreliable fsync (drive lying), we could try to + be robust against that: remember a few previous checkpoints in the + control file, and not purge logs immediately... Think about it. + */ + if (translog_purge(log_low_water_mark)) + ma_message_no_user(0, "log purging failed"); + } + + goto end; + +err: + error= 1; + my_printf_error(HA_ERR_GENERIC, "Aria engine: checkpoint failed at %s with " + "error %d", MYF(ME_ERROR_LOG), + error_place, (error_errno ? error_errno : my_errno)); + /* we were possibly not able to determine what pages to flush */ + pages_to_flush_before_next_checkpoint= 0; + +end: + for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++) + my_free(record_pieces[i].str); + mysql_mutex_lock(&LOCK_checkpoint); + checkpoint_in_progress= CHECKPOINT_NONE; + checkpoints_total++; + checkpoints_ok_total+= !error; + mysql_mutex_unlock(&LOCK_checkpoint); + DBUG_RETURN(error); +} + + +/** + @brief Initializes the checkpoint module + + @param interval If one wants the module to create a + thread which will periodically do + checkpoints, and flush dirty pages, in the + background, it should specify a non-zero + interval in seconds. The thread will then be + created and will take checkpoints separated by + approximately 'interval' second. + + @note A checkpoint is taken only if there has been some significant + activity since the previous checkpoint. Between checkpoint N and N+1 the + thread flushes all dirty pages which were already dirty at the time of + checkpoint N. 
+ + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +int ma_checkpoint_init(ulong interval) +{ + int res= 0; + DBUG_ENTER("ma_checkpoint_init"); + if (ma_service_thread_control_init(&checkpoint_control)) + res= 1; + else if (interval > 0) + { + size_t intv= interval; + compile_time_assert(sizeof(void *) >= sizeof(ulong)); + if ((res= mysql_thread_create(key_thread_checkpoint, + &checkpoint_control.thread, NULL, + ma_checkpoint_background, + (void*) intv))) + checkpoint_control.killed= TRUE; + } + else + checkpoint_control.killed= TRUE; + DBUG_RETURN(res); +} + + +#ifndef DBUG_OFF +/** + Function used to test recovery: flush some table pieces and then caller + crashes. + + @param what_to_flush 0: current bitmap and all data pages + 1: state + 2: all bitmap pages +*/ +static void flush_all_tables(int what_to_flush) +{ + int res= 0; + LIST *pos; /**< to iterate over open tables */ + mysql_mutex_lock(&THR_LOCK_maria); + for (pos= maria_open_list; pos; pos= pos->next) + { + MARIA_HA *info= (MARIA_HA*)pos->data; + if (info->s->now_transactional) + { + switch (what_to_flush) + { + case 0: + res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_KEEP, FLUSH_KEEP); + break; + case 1: + res= _ma_state_info_write(info->s, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET| + MA_STATE_INFO_WRITE_LOCK); + DBUG_PRINT("maria_flush_states", + ("is_of_horizon: LSN " LSN_FMT, + LSN_IN_PARTS(info->s->state.is_of_horizon))); + break; + case 2: + res= _ma_bitmap_flush_all(info->s); + break; + } + } + DBUG_ASSERT(res == 0); + } + mysql_mutex_unlock(&THR_LOCK_maria); +} +#endif + + +/** + @brief Destroys the checkpoint module +*/ + +void ma_checkpoint_end(void) +{ + DBUG_ENTER("ma_checkpoint_end"); + /* + Some intentional crash methods, usually triggered by + SET MARIA_CHECKPOINT_INTERVAL=X + */ + DBUG_EXECUTE_IF("maria_flush_bitmap", + { + DBUG_PRINT("maria_flush_bitmap", ("now")); + flush_all_tables(2); + }); + 
DBUG_EXECUTE_IF("maria_flush_whole_page_cache", + { + DBUG_PRINT("maria_flush_whole_page_cache", ("now")); + flush_all_tables(0); + }); + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + /* + Note that for WAL reasons, maria_flush_states requires + maria_flush_whole_log. + */ + DBUG_EXECUTE_IF("maria_flush_states", + { + DBUG_PRINT("maria_flush_states", ("now")); + flush_all_tables(1); + }); + DBUG_EXECUTE_IF("maria_crash", + { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); }); + + if (checkpoint_control.inited) + { + ma_service_thread_control_end(&checkpoint_control); + my_free(dfiles); + my_free(kfiles); + dfiles= kfiles= NULL; + } + DBUG_VOID_RETURN; +} + + +/** + @brief dirty-page filtering criteria for MEDIUM checkpoint. + + We flush data/index pages which have been dirty since the previous + checkpoint (this is the two-checkpoint rule: the REDO phase will not have + to start from earlier than the next-to-last checkpoint). + Bitmap pages are handled by _ma_bitmap_flush_all(). + + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param +*/ + +static enum pagecache_flush_filter_result +filter_flush_file_medium(enum pagecache_page_type type, + pgcache_page_no_t pageno __attribute__ ((unused)), + LSN rec_lsn, void *arg) +{ + struct st_filter_param *param= (struct st_filter_param *)arg; + return (type == PAGECACHE_LSN_PAGE) && + (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0); +} + + +/** + @brief dirty-page filtering criteria for FULL checkpoint. + + We flush all dirty data/index pages. + Bitmap pages are handled by _ma_bitmap_flush_all(). 
+ + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param +*/ + +static enum pagecache_flush_filter_result +filter_flush_file_full(enum pagecache_page_type type, + pgcache_page_no_t pageno __attribute__ ((unused)), + LSN rec_lsn __attribute__ ((unused)), + void *arg __attribute__ ((unused))) +{ + return (type == PAGECACHE_LSN_PAGE); +} + + +/** + @brief dirty-page filtering criteria for background flushing thread. + + We flush data/index pages which have been dirty since the previous + checkpoint (this is the two-checkpoint rule: the REDO phase will not have + to start from earlier than the next-to-last checkpoint), and no + bitmap pages. But we flush no more than a certain number of pages (to have + an even flushing, no write burst). + The reason to not flush bitmap pages is that they may not be in a flushable + state at this moment and we don't want to wait for them. + + @param type Page's type + @param pageno Page's number + @param rec_lsn Page's rec_lsn + @param arg filter_param +*/ + +static enum pagecache_flush_filter_result +filter_flush_file_evenly(enum pagecache_page_type type, + pgcache_page_no_t pageno __attribute__ ((unused)), + LSN rec_lsn, void *arg) +{ + struct st_filter_param *param= (struct st_filter_param *)arg; + if (unlikely(param->max_pages == 0)) /* all flushed already */ + return FLUSH_FILTER_SKIP_ALL; + if ((type == PAGECACHE_LSN_PAGE) && + (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0)) + { + param->max_pages--; + return FLUSH_FILTER_OK; + } + return FLUSH_FILTER_SKIP_TRY_NEXT; +} + + +/** + @brief Background thread which does checkpoints and flushes periodically. + + Takes a checkpoint. After this, all pages dirty at the time of that + checkpoint are flushed evenly until it is time to take another checkpoint. + This ensures that the REDO phase starts at earliest (in LSN time) at the + next-to-last checkpoint record ("two-checkpoint rule"). 
+ + @note MikaelR questioned why the same thread does two different jobs, the + risk could be that while a checkpoint happens no LRD flushing happens. +*/ + +static ulong maria_checkpoint_min_cache_activity= 10*1024*1024; +/* Set in ha_maria.cc */ +ulong maria_checkpoint_min_log_activity= 1*1024*1024; + +pthread_handler_t ma_checkpoint_background(void *arg) +{ + /** @brief At least this of log/page bytes written between checkpoints */ + /* + If the interval could be changed by the user while we are in this thread, + it could be annoying: for example it could cause "case 2" to be executed + right after "case 0", thus having 'dfile' unset. So the thread cares only + about the interval's value when it started. + */ + const size_t interval= (size_t)arg; + size_t sleeps, sleep_time; + TRANSLOG_ADDRESS log_horizon_at_last_checkpoint= + translog_get_horizon(); + ulonglong pagecache_flushes_at_last_checkpoint= + maria_pagecache->global_cache_write; + uint UNINIT_VAR(pages_bunch_size); + struct st_filter_param filter_param; + PAGECACHE_FILE *UNINIT_VAR(dfile); /**< data file currently being flushed */ + PAGECACHE_FILE *UNINIT_VAR(kfile); /**< index file currently being flushed */ + + my_thread_init(); + DBUG_PRINT("info",("Maria background checkpoint thread starts")); + DBUG_ASSERT(interval > 0); + + PSI_CALL_set_thread_account(0,0,0,0); + + /* + Recovery ended with all tables closed and a checkpoint: no need to take + one immediately. 
+ */ + sleeps= 1; + pages_to_flush_before_next_checkpoint= 0; + + for(;;) /* iterations of checkpoints and dirty page flushing */ + { +#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */ + sleeps=0; +#endif + switch (sleeps % interval) + { + case 0: + { + /* If checkpoints are disabled, wait 1 second and try again */ + if (maria_checkpoint_disabled) + { + sleep_time= 1; + break; + } + { + TRANSLOG_ADDRESS horizon= translog_get_horizon(); + + /* + With background flushing evenly distributed over the time + between two checkpoints, we should have only little flushing to do + in the checkpoint. + */ + /* + No checkpoint if little work of interest for recovery was done + since last checkpoint. Such work includes log writing (lengthens + recovery, checkpoint would shorten it), page flushing (checkpoint + would decrease the amount of read pages in recovery). + In case of one short statement per minute (very low load), we don't + want to checkpoint every minute, hence the positive + maria_checkpoint_min_activity. + */ + if ((ulonglong) (horizon - log_horizon_at_last_checkpoint) <= + maria_checkpoint_min_log_activity && + ((ulonglong) (maria_pagecache->global_cache_write - + pagecache_flushes_at_last_checkpoint) * + maria_pagecache->block_size) <= + maria_checkpoint_min_cache_activity) + { + /* + Not enough has happend since last checkpoint. + Sleep for a while and try again later + */ + sleep_time= interval; + break; + } + sleep_time= 1; + ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE); + /* + Snapshot this kind of "state" of the engine. Note that the value + below is possibly greater than last_checkpoint_lsn. + */ + log_horizon_at_last_checkpoint= translog_get_horizon(); + pagecache_flushes_at_last_checkpoint= + maria_pagecache->global_cache_write; + /* + If the checkpoint above succeeded it has set d|kfiles and + d|kfiles_end. 
If is has failed, it has set + pages_to_flush_before_next_checkpoint to 0 so we will skip flushing + and sleep until the next checkpoint. + */ + } + break; + } + case 1: + /* set up parameters for background page flushing */ + filter_param.up_to_lsn= last_checkpoint_lsn; + pages_bunch_size= pages_to_flush_before_next_checkpoint / (uint)interval; + dfile= dfiles; + kfile= kfiles; + /* fall through */ + default: + if (pages_bunch_size > 0) + { + DBUG_PRINT("checkpoint", + ("Maria background checkpoint thread: %u pages", + pages_bunch_size)); + /* flush a bunch of dirty pages */ + filter_param.max_pages= pages_bunch_size; + while (dfile != dfiles_end) + { + /* + We use FLUSH_KEEP_LAZY: if a file is already in flush, it's + smarter to move to the next file than wait for this one to be + completely flushed, which may take long. + StaleFilePointersInFlush: notice how below we use "dfile" which + is an OS file descriptor plus some function and MARIA_SHARE + pointers; this data dates from a previous checkpoint; since then, + the table may have been closed (so MARIA_SHARE* became stale), and + the file descriptor reassigned to another table which does not + have the same CRC-read-set callbacks: it is thus important that + flush_pagecache_blocks_with_filter() does not use the pointers, + only the OS file descriptor. + */ + int res= + flush_pagecache_blocks_with_filter(maria_pagecache, + dfile, FLUSH_KEEP_LAZY, + filter_flush_file_evenly, + &filter_param); + if (unlikely(res & PCFLUSH_ERROR)) + ma_message_no_user(0, "background data page flush failed"); + if (filter_param.max_pages == 0) /* bunch all flushed, sleep */ + break; /* and we will continue with the same file */ + dfile++; /* otherwise all this file is flushed, move to next file */ + /* + MikaelR noted that he observed that Linux's file cache may never + fsync to disk until this cache is full, at which point it decides + to empty the cache, making the machine very slow. A solution was + to fsync after writing 2 MB. 
So we might want to fsync() here if + we wrote enough pages. + */ + } + while (kfile != kfiles_end) + { + int res= + flush_pagecache_blocks_with_filter(maria_pagecache, + kfile, FLUSH_KEEP_LAZY, + filter_flush_file_evenly, + &filter_param); + if (unlikely(res & PCFLUSH_ERROR)) + ma_message_no_user(0, "background index page flush failed"); + if (filter_param.max_pages == 0) /* bunch all flushed, sleep */ + break; /* and we will continue with the same file */ + kfile++; /* otherwise all this file is flushed, move to next file */ + } + sleep_time= 1; + } + else + { + /* Can directly sleep until the next checkpoint moment */ + sleep_time= interval - (sleeps % interval); + } + } + if (my_service_thread_sleep(&checkpoint_control, + sleep_time * 1000000000ULL)) + break; + sleeps+= sleep_time; + } + DBUG_PRINT("info",("Maria background checkpoint thread ends")); + { + CHECKPOINT_LEVEL level= CHECKPOINT_FULL; + /* + That's the final one, which guarantees that a clean shutdown always ends + with a checkpoint. + */ + DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;); + ma_checkpoint_execute(level, FALSE); + } + my_thread_end(); + return 0; +} + + +/** + @brief Allocates buffer and stores in it some info about open tables, + does some flushing on those. + + Does the allocation because the caller cannot know the size itself. + Memory freeing is to be done by the caller (if the "str" member of the + LEX_STRING is not NULL). + The caller is taking a checkpoint. + + @param[out] str pointer to where the allocated buffer, + and its size, will be put; buffer will be filled + with info about open tables + @param checkpoint_start_log_horizon Of the in-progress checkpoint + record. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) +{ + MARIA_SHARE **distinct_shares= NULL; + char *ptr; + uint error= 1, sync_error= 0, nb, nb_stored, i; + my_bool unmark_tables= TRUE; + size_t total_names_length; + LIST *pos; /**< to iterate over open tables */ + struct st_state_copy { + uint index; + MARIA_STATE_INFO state; + }; + struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */ + *state_copies_end, /**< cache ends here */ + *state_copy; /**< iterator in cache */ + TRANSLOG_ADDRESS UNINIT_VAR(state_copies_horizon); /**< horizon of states' _copies_ */ + struct st_filter_param filter_param; + PAGECACHE_FLUSH_FILTER filter; + DBUG_ENTER("collect_tables"); + + /* let's make a list of distinct shares */ + mysql_mutex_lock(&THR_LOCK_maria); + for (nb= 0, pos= maria_open_list; pos; pos= pos->next) + { + MARIA_HA *info= (MARIA_HA*)pos->data; + MARIA_SHARE *share= info->s; + /* the first three variables below can never change */ + if (share->base.born_transactional && !share->temporary && + share->mode != O_RDONLY && + !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)) + { + /* + Apart from us, only maria_close() reads/sets in_checkpoint but cannot + run now as we hold THR_LOCK_maria. + */ + /* + This table is relevant for checkpoint and not already seen. Mark it, + so that it is not seen again in the loop. + */ + nb++; + DBUG_ASSERT(share->in_checkpoint == 0); + /* This flag ensures that we count only _distinct_ shares. 
*/ + share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP; + } + } + if (unlikely((distinct_shares= + (MARIA_SHARE **)my_malloc(PSI_INSTRUMENT_ME, nb * sizeof(MARIA_SHARE *), + MYF(MY_WME))) == NULL)) + goto err; + for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next) + { + MARIA_HA *info= (MARIA_HA*)pos->data; + MARIA_SHARE *share= info->s; + if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP) + { + distinct_shares[i++]= share; + /* + With this we prevent the share from going away while we later flush + and force it without holding THR_LOCK_maria. For example if the share + could be my_free()d by maria_close() we would have a problem when we + access it to flush the table. We "pin" the share pointer. + And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is + not seen again in the loop. + */ + share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME; + total_names_length+= share->open_file_name.length; + } + } + + DBUG_ASSERT(i == nb); + mysql_mutex_unlock(&THR_LOCK_maria); + DBUG_PRINT("info",("found %u table shares", nb)); + + str->length= + 4 + /* number of tables */ + (2 + /* short id */ + LSN_STORE_SIZE + /* first_log_write_at_lsn */ + 1 /* end-of-name 0 */ + ) * nb + total_names_length; + if (unlikely((str->str= my_malloc(PSI_INSTRUMENT_ME, str->length, MYF(MY_WME))) == NULL)) + goto err; + + ptr= str->str; + ptr+= 4; /* real number of stored tables is not yet know */ + + /* only possible checkpointer, so can do the read below without mutex */ + filter_param.up_to_lsn= last_checkpoint_lsn; + switch(checkpoint_in_progress) + { + case CHECKPOINT_MEDIUM: + filter= &filter_flush_file_medium; + break; + case CHECKPOINT_FULL: + filter= &filter_flush_file_full; + break; + case CHECKPOINT_INDIRECT: + filter= NULL; + break; + default: + DBUG_ASSERT(0); + goto err; + } + + /* + The principle of reading/writing the state below is explained in + ma_recovery.c, look for "Recovery of the state". 
+ */ +#define STATE_COPIES 1024 + state_copies= (struct st_state_copy *) + my_malloc(PSI_INSTRUMENT_ME, STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME)); + dfiles= (PAGECACHE_FILE *)my_realloc(PSI_INSTRUMENT_ME, (uchar *)dfiles, + /* avoid size of 0 for my_realloc */ + MY_MAX(1, nb) * sizeof(PAGECACHE_FILE), + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); + kfiles= (PAGECACHE_FILE *)my_realloc(PSI_INSTRUMENT_ME, (uchar *)kfiles, + /* avoid size of 0 for my_realloc */ + MY_MAX(1, nb) * sizeof(PAGECACHE_FILE), + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); + if (unlikely((state_copies == NULL) || + (dfiles == NULL) || (kfiles == NULL))) + goto err; + state_copy= state_copies_end= NULL; + dfiles_end= dfiles; + kfiles_end= kfiles; + + for (nb_stored= 0, i= 0; i < nb; i++) + { + MARIA_SHARE *share= distinct_shares[i]; + PAGECACHE_FILE kfile, dfile; + my_bool ignore_share; + if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)) + { + /* + No need for a mutex to read the above, only us can write *this* bit of + the in_checkpoint bitmap + */ + continue; + } + /** + @todo We should not look at tables which didn't change since last + checkpoint. + */ + DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str)); + if (state_copy == state_copies_end) /* we have no more cached states */ + { + /* + Collect and cache a bunch of states. We do this for many states at a + time, to not lock/unlock the log's lock too often. 
+ */ + uint j, bound= MY_MIN(nb, i + STATE_COPIES); + state_copy= state_copies; + /* part of the state is protected by log's lock */ + translog_lock(); + state_copies_horizon= translog_get_horizon_no_lock(); + for (j= i; j < bound; j++) + { + MARIA_SHARE *share2= distinct_shares[j]; + if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)) + continue; + state_copy->index= j; + state_copy->state= share2->state; /* we copy the state */ + state_copy++; + /* + data_file_length is not updated under log's lock by the bitmap + code, but writing a wrong data_file_length is ok: a next + maria_close() will correct it; if we crash before, Recovery will + set it to the true physical size. + */ + } + translog_unlock(); + if (state_copy == state_copies) + break; /* Nothing to do */ + + /** + We are going to flush these states. + Before, all records describing how to undo such state must be + in the log (WAL). Usually this means UNDOs. In the special case of + data|key_file_length, recovery just needs to open the table to fix the + length, so any LOGREC_FILE_ID/REDO/UNDO allowing recovery to + understand it must open a table, is enough; so as long as + data|key_file_length is updated after writing any log record it's ok: + if we copied new value above, it means the record was before + state_copies_horizon and we flush such record below. + Apart from data|key_file_length which are easily recoverable from the + real file's size, all other state members must be updated only when + writing the UNDO; otherwise, if updated before, if their new value is + flushed by a checkpoint and there is a crash before UNDO is written, + their REDO group will be missing or at least incomplete and skipped + by recovery, so bad state value will stay. For example, setting + key_root before writing the UNDO: the table would have old index + pages (they were pinned at time of crash) and a new, thus wrong, + key_root. + @todo RECOVERY BUG check that all code honours that. 
+ */ + if (translog_flush(state_copies_horizon)) + goto err; + /* now we have cached states and they are WAL-safe*/ + state_copies_end= state_copy-1; + state_copy= state_copies; + } + + /* locate our state among these cached ones */ + for ( ; state_copy->index != i; state_copy++) + DBUG_ASSERT(state_copy <= state_copies_end); + + /* OS file descriptors are ints which we stored in 4 bytes */ + compile_time_assert(sizeof(int) <= 4); + /* + Protect against maria_close() (which does some memory freeing in + MARIA_FILE_BITMAP) with close_lock. intern_lock is not + sufficient as we, as well as maria_close(), are going to unlock + intern_lock in the middle of manipulating the table. Serializing us and + maria_close() should help avoid problems. + */ + mysql_mutex_lock(&share->close_lock); + mysql_mutex_lock(&share->intern_lock); + /* + Tables in a normal state have their two file descriptors open. + In some rare cases like REPAIR, some descriptor may be closed or even + -1. If that happened, the _ma_state_info_write() may fail. This is + prevented by enclosing all all places which close/change kfile.file with + intern_lock. + */ + kfile= share->kfile; + dfile= share->bitmap.file; + /* + Ignore table which has no logged writes (all its future log records will + be found naturally by Recovery). Ignore obsolete shares (_before_ + setting themselves to last_version=0 they already did all flush and + sync; if we flush their state now we may be flushing an obsolete state + onto a newer one (assuming the table has been reopened with a different + share but of course same physical index file). 
+ */ + ignore_share= (share->id == 0) | (share->last_version == 0); + DBUG_PRINT("info", ("ignore_share: %d", ignore_share)); + if (!ignore_share) + { + size_t open_file_name_len= share->open_file_name.length + 1; + /* remember the descriptors for background flush */ + *(dfiles_end++)= dfile; + *(kfiles_end++)= kfile; + /* we will store this table in the record */ + nb_stored++; + int2store(ptr, share->id); + ptr+= 2; + lsn_store(ptr, share->lsn_of_file_id); + ptr+= LSN_STORE_SIZE; + /* + first_bitmap_with_space is not updated under log's lock, and is + important. We would need the bitmap's lock to get it right. Recovery + of this is not clear, so we just play safe: write it out as + unknown: if crash, _ma_bitmap_init() at next open (for example in + Recovery) will convert it to 0 and thus the first insertion will + search for free space from the file's first bitmap (0) - + under-optimal but safe. + If no crash, maria_close() will write the exact value. + */ + state_copy->state.first_bitmap_with_space= ~(ulonglong)0; + memcpy(ptr, share->open_file_name.str, open_file_name_len); + ptr+= open_file_name_len; + if (cmp_translog_addr(share->state.is_of_horizon, + checkpoint_start_log_horizon) >= 0) + { + /* + State was flushed recently, it does not hold down the log's + low-water mark and will not give avoidable work to Recovery. So we + needn't flush it. Also, it is possible that while we copied the + state above (under log's lock, without intern_lock) it was being + modified in memory or flushed to disk (without log's lock, under + intern_lock, like in maria_extra()), so our copy may be incorrect + and we should not flush it. + It may also be a share which got last_version==0 since we checked + last_version; in this case, it flushed its state and the LSN test + above will catch it. + */ + } + else + { + /* + We could do the state flush only if share->changed, but it's + tricky. 
+ Consider a maria_write() which has written REDO,UNDO, and before it + calls _ma_writeinfo() (setting share->changed=1), checkpoint + happens and sees share->changed=0, does not flush state. It is + possible that Recovery does not start from before the REDO and thus + the state is not recovered. A solution may be to set + share->changed=1 under log mutex when writing log records. + + The current solution is to keep a copy the last saved state and + not write the state if it was same as last time. It's ok if + is_of_horizon would be different on disk if all other data is + the same. + */ + DBUG_ASSERT(share->last_version != 0); + state_copy->state.is_of_horizon= share->state.is_of_horizon= + share->checkpoint_state.is_of_horizon= state_copies_horizon; + if (kfile.file >= 0 && memcmp(&share->checkpoint_state, + &state_copy->state, + sizeof(state_copy->state))) + { + sync_error|= + _ma_state_info_write_sub(kfile.file, &state_copy->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET); + memcpy(&share->checkpoint_state, + &state_copy->state, sizeof(state_copy->state)); + } + /* + We don't set share->changed=0 because it may interfere with a + concurrent _ma_writeinfo() doing share->changed=1 (cancel its + effect). The sad consequence is that we will flush the same state at + each checkpoint if the table was once written and then not anymore. + */ + } + } +#ifdef EXTRA_DEBUG_BITMAP + else + { + DBUG_ASSERT(share->bitmap.changed == 0 && + share->bitmap.changed_not_flushed == 0); + } +#endif + + /* + _ma_bitmap_flush_all() may wait, so don't keep intern_lock as + otherwise this would deadlock with allocate_and_write_block_record() + calling _ma_set_share_data_file_length() + */ + mysql_mutex_unlock(&share->intern_lock); + + if (!ignore_share) + { + /* + share->bitmap is valid because it's destroyed under close_lock which + we hold. 
+ */ + if (_ma_bitmap_flush_all(share)) + { + sync_error= 1; + /** @todo all write failures should mark table corrupted */ + ma_message_no_user(0, "checkpoint bitmap page flush failed"); + } + DBUG_ASSERT(share->pagecache == maria_pagecache); + } + /* + Clean up any unused states. + TODO: Only do this call if there has been # (10?) ended transactions + since last call. + We had to release intern_lock to respect lock order with LOCK_trn_list. + */ + _ma_remove_not_visible_states_with_lock(share, FALSE); + + if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME) + { + /* + maria_close() left us free the share. When it run it set share->id + to 0. As it run before we locked close_lock, we should have seen this + and so this assertion should be true: + */ + DBUG_ASSERT(ignore_share); + mysql_mutex_destroy(&share->intern_lock); + mysql_mutex_unlock(&share->close_lock); + mysql_mutex_destroy(&share->close_lock); + ma_crypt_free(share); + my_free(share); + } + else + { + /* share goes back to normal state */ + share->in_checkpoint= 0; + mysql_mutex_unlock(&share->close_lock); + } + + /* + We do the big disk writes out of intern_lock to not block other + users of this table (intern_lock is taken at the start and end of + every statement). This means that file descriptors may be invalid + (files may have been closed for example by HA_EXTRA_PREPARE_FOR_* + under Windows, or REPAIR). This should not be a problem as we use + MY_IGNORE_BADFD. Descriptors may even point to other files but then + the old blocks (of before the close) must have been flushed for sure, + so our flush will flush new blocks (of after the latest open) and that + should do no harm. + */ + /* + If CHECKPOINT_MEDIUM, this big flush below may result in a + serious write burst. 
Realize that all pages dirtied between the + last checkpoint and the one we are doing now, will be flushed at + next checkpoint, except those evicted by LRU eviction (depending on + the size of the page cache compared to the size of the working data + set, eviction may be rare or frequent). + We avoid that burst by anticipating: those pages are flushed + in bunches spanned regularly over the time interval between now and + the next checkpoint, by a background thread. Thus the next checkpoint + will have only little flushing to do (CHECKPOINT_MEDIUM should thus be + only a little slower than CHECKPOINT_INDIRECT). + */ + + /* + PageCacheFlushConcurrencyBugs + Inside the page cache, calls to flush_pagecache_blocks_int() on the same + file are serialized. Examples of concurrency bugs which happened when we + didn't have this serialization: + - maria_chk_size() (via CHECK TABLE) happens concurrently with + Checkpoint: Checkpoint is flushing a page: it pins the page and is + pre-empted, maria_chk_size() wants to flush this page too so gets an + error because Checkpoint pinned this page. Such error makes + maria_chk_size() mark the table as corrupted. + - maria_close() happens concurrently with Checkpoint: + Checkpoint is flushing a page: it registers a request on the page, is + pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE: + FLUSH_RELEASE will cause a free_block() which assumes the page is in the + LRU, but it is not (as Checkpoint registered a request). Crash. + - one thread is evicting a page of the file out of the LRU: it marks it + iPC_BLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes + of the same file concurrently (like above). Then one flusher sees the + page is in switch, removes it from changed_blocks[] and puts it in its + first_in_switch, so the other flusher will not see the page at all and + return too early. 
If it's maria_close() which returns too early, then + maria_close() may close the file descriptor, and the other flusher, and + the evicter will fail to write their page: corruption. + */ + + if (!ignore_share) + { + if (filter != NULL) + { + if ((flush_pagecache_blocks_with_filter(maria_pagecache, + &dfile, FLUSH_KEEP_LAZY, + filter, &filter_param) & + PCFLUSH_ERROR)) + ma_message_no_user(0, "checkpoint data page flush failed"); + if ((flush_pagecache_blocks_with_filter(maria_pagecache, + &kfile, FLUSH_KEEP_LAZY, + filter, &filter_param) & + PCFLUSH_ERROR)) + ma_message_no_user(0, "checkpoint index page flush failed"); + } + /* + fsyncs the fd, that's the loooong operation (e.g. max 150 fsync + per second, so if you have touched 1000 files it's 7 seconds). + */ + sync_error|= + mysql_file_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) | + mysql_file_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD)); + /* + in case of error, we continue because writing other tables to disk is + still useful. + */ + } + } + + if (sync_error) + goto err; + /* We maybe over-estimated (due to share->id==0 or last_version==0) */ + DBUG_ASSERT(str->length >= (uint)(ptr - str->str)); + str->length= (uint)(ptr - str->str); + /* + As we support max 65k tables open at a time (2-byte short id), we + assume uint is enough for the cumulated length of table names; and + LEX_STRING::length is uint. 
+ */ + int4store(str->str, nb_stored); + error= unmark_tables= 0; + +err: + if (unlikely(unmark_tables)) + { + /* maria_close() uses THR_LOCK_maria from start to end */ + mysql_mutex_lock(&THR_LOCK_maria); + for (i= 0; i < nb; i++) + { + MARIA_SHARE *share= distinct_shares[i]; + if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME) + { + share->in_checkpoint&= ~MARIA_CHECKPOINT_SHOULD_FREE_ME; + /* maria_close() left us to free the share */ + free_maria_share(share); + } + else + { + /* share goes back to normal state */ + share->in_checkpoint= 0; + } + } + mysql_mutex_unlock(&THR_LOCK_maria); + } + my_free(distinct_shares); + my_free(state_copies); + DBUG_RETURN(error); +} diff --git a/storage/maria/ma_checkpoint.h b/storage/maria/ma_checkpoint.h new file mode 100644 index 00000000..c719c3cf --- /dev/null +++ b/storage/maria/ma_checkpoint.h @@ -0,0 +1,92 @@ +/* Copyright (C) 2006,2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + WL#3071 Maria checkpoint + First version written by Guilhem Bichot on 2006-04-27. + Does not compile yet. +*/ + +/* This is the interface of this module. 
*/ + +typedef enum enum_ma_checkpoint_level { + CHECKPOINT_NONE= 0, + /* just write dirty_pages, transactions table and sync files */ + CHECKPOINT_INDIRECT, + /* also flush all dirty pages which were already dirty at prev checkpoint */ + CHECKPOINT_MEDIUM, + /* also flush all dirty pages */ + CHECKPOINT_FULL +} CHECKPOINT_LEVEL; + +C_MODE_START +int ma_checkpoint_init(ulong interval); +void ma_checkpoint_end(void); +int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait); +C_MODE_END + +/** + @brief reads some LSNs with special trickery + + If a 64-bit variable transitions between both halves being zero to both + halves being non-zero, and back, this function can be used to do a read of + it (without mutex, without atomic load) which always produces a correct + (though maybe slightly old) value (even on 32-bit CPUs). The value is at + least as new as the latest mutex unlock done by the calling thread. + The assumption is that the system sets both 4-byte halves either at the + same time, or one after the other (in any order), but NOT some bytes of the + first half then some bytes of the second half then the rest of bytes of the + first half. With this assumption, the function can detect when it is + seeing an inconsistent value. + + @param LSN pointer to the LSN variable to read + + @return LSN part (most significant byte always 0) +*/ +#if ( SIZEOF_CHARP >= 8 ) +/* 64-bit CPU, 64-bit reads are atomic */ +#define lsn_read_non_atomic LSN_WITH_FLAGS_TO_LSN +#else +static inline LSN lsn_read_non_atomic_32(const volatile LSN *x) +{ + /* + 32-bit CPU, 64-bit reads may give a mixed of old half and new half (old + low bits and new high bits, or the contrary). + */ + for (;;) /* loop until no atomicity problems */ + { + /* + Remove most significant byte in case this is a LSN_WITH_FLAGS object. + Those flags in TRN::first_undo_lsn break the condition on transitions so + they must be removed below. 
+ */ + LSN y= LSN_WITH_FLAGS_TO_LSN(*x); + if (likely((y == LSN_IMPOSSIBLE) || LSN_VALID(y))) + return y; + } +} +#define lsn_read_non_atomic(x) lsn_read_non_atomic_32(&x) +#endif + +/** + prints a message from a task not connected to any user (checkpoint + and recovery for example). + + @param level 0 if error, ME_WARNING if warning, + ME_NOTE if info + @param sentence text to write +*/ +#define ma_message_no_user(level, sentence) \ + my_printf_error(HA_ERR_GENERIC, "Aria engine: %s", MYF(level), sentence) diff --git a/storage/maria/ma_checksum.c b/storage/maria/ma_checksum.c new file mode 100644 index 00000000..baac18af --- /dev/null +++ b/storage/maria/ma_checksum.c @@ -0,0 +1,89 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Calculate a checksum for a row */ + +#include "maria_def.h" + +/** + Calculate a checksum for the record + + _ma_checksum() + @param info Maria handler + @param record Record + + @note + To ensure that the checksum is independent of the row format + we need to always calculate the checksum in the original field order. 
+ + @return checksum +*/ + +ha_checksum _ma_checksum(MARIA_HA *info, const uchar *record) +{ + ha_checksum crc=0; + uint i,end; + MARIA_COLUMNDEF *base_column= info->s->columndef; + uint16 *column_nr= info->s->column_nr; + + if (info->s->base.null_bytes) + crc= my_checksum(crc, record, info->s->base.null_bytes); + + for (i= 0, end= info->s->base.fields ; i < end ; i++) + { + MARIA_COLUMNDEF *column= base_column + column_nr[i]; + const uchar *pos; + ulong length; + + if (record[column->null_pos] & column->null_bit) + continue; /* Null field */ + + pos= record + column->offset; + switch (column->type) { + case FIELD_BLOB: + { + uint blob_size_length= column->length- portable_sizeof_char_ptr; + length= _ma_calc_blob_length(blob_size_length, pos); + if (length) + { + memcpy((char**) &pos, pos + blob_size_length, sizeof(char*)); + crc= my_checksum(crc, pos, length); + } + continue; + } + case FIELD_VARCHAR: + { + uint pack_length= column->fill_length; + if (pack_length == 1) + length= (ulong) *pos; + else + length= uint2korr(pos); + pos+= pack_length; /* Skip length information */ + break; + } + default: + length= column->length; + break; + } + crc= my_checksum(crc, pos, length); + } + return crc; +} + + +ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *pos) +{ + return my_checksum(0, pos, info->s->base.reclength); +} diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c new file mode 100644 index 00000000..7441e29a --- /dev/null +++ b/storage/maria/ma_close.c @@ -0,0 +1,300 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2010, 2020, MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Close an Aria table handler */ +/* + TODO: + We need to have a separate mutex on the closed file to allow other threads + to open other files during the time we flush the cache and close this file +*/ + +#include "ma_ftdefs.h" +#include "ma_crypt.h" +#ifdef WITH_S3_STORAGE_ENGINE +#include "s3_func.h" +#endif /* WITH_S3_STORAGE_ENGINE */ +/** + Close one handler (MARIA_HA) of an Aria table. + + Releases the table lock still held by the handler, ends its record cache + and, for non-internal tables, removes it from the open lists. When this + was the last handler on the share (share->reopen drops to 0) the key + file pages are flushed, the state is written if needed, the key file is + synced and closed as applicable and the locks owned by the share are + destroyed. free_maria_share() is then called; it frees the share only if + no handler, transaction or checkpoint still uses it. The handler itself + is always freed. + + @param info handler to close; it is freed here and must not be used + afterwards + + @return 0 on success, otherwise an error code (also stored in my_errno) +*/ +int maria_close(register MARIA_HA *info) +{ + int error=0,flag; + MARIA_SHARE *share= info->s; + my_bool internal_table= share->internal_table; + DBUG_ENTER("maria_close"); + DBUG_PRINT("enter",("name: '%s' base: %p reopen: %u locks: %u", + share->open_file_name.str, + info, (uint) share->reopen, + (uint) share->tot_locks)); + + /* Check that we have unlocked key delete-links properly */ + DBUG_ASSERT(info->key_del_used == 0); + /* Check that file is not part of any uncommitted transactions */ + DBUG_ASSERT(info->trn == 0 || info->trn == &dummy_transaction_object); + + if (share->reopen == 1) + { + /* + If we are going to close the file, flush page cache without + a global mutex + */ + if (flush_pagecache_blocks(share->pagecache, &share->kfile, + share->deleting ? 
FLUSH_IGNORE_CHANGED : FLUSH_RELEASE)) + error= my_errno; + } + + /* Ensure no one can open this file while we are closing it */ + if (!internal_table) + mysql_mutex_lock(&THR_LOCK_maria); + if (info->lock_type == F_EXTRA_LCK) + info->lock_type=F_UNLCK; /* HA_EXTRA_NO_USER_CHANGE */ + + if (info->lock_type != F_UNLCK) + { + if (maria_lock_database(info,F_UNLCK)) + error=my_errno; + } + if (!internal_table) + { + mysql_mutex_lock(&share->close_lock); + mysql_mutex_lock(&share->intern_lock); + } + + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + share->r_locks--; + share->tot_locks--; + } + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + if (end_io_cache(&info->rec_cache)) + error=my_errno; + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + } + flag= !--share->reopen; /* set when this was the last handler on the share */ + if (!internal_table) + { + maria_open_list= list_delete(maria_open_list,&info->open_list); + share->open_list= list_delete(share->open_list, &info->share_list); + } + + maria_ftparser_call_deinitializer(info); + my_free(info->rec_buff); + (*share->end)(info); + + if (flag) + { + /* Last close of file */ + + /* + Check that we don't have any dangling open files + We may still have some open transactions. In this case the share + will be kept around until the transaction has closed + */ + DBUG_ASSERT(share->open_list == 0); + + /* Flush everything */ + if (share->kfile.file >= 0) + { + my_bool save_global_changed= share->global_changed; + + /* Avoid _ma_mark_file_changed() when flushing pages */ + share->global_changed= 1; + + /* Flush page cache if BLOCK format */ + if ((*share->once_end)(share)) + error= my_errno; + /* + Extra flush, just in case someone opened and closed the file + since the start of the function (very unlikely) + */ + if (flush_pagecache_blocks(share->pagecache, &share->kfile, + share->deleting ? 
FLUSH_IGNORE_CHANGED : FLUSH_RELEASE)) + error= my_errno; + unmap_file(info); + if (!internal_table && + (((share->changed && share->base.born_transactional) || + maria_is_crashed(info) || + (share->temporary && !share->deleting)))) + { + if (save_global_changed) + { + /* + Reset effect of _ma_mark_file_changed(). Better to do it + here than in _ma_decrement_open_count(), as + _ma_state_info_write() will write the open_count. + */ + save_global_changed= 0; + share->state.open_count--; + } + /* + State must be written to file as it was not done at table's + unlocking. + */ + if (_ma_state_info_write(share, MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)) + error= my_errno; + } + DBUG_ASSERT(maria_is_crashed(info) || !share->base.born_transactional || + share->state.open_count == 0 || + share->open_count_not_zero_on_open); + + /* Ensure that open_count is zero on close */ + share->global_changed= save_global_changed; + _ma_decrement_open_count(info, 0); + + /* Ensure that open_count really is zero */ + DBUG_ASSERT(maria_is_crashed(info) || share->temporary || + share->state.open_count == 0 || + share->open_count_not_zero_on_open); + + /* + File must be synced as it is going out of the maria_open_list and so + becoming unknown to future Checkpoints. 
+ */ + if (share->now_transactional && + mysql_file_sync(share->kfile.file, MYF(MY_WME))) + error= my_errno; + if (!share->s3_path && mysql_file_close(share->kfile.file, MYF(0))) + error= my_errno; + } + thr_lock_delete(&share->lock); + mysql_mutex_destroy(&share->key_del_lock); + + { + int i,keys; + keys = share->state.header.keys; + mysql_rwlock_destroy(&share->mmap_lock); + for(i=0; i<keys; i++) { + mysql_rwlock_destroy(&share->keyinfo[i].root_lock); + } + } + DBUG_ASSERT(share->now_transactional == share->base.born_transactional || + share->internal_table); + /* + We assign -1 because checkpoint does not need to flush (in case we + have concurrent checkpoint if no then we do not need it here also) + */ + share->kfile.file= -1; + + /* + Remember share->history for future opens + + We have to unlock share->intern_lock then lock it after + LOCK_trn_list (trnman_lock()) to avoid dead locks. + */ + if (!internal_table) + mysql_mutex_unlock(&share->intern_lock); + _ma_remove_not_visible_states_with_lock(share, TRUE); + if (!internal_table) + mysql_mutex_lock(&share->intern_lock); + + if (share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME) + { + /* we cannot my_free() the share, Checkpoint would see a bad pointer */ + share->in_checkpoint|= MARIA_CHECKPOINT_SHOULD_FREE_ME; + } + + if (share->state_history) + { + if (share->state_history->trid) /* If not visible for all */ + { + MARIA_STATE_HISTORY_CLOSED *history; + DBUG_PRINT("info", ("Storing state history")); + /* + Here we ignore the unlikely case that we don't have memory + to store the state. In the worst case what happens is that + any transaction that tries to access this table will get a + wrong status information. 
+ */ + if ((history= (MARIA_STATE_HISTORY_CLOSED *) + my_malloc(PSI_INSTRUMENT_ME, sizeof(*history), MYF(MY_WME)))) + { + history->create_rename_lsn= share->state.create_rename_lsn; + history->state_history= share->state_history; + if (my_hash_insert(&maria_stored_state, (uchar*) history)) + my_free(history); + } + } + else + my_free(share->state_history); + /* Marker for concurrent checkpoint */ + share->state_history= 0; + } + } + if (!internal_table) + { + mysql_mutex_unlock(&THR_LOCK_maria); + mysql_mutex_unlock(&share->close_lock); + } + + /* free_maria_share() releases share->intern_lock (taken above for non-internal tables) */ + free_maria_share(share); + + my_free(info->ftparser_param); + if (info->dfile.file >= 0 && ! info->s3) + { + /* + This is outside of mutex so would confuse a concurrent + Checkpoint. Fortunately in BLOCK_RECORD we close earlier under mutex. + */ + if (mysql_file_close(info->dfile.file, MYF(0))) + error= my_errno; + } + + delete_dynamic(&info->pinned_pages); +#ifdef WITH_S3_STORAGE_ENGINE + if (info->s3) + s3f.deinit(info->s3); +#endif /* WITH_S3_STORAGE_ENGINE */ + my_free(info); + + if (error) + { + DBUG_PRINT("error", ("Got error on close: %d", my_errno)); /* NOTE(review): logs my_errno, not the saved error that is returned below */ + DBUG_RETURN(my_errno= error); + } + DBUG_RETURN(0); +} /* maria_close */ + + +/** + Free Aria table share + + Note that share will not be freed as long as there are open handlers, + active checkpoints or transactions pointing at the shared object + (share->reopen, MARIA_CHECKPOINT_SHOULD_FREE_ME, share->in_trans). + + Unless share->internal_table is set, the caller must hold + share->intern_lock; it is released here whether or not the share is freed. +*/ + +void free_maria_share(MARIA_SHARE *share) +{ + if (!share->internal_table) + mysql_mutex_assert_owner(&share->intern_lock); + + if (!share->reopen && !share->in_trans && + !(share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)) + { + /* No one can access this share anymore, time to delete it ! 
*/ + if (!share->internal_table) + mysql_mutex_unlock(&share->intern_lock); + ma_crypt_free(share); + my_free(share->s3_path); + (void) mysql_mutex_destroy(&share->intern_lock); + (void) mysql_mutex_destroy(&share->close_lock); + (void) mysql_cond_destroy(&share->key_del_cond); + my_free(share); + return; + } + if (!share->internal_table) + mysql_mutex_unlock(&share->intern_lock); + return; +} diff --git a/storage/maria/ma_commit.c b/storage/maria/ma_commit.c new file mode 100644 index 00000000..4bd64bfd --- /dev/null +++ b/storage/maria/ma_commit.c @@ -0,0 +1,132 @@ +/* Copyright (C) 2007-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_trnman.h" + +/** + writes a COMMIT record to log and commits transaction in memory + + @param trn transaction + + @return Operation status + @retval 0 ok + @retval 1 error (disk error or out of memory) +*/ + +int ma_commit(TRN *trn) +{ + int res; + LSN commit_lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS]; + DBUG_ENTER("ma_commit"); + + DBUG_ASSERT(trn->rec_lsn == LSN_IMPOSSIBLE); + if (trn->undo_lsn == 0) /* no work done, rollback (cheaper than commit) */ + DBUG_RETURN(trnman_rollback_trn(trn)); + /* + - if COMMIT record is written before trnman_commit_trn(): + if Checkpoint comes in the middle it will see trn is not committed, + then if crash, Recovery might roll back trn (if MY_MIN(rec_lsn) is after + COMMIT record) and this is not an issue as + * transaction's updates were not made visible to other transactions + * "commit ok" was not sent to client + Alternatively, Recovery might commit trn (if MY_MIN(rec_lsn) is before + COMMIT record), which is ok too. All in all it means that "trn committed" + is not 100% equal to "COMMIT record written". + - if COMMIT record is written after trnman_commit_trn(): + if crash happens between the two, trn will be rolled back which is an + issue (transaction's updates were made visible to other transactions). + So we need to go the first way. + + Note that we have to use | here to ensure that all calls are made. + */ + + /* + We do not store "thd->transaction.xid_state.xid" for now, it will be + needed only when we support XA. 
+ */ + res= (translog_write_record(&commit_lsn, LOGREC_COMMIT, + trn, NULL, 0, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) | + translog_flush(commit_lsn)); + + DBUG_EXECUTE_IF("maria_sleep_in_commit", + { + DBUG_PRINT("info", ("maria_sleep_in_commit")); + sleep(3); + }); + res|= trnman_commit_trn(trn); + + + /* + Note: if trnman_commit_trn() fails above, we have already + written the COMMIT record, so Checkpoint and Recovery will see the + transaction as committed. + */ + DBUG_RETURN(res); +} + + +/** + Writes a COMMIT record for a transaciton associated with a file + + @param info Maria handler + + @return Operation status + @retval 0 ok + @retval # error (disk error or out of memory) +*/ + +int maria_commit(MARIA_HA *info) +{ + TRN *trn; + if (!info->s->now_transactional) + return 0; + trn= info->trn; + _ma_reset_trn_for_table(info); + return ma_commit(trn); +} + + +/** + Starts a transaction on a file handle + + @param info Maria handler + + @return Operation status + @retval 0 ok + @retval # Error code. + + @note this can be used only in single-threaded programs (tests), + because we create a transaction (trnman_new_trn) with WT_THD=0. + XXX it needs to be fixed when we'll start using maria_begin from SQL. +*/ + +int maria_begin(MARIA_HA *info) +{ + DBUG_ENTER("maria_begin"); + + if (info->s->now_transactional) + { + TRN *trn= trnman_new_trn(0); + if (unlikely(!trn)) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + _ma_set_trn_for_table(info, trn); + } + DBUG_RETURN(0); +} diff --git a/storage/maria/ma_commit.h b/storage/maria/ma_commit.h new file mode 100644 index 00000000..77114bbb --- /dev/null +++ b/storage/maria/ma_commit.h @@ -0,0 +1,18 @@ +/* Copyright (C) 2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +C_MODE_START +int ma_commit(TRN *trn); /* writes a COMMIT record for trn to the log and commits it in memory; 0 if ok */ +C_MODE_END diff --git a/storage/maria/ma_control_file.c b/storage/maria/ma_control_file.c new file mode 100644 index 00000000..21befb70 --- /dev/null +++ b/storage/maria/ma_control_file.c @@ -0,0 +1,738 @@ +/* Copyright (C) 2007 MySQL AB & Guilhem Bichot & Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + WL#3234 Maria control file + First version written by Guilhem Bichot on 2006-04-27. +*/ + +#ifndef EXTRACT_DEFINITIONS +#include "maria_def.h" +#include "ma_checkpoint.h" +#endif + +/* + A control file contains the following objects: + +Start of create time variables (at start of file): + - Magic string (including version number of Maria control file) + - Uuid + - Size of create time part + - Size of dynamic part + - Maria block size +..... 
Here we can add new variables without changing format + - Checksum of create time part (last of block) + +Start of changeable part: + - Checksum of changeable part + - LSN of last checkpoint + - Number of last log file + - Max trid in control file (since Maria 1.5 May 2008) + - Number of consecutive recovery failures (since Maria 1.5 May 2008) +..... Here we can add new variables without changing format + +The idea is that one can add new variables to the control file and still +use it with old program versions. If one needs to do an incompatible change +one should increment the control file version number. +*/ + +/* Total size should be < sector size for atomic write operation */ +#define CF_MAX_SIZE 512 +#define CF_MIN_SIZE (CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + \ + CF_CHECKSUM_SIZE * 2 + CF_LSN_SIZE + CF_FILENO_SIZE) /* equals CF_MIN_CREATE_TIME_TOTAL_SIZE + CF_MIN_CHANGEABLE_TOTAL_SIZE */ + +/* Create time variables; offsets are counted from the start of the file */ +#define CF_MAGIC_STRING "\xfe\xfe\xc" /* 3 bytes; the 1-byte version follows at CF_VERSION_OFFSET */ +#define CF_MAGIC_STRING_OFFSET 0 +#define CF_MAGIC_STRING_SIZE (sizeof(CF_MAGIC_STRING)-1) +#define CF_VERSION_OFFSET (CF_MAGIC_STRING_OFFSET + CF_MAGIC_STRING_SIZE) +#define CF_VERSION_SIZE 1 +#define CF_UUID_OFFSET (CF_VERSION_OFFSET + CF_VERSION_SIZE) +#define CF_UUID_SIZE MY_UUID_SIZE +#define CF_CREATE_TIME_SIZE_OFFSET (CF_UUID_OFFSET + CF_UUID_SIZE) +#define CF_SIZE_SIZE 2 +#define CF_CHANGEABLE_SIZE_OFFSET (CF_CREATE_TIME_SIZE_OFFSET + CF_SIZE_SIZE) +#define CF_BLOCKSIZE_OFFSET (CF_CHANGEABLE_SIZE_OFFSET + CF_SIZE_SIZE) +#define CF_BLOCKSIZE_SIZE 2 + +#define CF_CREATE_TIME_TOTAL_SIZE (CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + \ + CF_CHECKSUM_SIZE) + +/* + Start of the part that changes during execution + This is stored at offset uint2korr(file[CF_CHANGEABLE_SIZE]) + + The offsets below restart at 0, i.e. they are relative to the start of + this part, not to the start of the file. 
+ NOTE(review): no CF_CHANGEABLE_SIZE macro is defined here; presumably the + changeable part begins right after the create time part, whose size is the + 2-byte value stored at CF_CREATE_TIME_SIZE_OFFSET - confirm against the + code that opens the control file. +*/ +#define CF_CHECKSUM_OFFSET 0 +#define CF_CHECKSUM_SIZE 4 +#define CF_LSN_OFFSET (CF_CHECKSUM_OFFSET + CF_CHECKSUM_SIZE) +#define CF_LSN_SIZE LSN_STORE_SIZE +#define CF_FILENO_OFFSET (CF_LSN_OFFSET + CF_LSN_SIZE) +#define CF_FILENO_SIZE 4 +#define CF_MAX_TRID_OFFSET (CF_FILENO_OFFSET + CF_FILENO_SIZE) 
+#define CF_MAX_TRID_SIZE TRANSID_SIZE +#define CF_RECOV_FAIL_OFFSET (CF_MAX_TRID_OFFSET + CF_MAX_TRID_SIZE) +#define CF_RECOV_FAIL_SIZE 1 +#define CF_CHANGEABLE_TOTAL_SIZE (CF_RECOV_FAIL_OFFSET + CF_RECOV_FAIL_SIZE) + +/* + The following values should not be changed, except when changing version + number of the maria control file. These are the minimum sizes of the + parts the code can handle. + + The changeable minimum stops after the log file number, so it leaves out + the max trid and recovery failure fields. +*/ + +#define CF_MIN_CREATE_TIME_TOTAL_SIZE \ +(CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + CF_CHECKSUM_SIZE) +#define CF_MIN_CHANGEABLE_TOTAL_SIZE \ +(CF_FILENO_OFFSET + CF_FILENO_SIZE) + +#ifndef EXTRACT_DEFINITIONS + +/* This module owns these two vars. */ +/** + This LSN serves for the two-checkpoint rule, and also to find the + checkpoint record when doing a recovery. +*/ +LSN last_checkpoint_lsn= LSN_IMPOSSIBLE; +uint32 last_logno= FILENO_IMPOSSIBLE; /* presumably the "Number of last log file" field described above - TODO confirm */ +/** + The maximum transaction id given to a transaction. It is only updated at + clean shutdown (in case of crash, logs have better information). +*/ +TrID max_trid_in_control_file= 0; + +/** + Number of consecutive log or recovery failures. Reset to 0 after recovery's + success. +*/ +uint8 recovery_failures= 0; + +/** + @brief If log's lock should be asserted when writing to control file. + + Can be re-used by any function which needs to be thread-safe except when + it is called at startup. 
+*/ +my_bool maria_multi_threaded= FALSE; +/** @brief if currently doing a recovery */ +my_bool maria_in_recovery= FALSE; + +/** + Control file is less then 512 bytes (a disk sector), + to be as atomic as possible +*/ +static int control_file_fd= -1; + +static uint cf_create_time_size; +static uint cf_changeable_size; + +/** + @brief Create Maria control file +*/ + +static CONTROL_FILE_ERROR create_control_file(const char *name, + int open_flags) +{ + uint32 sum; + uchar buffer[CF_CREATE_TIME_TOTAL_SIZE]; + ulong rnd1,rnd2; + + DBUG_ENTER("maria_create_control_file"); + + if ((control_file_fd= mysql_file_create(key_file_control, name, 0, + open_flags, MYF(MY_SYNC_DIR | MY_WME))) < 0) + DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); + + /* Reset variables, as we are creating the file */ + cf_create_time_size= CF_CREATE_TIME_TOTAL_SIZE; + cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE; + + /* Create unique uuid for the control file */ + my_random_bytes((uchar *)&rnd1, sizeof (rnd1)); + my_random_bytes((uchar *)&rnd2, sizeof (rnd2)); + my_uuid_init(rnd1, rnd2); + my_uuid(maria_uuid); + + /* Prepare and write the file header */ + memcpy(buffer, CF_MAGIC_STRING, CF_MAGIC_STRING_SIZE); + buffer[CF_VERSION_OFFSET]= CONTROL_FILE_VERSION; + memcpy(buffer + CF_UUID_OFFSET, maria_uuid, CF_UUID_SIZE); + int2store(buffer + CF_CREATE_TIME_SIZE_OFFSET, cf_create_time_size); + int2store(buffer + CF_CHANGEABLE_SIZE_OFFSET, cf_changeable_size); + + /* Write create time variables */ + int2store(buffer + CF_BLOCKSIZE_OFFSET, maria_block_size); + + /* Store checksum for create time parts */ + sum= (uint32) my_checksum(0, buffer, cf_create_time_size - + CF_CHECKSUM_SIZE); + int4store(buffer + cf_create_time_size - CF_CHECKSUM_SIZE, sum); + + if (my_pwrite(control_file_fd, buffer, cf_create_time_size, + 0, MYF(MY_FNABP | MY_WME))) + DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); + + /* + To be safer we should make sure that there are no logs or data/index + files around (indeed it could be that the 
control file alone was deleted + or not restored, and we should not go on with life at this point). + + Things should still be relatively safe as if someone tries to use + an old table with a new control file the different uuid:s between + the files will cause ma_open() to generate an HA_ERR_OLD_FILE + error. When used from mysqld this will cause the table to be open + in repair mode which will remove all dependencies between the + table and the old control file. + + We could have a tool which can rebuild the control file, by reading the + directory of logs, finding the newest log, reading it to find last + checkpoint... Slow but can save your db. For this to be possible, we + must always write to the control file right after writing the checkpoint + log record, and do nothing in between (i.e. the checkpoint must be + usable as soon as it has been written to the log). + */ + + /* init the file with these "undefined" values */ + DBUG_RETURN(ma_control_file_write_and_force(LSN_IMPOSSIBLE, + FILENO_IMPOSSIBLE, 0, 0)); +} + + +/** + Locks control file exclusively. This is kept for the duration of the engine + process, to prevent another Maria instance to write to our logs or control + file. +*/ + +static int lock_control_file(const char *name, my_bool do_retry) +{ + /* + On Windows, my_lock() uses locking() which is mandatory locking and so + prevents maria-recovery.test from copying the control file. And in case of + crash, it may take a while for Windows to unlock file, causing downtime. + */ + /** + @todo BUG We should explore my_sopen(_SH_DENYWRD) to open or create the + file under Windows. + */ +#ifndef _WIN32 + uint retry= 0; + uint retry_count= do_retry ? MARIA_MAX_CONTROL_FILE_LOCK_RETRY : 0; + + /* + We can't here use the automatic wait in my_lock() as the alarm thread + may not yet exists. 
+ */ + while (my_lock(control_file_fd, F_WRLCK, 0L, F_TO_EOF, + MYF(MY_SEEK_NOT_DONE | MY_FORCE_LOCK | MY_NO_WAIT))) + { + if (retry == 0) + my_printf_error(HA_ERR_INITIALIZATION, + "Can't lock aria control file '%s' for exclusive use, " + "error: %d. Will retry for %d seconds", 0, + name, my_errno, retry_count); + if (++retry > retry_count) + return 1; + sleep(1); + } +#endif + return 0; +} + + +/* + @brief Initialize control file subsystem + + Looks for the control file. If none and creation is requested, creates file. + If present, reads it to find out last checkpoint's LSN and last log, updates + the last_checkpoint_lsn and last_logno global variables. maria_uuid and, + when they are stored in the file, max_trid_in_control_file and + recovery_failures are also set from the file. + Unless aria_readonly is set, the file is locked with lock_control_file(). + Called at engine's start. + + @note + The format of the control file is defined in the comments and defines + at the start of this file. + + @param create_if_missing create file if not found + @param print_error if set, failures are reported with + my_printf_error() + @param wait_for_lock given to lock_control_file(): if set, taking the + lock is retried before giving up + + @return Operation status + @retval 0 OK (CONTROL_FILE_OK), also if the file was already open + @retval other a CONTROL_FILE_ERROR code (in which case the file is + left closed) +*/ + +CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing, + my_bool print_error, + my_bool wait_for_lock) +{ + uchar buffer[CF_MAX_SIZE]; + char name[FN_REFLEN], errmsg_buff[256]; + const char *errmsg, *lock_failed_errmsg= "Could not get an exclusive lock;" + " file is probably in use by another process"; + uint new_cf_create_time_size, new_cf_changeable_size, new_block_size; + my_off_t file_size; + int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR | O_CLOEXEC; + int error= CONTROL_FILE_UNKNOWN_ERROR; + DBUG_ENTER("ma_control_file_open"); + + /* + If you change sizes in the #defines, you at least have to change the + "*store" and "*korr" calls in this file, and can even create backward + compatibility problems. Beware! 
+ */ + DBUG_ASSERT(CF_LSN_SIZE == (3+4)); + DBUG_ASSERT(CF_FILENO_SIZE == 4); + + if (control_file_fd >= 0) /* already open */ + DBUG_RETURN(0); + + if (fn_format(name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) == NullS) + DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); + + if (my_access(name,F_OK)) + { + CONTROL_FILE_ERROR create_error; + if (!create_if_missing) + { + error= CONTROL_FILE_MISSING; + errmsg= "Can't find file"; + goto err; + } + if ((create_error= create_control_file(name, open_flags))) + { + error= create_error; + errmsg= "Can't create file"; + goto err; + } + if (!aria_readonly && lock_control_file(name, wait_for_lock)) + { + error= CONTROL_FILE_LOCKED; + errmsg= lock_failed_errmsg; + goto err; + } + goto ok; + } + + /* Otherwise, file exists */ + if ((control_file_fd= mysql_file_open(key_file_control, name, + open_flags, MYF(MY_WME))) < 0) + { + errmsg= "Can't open file"; + goto err; + } + + /* lock it before reading content */ + if (!aria_readonly && lock_control_file(name, wait_for_lock)) + { + error= CONTROL_FILE_LOCKED; + errmsg= lock_failed_errmsg; + goto err; + } + + file_size= mysql_file_seek(control_file_fd, 0, SEEK_END, MYF(MY_WME)); + if (file_size == MY_FILEPOS_ERROR) + { + errmsg= "Can't read size"; + goto err; + } + if (file_size < CF_MIN_SIZE) + { + /* + Given that normally we write only a sector and it's atomic, the only + possibility for a file to be of too short size is if we crashed at the + very first startup, between file creation and file write. Quite unlikely + (and can be made even more unlikely by doing this: create a temp file, + write it, and then rename it to be the control file). + What's more likely is if someone forgot to restore the control file, + just did a "touch control" to try to get Maria to start, or if the + disk/filesystem has a problem. + So let's be rigid. 
+ */ + error= CONTROL_FILE_TOO_SMALL; + errmsg= "Size of control file is smaller than expected"; + goto err; + } + + /* Check if control file is unexpectedly big */ + if (file_size > CF_MAX_SIZE) + { + error= CONTROL_FILE_TOO_BIG; + errmsg= "File size bigger than expected"; + goto err; + } + + if (mysql_file_pread(control_file_fd, buffer, (size_t)file_size, 0, MYF(MY_FNABP))) + { + errmsg= "Can't read file"; + goto err; + } + + if (memcmp(buffer + CF_MAGIC_STRING_OFFSET, + CF_MAGIC_STRING, CF_MAGIC_STRING_SIZE)) + { + error= CONTROL_FILE_BAD_MAGIC_STRING; + errmsg= "Missing valid id at start of file. File is not a valid aria control file"; + goto err; + } + + if (buffer[CF_VERSION_OFFSET] > CONTROL_FILE_VERSION) + { + error= CONTROL_FILE_BAD_VERSION; + sprintf(errmsg_buff, "File is from a future aria system: %d. Current version is: %d", + (int) buffer[CF_VERSION_OFFSET], CONTROL_FILE_VERSION); + errmsg= errmsg_buff; + goto err; + } + + new_cf_create_time_size= uint2korr(buffer + CF_CREATE_TIME_SIZE_OFFSET); + new_cf_changeable_size= uint2korr(buffer + CF_CHANGEABLE_SIZE_OFFSET); + + if (new_cf_create_time_size < CF_MIN_CREATE_TIME_TOTAL_SIZE || + new_cf_changeable_size < CF_MIN_CHANGEABLE_TOTAL_SIZE || + new_cf_create_time_size + new_cf_changeable_size != file_size) + { + error= CONTROL_FILE_INCONSISTENT_INFORMATION; + errmsg= "Sizes stored in control file are inconsistent"; + goto err; + } + + new_block_size= uint2korr(buffer + CF_BLOCKSIZE_OFFSET); + if (new_block_size != maria_block_size && maria_block_size) + { + error= CONTROL_FILE_WRONG_BLOCKSIZE; + sprintf(errmsg_buff, + "Block size in control file (%u) is different than given aria_block_size: %u", + new_block_size, (uint) maria_block_size); + errmsg= errmsg_buff; + goto err; + } + maria_block_size= new_block_size; + + if (my_checksum(0, buffer, new_cf_create_time_size - CF_CHECKSUM_SIZE) != + uint4korr(buffer + new_cf_create_time_size - CF_CHECKSUM_SIZE)) + { + error= CONTROL_FILE_BAD_HEAD_CHECKSUM; + 
errmsg= "Fixed part checksum mismatch"; + goto err; + } + + if (my_checksum(0, buffer + new_cf_create_time_size + CF_CHECKSUM_SIZE, + new_cf_changeable_size - CF_CHECKSUM_SIZE) != + uint4korr(buffer + new_cf_create_time_size)) + { + error= CONTROL_FILE_BAD_CHECKSUM; + errmsg= "Changeable part (end of control file) checksum mismatch"; + goto err; + } + + memcpy(maria_uuid, buffer + CF_UUID_OFFSET, CF_UUID_SIZE); + cf_create_time_size= new_cf_create_time_size; + cf_changeable_size= new_cf_changeable_size; + last_checkpoint_lsn= lsn_korr(buffer + new_cf_create_time_size + + CF_LSN_OFFSET); + last_logno= uint4korr(buffer + new_cf_create_time_size + CF_FILENO_OFFSET); + if (new_cf_changeable_size >= (CF_MAX_TRID_OFFSET + CF_MAX_TRID_SIZE)) + max_trid_in_control_file= + transid_korr(buffer + new_cf_create_time_size + CF_MAX_TRID_OFFSET); + if (new_cf_changeable_size >= (CF_RECOV_FAIL_OFFSET + CF_RECOV_FAIL_SIZE)) + recovery_failures= + (buffer + new_cf_create_time_size + CF_RECOV_FAIL_OFFSET)[0]; + +ok: + DBUG_RETURN(0); + +err: + if (print_error) + my_printf_error(HA_ERR_INITIALIZATION, + "Got error '%s' when trying to use aria control file " + "'%s'", 0, errmsg, name); + ma_control_file_end(); /* will unlock file if needed */ + DBUG_RETURN(error); +} + + +/* + Write information durably to the control file; stores this information into + the last_checkpoint_lsn, last_logno, max_trid_in_control_file, + recovery_failures global variables. + Called when we have created a new log (after syncing this log's creation), + when we have written a checkpoint (after syncing this log record), at + shutdown (for storing trid in case logs are soon removed by user), and + before and after recovery (to store recovery_failures). + Variables last_checkpoint_lsn and last_logno must be protected by caller + using log's lock, unless this function is called at startup. 
+ + SYNOPSIS + ma_control_file_write_and_force() + last_checkpoint_lsn_arg LSN of last checkpoint + last_logno_arg last log file number + max_trid_arg maximum transaction longid + recovery_failures_arg consecutive recovery failures + + NOTE + We always want to do one single my_pwrite() here to be as atomic as + possible. + + RETURN + 0 - OK + 1 - Error +*/ + +int ma_control_file_write_and_force(LSN last_checkpoint_lsn_arg, + uint32 last_logno_arg, + TrID max_trid_arg, + uint8 recovery_failures_arg) +{ + uchar buffer[CF_MAX_SIZE]; + uint32 sum; + my_bool no_need_sync; + DBUG_ENTER("ma_control_file_write_and_force"); + + /* + We don't need to sync if this is just an increase of + recovery_failures: it's even good if that counter is not increased on disk + in case of power or hardware failure (less false positives when removing + logs). + */ + no_need_sync= ((last_checkpoint_lsn == last_checkpoint_lsn_arg) && + (last_logno == last_logno_arg) && + (max_trid_in_control_file == max_trid_arg) && + (recovery_failures_arg > 0)); + + if (control_file_fd < 0) + DBUG_RETURN(1); + +#ifndef DBUG_OFF + if (maria_multi_threaded) + translog_lock_handler_assert_owner(); +#endif + + lsn_store(buffer + CF_LSN_OFFSET, last_checkpoint_lsn_arg); + int4store(buffer + CF_FILENO_OFFSET, last_logno_arg); + transid_store(buffer + CF_MAX_TRID_OFFSET, max_trid_arg); + (buffer + CF_RECOV_FAIL_OFFSET)[0]= recovery_failures_arg; + + if (cf_changeable_size > CF_CHANGEABLE_TOTAL_SIZE) + { + /* + More room than needed for us. Must be a newer version. Clear part which + we cannot maintain, so that any future version notices we didn't + maintain its extra data. 
+ */ + uint zeroed= cf_changeable_size - CF_CHANGEABLE_TOTAL_SIZE; + char msg[150]; + bzero(buffer + CF_CHANGEABLE_TOTAL_SIZE, zeroed); + my_snprintf(msg, sizeof(msg), + "Control file must be from a newer version; zero-ing out %u" + " unknown bytes in control file at offset %u", zeroed, + cf_changeable_size + cf_create_time_size); + ma_message_no_user(ME_WARNING, msg); + } + else + { + /* not enough room for what we need to store: enlarge */ + cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE; + } + /* Note that the create-time portion is not touched */ + + /* Checksum is stored first */ + compile_time_assert(CF_CHECKSUM_OFFSET == 0); + sum= my_checksum(0, buffer + CF_CHECKSUM_SIZE, + cf_changeable_size - CF_CHECKSUM_SIZE); + int4store(buffer, sum); + + if (my_pwrite(control_file_fd, buffer, cf_changeable_size, + cf_create_time_size, MYF(MY_FNABP | MY_WME)) || + (!no_need_sync && mysql_file_sync(control_file_fd, MYF(MY_WME)))) + DBUG_RETURN(1); + + last_checkpoint_lsn= last_checkpoint_lsn_arg; + last_logno= last_logno_arg; + max_trid_in_control_file= max_trid_arg; + recovery_failures= recovery_failures_arg; + + cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE; /* no more warning */ + DBUG_RETURN(0); +} + + +/* + Free resources taken by control file subsystem + + SYNOPSIS + ma_control_file_end() +*/ + +int ma_control_file_end(void) +{ + int close_error; + DBUG_ENTER("ma_control_file_end"); + + if (control_file_fd < 0) /* already closed */ + DBUG_RETURN(0); + +#ifndef _WIN32 + (void) my_lock(control_file_fd, F_UNLCK, 0L, F_TO_EOF, + MYF(MY_SEEK_NOT_DONE | MY_FORCE_LOCK)); +#endif + + close_error= mysql_file_close(control_file_fd, MYF(MY_WME)); + /* + As mysql_file_close() frees structures even if close() fails, we do the + same, i.e. we mark the file as closed in all cases. 
+ */ + control_file_fd= -1; + /* + As this module owns these variables, closing the module forbids access to + them (just a safety): + */ + last_checkpoint_lsn= LSN_IMPOSSIBLE; + last_logno= FILENO_IMPOSSIBLE; + max_trid_in_control_file= recovery_failures= 0; + + DBUG_RETURN(close_error); +} + + +/** + Tells if control file is initialized. +*/ + +my_bool ma_control_file_inited(void) +{ + return (control_file_fd >= 0); +} + +/** + Print content of aria_log_control file +*/ + +my_bool print_aria_log_control() +{ + uchar buffer[CF_MAX_SIZE]; + char name[FN_REFLEN], uuid_str[MY_UUID_STRING_LENGTH+1]; + const char *errmsg; + uint new_cf_create_time_size, new_cf_changeable_size; + my_off_t file_size; + ulong logno; + ulonglong trid,checkpoint_lsn; + int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR | O_CLOEXEC; + int error= CONTROL_FILE_UNKNOWN_ERROR; + uint recovery_fails; + File file; + DBUG_ENTER("ma_control_file_open"); + + if (fn_format(name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) == NullS) + DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); + + if ((file= mysql_file_open(key_file_control, name, + open_flags, MYF(MY_WME))) < 0) + { + errmsg= "Can't open file"; + goto err2; + } + + file_size= mysql_file_seek(file, 0, SEEK_END, MYF(MY_WME)); + if (file_size == MY_FILEPOS_ERROR) + { + errmsg= "Can't read size"; + goto err; + } + if (file_size < CF_MIN_SIZE) + { + /* + Given that normally we write only a sector and it's atomic, the only + possibility for a file to be of too short size is if we crashed at the + very first startup, between file creation and file write. Quite unlikely + (and can be made even more unlikely by doing this: create a temp file, + write it, and then rename it to be the control file). + What's more likely is if someone forgot to restore the control file, + just did a "touch control" to try to get Maria to start, or if the + disk/filesystem has a problem. + So let's be rigid. 
+ */ + error= CONTROL_FILE_TOO_SMALL; + errmsg= "Size of control file is smaller than expected"; + goto err; + } + + /* Check if control file is unexpectedly big */ + if (file_size > CF_MAX_SIZE) + { + error= CONTROL_FILE_TOO_BIG; + errmsg= "File size bigger than expected"; + goto err; + } + + if (mysql_file_pread(file, buffer, (size_t)file_size, 0, MYF(MY_FNABP))) + { + errmsg= "Can't read file"; + goto err; + } + + if (memcmp(buffer + CF_MAGIC_STRING_OFFSET, + CF_MAGIC_STRING, CF_MAGIC_STRING_SIZE)) + { + error= CONTROL_FILE_BAD_MAGIC_STRING; + errmsg= "Missing valid id at start of file. File is not a valid aria control file"; + goto err; + } + + printf("Aria file version: %u\n", buffer[CF_VERSION_OFFSET]); + + new_cf_create_time_size= uint2korr(buffer + CF_CREATE_TIME_SIZE_OFFSET); + new_cf_changeable_size= uint2korr(buffer + CF_CHANGEABLE_SIZE_OFFSET); + + if (new_cf_create_time_size < CF_MIN_CREATE_TIME_TOTAL_SIZE || + new_cf_changeable_size < CF_MIN_CHANGEABLE_TOTAL_SIZE || + new_cf_create_time_size + new_cf_changeable_size != file_size) + { + error= CONTROL_FILE_INCONSISTENT_INFORMATION; + errmsg= "Sizes stored in control file are inconsistent"; + goto err; + } + checkpoint_lsn= lsn_korr(buffer + new_cf_create_time_size + + CF_LSN_OFFSET); + logno= uint4korr(buffer + new_cf_create_time_size + CF_FILENO_OFFSET); + my_uuid2str(buffer + CF_UUID_OFFSET, uuid_str, 1); + uuid_str[MY_UUID_STRING_LENGTH]= 0; + + printf("Block size: %u\n", uint2korr(buffer + CF_BLOCKSIZE_OFFSET)); + printf("maria_uuid: %s\n", uuid_str); + printf("last_checkpoint_lsn: " LSN_FMT "\n", LSN_IN_PARTS(checkpoint_lsn)); + printf("last_log_number: %lu\n", (ulong) logno); + if (new_cf_changeable_size >= (CF_MAX_TRID_OFFSET + CF_MAX_TRID_SIZE)) + { + trid= transid_korr(buffer + new_cf_create_time_size + CF_MAX_TRID_OFFSET); + printf("trid: %llu\n", (ulonglong) trid); + } + if (new_cf_changeable_size >= (CF_RECOV_FAIL_OFFSET + CF_RECOV_FAIL_SIZE)) + { + recovery_fails= + (buffer + 
new_cf_create_time_size + CF_RECOV_FAIL_OFFSET)[0]; + printf("recovery_failures: %u\n", recovery_fails); + } + mysql_file_close(file, MYF(0)); + DBUG_RETURN(0); + +err: + mysql_file_close(file, MYF(0)); +err2: + my_printf_error(HA_ERR_INITIALIZATION, + "Got error '%s' when trying to use aria control file " + "'%s'", 0, errmsg, name); + DBUG_RETURN(error); +} + +#endif /* EXTRACT_DEFINITIONS */ diff --git a/storage/maria/ma_control_file.h b/storage/maria/ma_control_file.h new file mode 100644 index 00000000..40428f66 --- /dev/null +++ b/storage/maria/ma_control_file.h @@ -0,0 +1,79 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + WL#3234 Maria control file + First version written by Guilhem Bichot on 2006-04-27. +*/ + +#ifndef _ma_control_file_h +#define _ma_control_file_h + +C_MODE_START + +#define CONTROL_FILE_BASE_NAME "aria_log_control" +/* + Major version for control file. Should only be changed when doing + big changes that made the new control file incompatible with all + older versions of Maria. 
+*/ +#define CONTROL_FILE_VERSION 1 + +/* Here is the interface of this module */ + +/* + LSN of the last checkpoint + (if last_checkpoint_lsn == LSN_IMPOSSIBLE then there was never a checkpoint) +*/ +extern LSN last_checkpoint_lsn; +/* + Last log number (if last_logno == FILENO_IMPOSSIBLE then there is no log + file yet) +*/ +extern uint32 last_logno; + +extern TrID max_trid_in_control_file, max_long_trid; + +extern uint8 recovery_failures; + +extern my_bool maria_multi_threaded, maria_in_recovery; + +typedef enum enum_control_file_error { + CONTROL_FILE_OK= 0, + CONTROL_FILE_TOO_SMALL, + CONTROL_FILE_TOO_BIG, + CONTROL_FILE_BAD_MAGIC_STRING, + CONTROL_FILE_BAD_VERSION, + CONTROL_FILE_BAD_CHECKSUM, + CONTROL_FILE_BAD_HEAD_CHECKSUM, + CONTROL_FILE_MISSING, + CONTROL_FILE_INCONSISTENT_INFORMATION, + CONTROL_FILE_WRONG_BLOCKSIZE, + CONTROL_FILE_LOCKED, + CONTROL_FILE_UNKNOWN_ERROR /* any other error */ +} CONTROL_FILE_ERROR; + + +CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing, + my_bool print_error, + my_bool wait_for_lock); +int ma_control_file_write_and_force(LSN last_checkpoint_lsn_arg, + uint32 last_logno_arg, TrID max_trid_arg, + uint8 recovery_failures_arg); +int ma_control_file_end(void); +my_bool ma_control_file_inited(void); +my_bool print_aria_log_control(void); +C_MODE_END +#endif diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c new file mode 100644 index 00000000..7fd739d1 --- /dev/null +++ b/storage/maria/ma_create.c @@ -0,0 +1,1526 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Create a MARIA table */ + +#include "ma_ftdefs.h" +#include "ma_sp_defs.h" +#include <my_bit.h> +#include "ma_blockrec.h" +#include "trnman_public.h" +#include "trnman.h" +#include "ma_crypt.h" + +#ifdef _WIN32 +#include <fcntl.h> +#endif +#include <m_ctype.h> + +static int compare_columns(MARIA_COLUMNDEF **a, MARIA_COLUMNDEF **b); + + +/* + Add N blocks of maria_block_size bytes to tot_length, where + N= max_rows / ((maria_block_size - MAX_KEYPAGE_HEADER_SIZE - + KEYPAGE_CHECKSUM_SIZE) / (length*2)) + + The result saturates: ULONGLONG_MAX is returned if tot_length already is + ULONGLONG_MAX, or if N * maria_block_size or the final sum could overflow. + + NOTE(review): presumably N estimates the number of key pages needed for + max_rows keys of 'length' bytes. The inner division yields 0 (and the + outer one then divides by zero) if length*2 is bigger than the usable + page size, so callers are assumed to limit the key length first - confirm. +*/ + +static ulonglong update_tot_length(ulonglong tot_length, ulonglong max_rows, uint length) +{ + ulonglong tot_length_part; + + if (tot_length == ULONGLONG_MAX) + return ULONGLONG_MAX; + + tot_length_part= (max_rows/(ulong) ((maria_block_size - + MAX_KEYPAGE_HEADER_SIZE - KEYPAGE_CHECKSUM_SIZE)/ + (length*2))); + if (tot_length_part >= ULONGLONG_MAX / maria_block_size) + return ULONGLONG_MAX; + + if (tot_length > ULONGLONG_MAX - tot_length_part * maria_block_size) + return ULONGLONG_MAX; + + return tot_length + tot_length_part * maria_block_size; +} + + +/* + Old options is used when recreating database, from maria_chk +*/ + +int maria_create(const char *name, enum data_file_type datafile_type, + uint keys,MARIA_KEYDEF *keydefs, + uint columns, MARIA_COLUMNDEF *columndef, + uint uniques, MARIA_UNIQUEDEF *uniquedefs, + MARIA_CREATE_INFO *ci,uint flags) +{ + uint i,j; + File UNINIT_VAR(dfile), UNINIT_VAR(file); + int errpos,save_errno, create_mode= O_RDWR | O_TRUNC, res; + myf create_flag, common_flag= MY_WME, sync_dir= 0; + uint length,max_key_length,packed,pack_bytes,pointer,real_length_diff, + key_length,info_length,key_segs,options,min_key_length, + base_pos,long_varchar_count, + unique_key_parts,fulltext_keys,offset, not_block_record_extra_length; + uint max_field_lengths, extra_header_size, column_nr; + uint internal_table= flags & HA_CREATE_INTERNAL_TABLE; + 
ulong reclength, real_reclength,min_pack_length; + char kfilename[FN_REFLEN], klinkname[FN_REFLEN], *klinkname_ptr= 0; + char dfilename[FN_REFLEN], dlinkname[FN_REFLEN], *dlinkname_ptr= 0; + ulong pack_reclength; + ulonglong tot_length,max_rows, tmp; + enum en_fieldtype type; + enum data_file_type org_datafile_type= datafile_type; + MARIA_SHARE share; + TRN tmp_transaction_object; + MARIA_KEYDEF *keydef,tmp_keydef; + MARIA_UNIQUEDEF *uniquedef; + HA_KEYSEG *keyseg,tmp_keyseg; + MARIA_COLUMNDEF *column, *end_column; + double *rec_per_key_part; + ulong *nulls_per_key_part; + uint16 *column_array; + my_off_t key_root[HA_MAX_POSSIBLE_KEY], kfile_size_before_extension; + MARIA_CREATE_INFO tmp_create_info; + my_bool tmp_table= FALSE; /* cache for presence of HA_OPTION_TMP_TABLE */ + my_bool forced_packed; + uchar *log_data= NULL; + my_bool encrypted= ci->encrypted && datafile_type == BLOCK_RECORD; + my_bool insert_order= MY_TEST(flags & HA_PRESERVE_INSERT_ORDER); + uint crypt_page_header_space= 0; + DBUG_ENTER("maria_create"); + DBUG_PRINT("enter", ("keys: %u columns: %u uniques: %u flags: %u", + keys, columns, uniques, flags)); + + DBUG_ASSERT(maria_inited); + + if (flags & HA_CREATE_TMP_TABLE) + common_flag|= MY_THREAD_SPECIFIC; + + if (!ci) + { + bzero((char*) &tmp_create_info,sizeof(tmp_create_info)); + ci=&tmp_create_info; + } + + if (keys + uniques > MARIA_MAX_KEY) + { + DBUG_RETURN(my_errno=HA_WRONG_CREATE_OPTION); + } + errpos=0; + options=0; + bzero((uchar*) &share,sizeof(share)); + + if (flags & HA_DONT_TOUCH_DATA) + { + /* We come here from recreate table */ + org_datafile_type= ci->org_data_file_type; + if (!(ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD)) + options= (ci->old_options & + (HA_OPTION_COMPRESS_RECORD | HA_OPTION_PACK_RECORD | + HA_OPTION_READ_ONLY_DATA | HA_OPTION_CHECKSUM | + HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE | + HA_OPTION_LONG_BLOB_PTR | HA_OPTION_PAGE_CHECKSUM)); + else + { + /* Uncompressing rows */ + options= 
(ci->old_options & + (HA_OPTION_CHECKSUM | HA_OPTION_TMP_TABLE | + HA_OPTION_DELAY_KEY_WRITE | HA_OPTION_LONG_BLOB_PTR | + HA_OPTION_PAGE_CHECKSUM)); + } + } + else + { + /* Transactional tables must be of type BLOCK_RECORD */ + if (ci->transactional) + datafile_type= BLOCK_RECORD; + } + + if (!(rec_per_key_part= + (double*) my_malloc(PSI_INSTRUMENT_ME, + (keys + uniques)*HA_MAX_KEY_SEG*sizeof(double) + + (keys + uniques)*HA_MAX_KEY_SEG*sizeof(ulong) + + sizeof(uint16) * columns, + MYF(common_flag | MY_ZEROFILL)))) + DBUG_RETURN(my_errno); + nulls_per_key_part= (ulong*) (rec_per_key_part + + (keys + uniques) * HA_MAX_KEY_SEG); + column_array= (uint16*) (nulls_per_key_part + + (keys + uniques) * HA_MAX_KEY_SEG); + + + /* Start by checking fields and field-types used */ + long_varchar_count=packed= not_block_record_extra_length= + pack_reclength= max_field_lengths= 0; + reclength= min_pack_length= ci->null_bytes; + forced_packed= 0; + column_nr= 0; + + if (encrypted) + { + DBUG_ASSERT(datafile_type == BLOCK_RECORD); + crypt_page_header_space= ma_crypt_get_data_page_header_space(); + } + + for (column= columndef, end_column= column + columns ; + column != end_column ; + column++) + { + /* Fill in not used struct parts */ + column->column_nr= column_nr++; + column->offset= reclength; + column->empty_pos= 0; + column->empty_bit= 0; + column->fill_length= column->length; + if (column->null_bit) + options|= HA_OPTION_NULL_FIELDS; + + reclength+= column->length; + type= column->type; + if (datafile_type == BLOCK_RECORD) + { + if (type == FIELD_SKIP_PRESPACE) + type= column->type= FIELD_NORMAL; /* SKIP_PRESPACE not supported */ + if (type == FIELD_NORMAL && + column->length > FULL_PAGE_SIZE2(maria_block_size, + crypt_page_header_space)) + { + /* FIELD_NORMAL can't be split over many blocks, convert to a CHAR */ + type= column->type= FIELD_SKIP_ENDSPACE; + } + } + + if (type != FIELD_NORMAL && type != FIELD_CHECK) + { + column->empty_pos= packed/8; + column->empty_bit= (1 << 
(packed & 7)); + if (type == FIELD_BLOB) + { + forced_packed= 1; + packed++; + share.base.blobs++; + if (pack_reclength != INT_MAX32) + { + if (column->length == 4+portable_sizeof_char_ptr) + pack_reclength= INT_MAX32; + else + { + /* Add max possible blob length */ + pack_reclength+= (1 << ((column->length- + portable_sizeof_char_ptr)*8)); + } + } + max_field_lengths+= (column->length - portable_sizeof_char_ptr); + } + else if (type == FIELD_SKIP_PRESPACE || + type == FIELD_SKIP_ENDSPACE) + { + forced_packed= 1; + max_field_lengths+= column->length > 255 ? 2 : 1; + not_block_record_extra_length++; + packed++; + } + else if (type == FIELD_VARCHAR) + { + pack_reclength++; + not_block_record_extra_length++; + max_field_lengths++; + if (datafile_type != DYNAMIC_RECORD) + packed++; + column->fill_length= 1; + options|= HA_OPTION_NULL_FIELDS; /* Use ma_checksum() */ + + /* We must test for 257 as length includes pack-length */ + if (MY_TEST(column->length >= 257)) + { + long_varchar_count++; + max_field_lengths++; + column->fill_length= 2; + } + } + else if (type == FIELD_SKIP_ZERO) + packed++; + else + { + if (!column->null_bit) + min_pack_length+= column->length; + else + { + /* Only BLOCK_RECORD skips NULL fields for all field values */ + not_block_record_extra_length+= column->length; + } + column->empty_pos= 0; + column->empty_bit= 0; + } + } + else /* FIELD_NORMAL */ + { + if (!column->null_bit) + { + min_pack_length+= column->length; + share.base.fixed_not_null_fields++; + share.base.fixed_not_null_fields_length+= column->length; + } + else + not_block_record_extra_length+= column->length; + } + } + + if (datafile_type == STATIC_RECORD && forced_packed) + { + /* Can't use fixed length records, revert to block records */ + datafile_type= BLOCK_RECORD; + } + + if (datafile_type == NO_RECORD && uniques) + { + /* Can't do unique without data, revert to block records */ + datafile_type= BLOCK_RECORD; + } + + if (encrypted) + { + /* + datafile_type is set (finally?) 
+ update encryption that is only supported for BLOCK_RECORD + */ + if (datafile_type != BLOCK_RECORD) + { + encrypted= FALSE; + crypt_page_header_space= 0; + } + } + + if (datafile_type == DYNAMIC_RECORD) + options|= HA_OPTION_PACK_RECORD; /* Must use packed records */ + + if (datafile_type == STATIC_RECORD || datafile_type == NO_RECORD) + { + /* We can't use checksum with static length rows */ + flags&= ~HA_CREATE_CHECKSUM; + options&= ~HA_OPTION_CHECKSUM; + min_pack_length= reclength; + packed= 0; + } + else if (datafile_type != BLOCK_RECORD) + min_pack_length+= not_block_record_extra_length; + else + min_pack_length+= 5; /* Min row overhead */ + + if (flags & HA_CREATE_TMP_TABLE) + { + options|= HA_OPTION_TMP_TABLE; + tmp_table= TRUE; + create_mode|= O_NOFOLLOW | (internal_table ? 0 : O_EXCL); + /* "CREATE TEMPORARY" tables are not crash-safe (dropped at restart) */ + ci->transactional= FALSE; + flags&= ~HA_CREATE_PAGE_CHECKSUM; + } + share.base.null_bytes= ci->null_bytes; + share.base.original_null_bytes= ci->null_bytes; + share.base.born_transactional= ci->transactional; + share.base.max_field_lengths= max_field_lengths; + share.base.field_offsets= 0; /* for future */ + share.base.compression_algorithm= ci->compression_algorithm; + share.base.s3_block_size= ci->s3_block_size; + + if (flags & HA_CREATE_CHECKSUM || (options & HA_OPTION_CHECKSUM)) + { + options|= HA_OPTION_CHECKSUM; + min_pack_length++; + pack_reclength++; + } + if (pack_reclength < INT_MAX32) + pack_reclength+= max_field_lengths + long_varchar_count; + else + pack_reclength= INT_MAX32; + + if (flags & HA_CREATE_DELAY_KEY_WRITE) + options|= HA_OPTION_DELAY_KEY_WRITE; + if (flags & HA_CREATE_RELIES_ON_SQL_LAYER) + options|= HA_OPTION_RELIES_ON_SQL_LAYER; + if (flags & HA_CREATE_PAGE_CHECKSUM) + options|= HA_OPTION_PAGE_CHECKSUM; + + pack_bytes= (packed + 7) / 8; + if (pack_reclength != INT_MAX32) + pack_reclength+= reclength+pack_bytes + + MY_TEST(test_all_bits(options, HA_OPTION_CHECKSUM | + 
HA_OPTION_PACK_RECORD)); + min_pack_length+= pack_bytes; + /* Calculate min possible row length for rows-in-block */ + extra_header_size= MAX_FIXED_HEADER_SIZE; + if (ci->transactional) + { + extra_header_size= TRANS_MAX_FIXED_HEADER_SIZE; + DBUG_PRINT("info",("creating a transactional table")); + } + share.base.min_block_length= (extra_header_size + share.base.null_bytes + + pack_bytes); + if (!ci->data_file_length && ci->max_rows) + { + set_if_bigger(ci->max_rows, ci->reloc_rows); + if (pack_reclength == INT_MAX32 || + (~(ulonglong) 0)/ci->max_rows < (ulonglong) pack_reclength) + ci->data_file_length= ~(ulonglong) 0; + else + { + ci->data_file_length= _ma_safe_mul(ci->max_rows, pack_reclength); + if (datafile_type == BLOCK_RECORD) + { + /* Assume that blocks are only half full (very pessimistic!) */ + ci->data_file_length= _ma_safe_mul(ci->data_file_length, 2); + set_if_bigger(ci->data_file_length, maria_block_size*2); + } + } + } + else if (!ci->max_rows) + { + if (datafile_type == BLOCK_RECORD) + { + uint rows_per_page= + ((maria_block_size - PAGE_OVERHEAD_SIZE_RAW - crypt_page_header_space) + / (min_pack_length + extra_header_size + DIR_ENTRY_SIZE)); + ulonglong data_file_length= ci->data_file_length; + if (!data_file_length) + data_file_length= ((((ulonglong) 1 << ((BLOCK_RECORD_POINTER_SIZE-1) * + 8))/2 -1) * maria_block_size); + if (rows_per_page > 0) + { + set_if_smaller(rows_per_page, MAX_ROWS_PER_PAGE); + ci->max_rows= (data_file_length / maria_block_size+1) * rows_per_page; + } + else + ci->max_rows= data_file_length / (min_pack_length + + extra_header_size + + DIR_ENTRY_SIZE); + } + else + ci->max_rows=(ha_rows) (ci->data_file_length/(min_pack_length + + ((options & + HA_OPTION_PACK_RECORD) ? 
+ 3 : 0))); + set_if_smaller(ci->reloc_rows, ci->max_rows); + } + max_rows= (ulonglong) ci->max_rows; + if (datafile_type == BLOCK_RECORD) + { + /* + The + 1 is for record position withing page + The * 2 is because we need one bit for knowing if there is transid's + after the row pointer + */ + pointer= maria_get_pointer_length((ci->data_file_length / + maria_block_size) * 2, 4) + 1; + set_if_smaller(pointer, BLOCK_RECORD_POINTER_SIZE); + + if (!max_rows) + max_rows= (((((ulonglong) 1 << ((pointer-1)*8)) -1) * maria_block_size) / + min_pack_length / 2); + } + else + { + if (datafile_type == NO_RECORD) + pointer= 0; + else if (datafile_type != STATIC_RECORD) + pointer= maria_get_pointer_length(ci->data_file_length, + maria_data_pointer_size); + else + pointer= maria_get_pointer_length(ci->max_rows, maria_data_pointer_size); + if (!max_rows) + max_rows= ((((ulonglong) 1 << (pointer*8)) -1) / min_pack_length); + } + + real_reclength=reclength; + if (datafile_type == STATIC_RECORD) + { + if (reclength <= pointer) + reclength=pointer+1; /* reserve place for delete link */ + } + else + reclength+= long_varchar_count; /* We need space for varchar! 
*/ + + max_key_length=0; tot_length=0 ; key_segs=0; + fulltext_keys=0; + share.state.rec_per_key_part= rec_per_key_part; + share.state.nulls_per_key_part= nulls_per_key_part; + share.state.key_root=key_root; + share.state.key_del= HA_OFFSET_ERROR; + if (uniques) + max_key_length= MARIA_UNIQUE_HASH_LENGTH + pointer; + + for (i=0, keydef=keydefs ; i < keys ; i++ , keydef++) + { + share.state.key_root[i]= HA_OFFSET_ERROR; + length= real_length_diff= 0; + min_key_length= key_length= pointer; + + if (keydef->key_alg == HA_KEY_ALG_RTREE) + keydef->flag|= HA_RTREE_INDEX; /* For easier tests */ + + if (keydef->flag & HA_SPATIAL) + { +#ifdef HAVE_SPATIAL + /* BAR TODO to support 3D and more dimensions in the future */ + uint sp_segs=SPDIMS*2; + keydef->flag=HA_SPATIAL; + + if (flags & HA_DONT_TOUCH_DATA) + { + /* + Called by maria_chk - i.e. table structure was taken from + MYI file and SPATIAL key *does have* additional sp_segs keysegs. + keydef->seg here points right at the GEOMETRY segment, + so we only need to decrease keydef->keysegs. 
+ (see maria_recreate_table() in _ma_check.c) + */ + keydef->keysegs-=sp_segs-1; + } + + for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ; + j++, keyseg++) + { + if (keyseg->type != HA_KEYTYPE_BINARY && + keyseg->type != HA_KEYTYPE_VARBINARY1 && + keyseg->type != HA_KEYTYPE_VARBINARY2) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + } + keydef->keysegs+=sp_segs; + key_length+=SPLEN*sp_segs; + length++; /* At least one length uchar */ + min_key_length++; +#else + my_errno= HA_ERR_UNSUPPORTED; + goto err_no_lock; +#endif /*HAVE_SPATIAL*/ + } + else if (keydef->flag & HA_FULLTEXT) + { + keydef->flag=HA_FULLTEXT | HA_PACK_KEY | HA_VAR_LENGTH_KEY; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + + for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ; + j++, keyseg++) + { + if (keyseg->type != HA_KEYTYPE_TEXT && + keyseg->type != HA_KEYTYPE_VARTEXT1 && + keyseg->type != HA_KEYTYPE_VARTEXT2) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + if (!(keyseg->flag & HA_BLOB_PART) && + (keyseg->type == HA_KEYTYPE_VARTEXT1 || + keyseg->type == HA_KEYTYPE_VARTEXT2)) + { + /* Make a flag that this is a VARCHAR */ + keyseg->flag|= HA_VAR_LENGTH_PART; + /* Store in bit_start number of bytes used to pack the length */ + keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1)? 
+ 1 : 2); + } + } + + fulltext_keys++; + key_length+= HA_FT_MAXBYTELEN+HA_FT_WLEN; + length++; /* At least one length uchar */ + min_key_length+= 1 + HA_FT_WLEN; + real_length_diff=HA_FT_MAXBYTELEN-FT_MAX_WORD_LEN_FOR_SORT; + } + else + { + /* Test if prefix compression */ + if (keydef->flag & HA_PACK_KEY) + { + /* Can't use space_compression on number keys */ + if ((keydef->seg[0].flag & HA_SPACE_PACK) && + keydef->seg[0].type == (int) HA_KEYTYPE_NUM) + keydef->seg[0].flag&= ~HA_SPACE_PACK; + + /* Only use HA_PACK_KEY when first segment is a variable length key */ + if (!(keydef->seg[0].flag & (HA_SPACE_PACK | HA_BLOB_PART | + HA_VAR_LENGTH_PART))) + { + /* pack relative to previous key */ + keydef->flag&= ~HA_PACK_KEY; + keydef->flag|= HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY; + } + else + { + keydef->seg[0].flag|=HA_PACK_KEY; /* for easyer intern test */ + keydef->flag|=HA_VAR_LENGTH_KEY; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + } + } + if (keydef->flag & HA_BINARY_PACK_KEY) + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + + if (keydef->flag & HA_AUTO_KEY && ci->with_auto_increment) + share.base.auto_key=i+1; + for (j=0, keyseg=keydef->seg ; j < keydef->keysegs ; j++, keyseg++) + { + /* numbers are stored with high by first to make compression easier */ + switch (keyseg->type) { + case HA_KEYTYPE_SHORT_INT: + case HA_KEYTYPE_LONG_INT: + case HA_KEYTYPE_FLOAT: + case HA_KEYTYPE_DOUBLE: + case HA_KEYTYPE_USHORT_INT: + case HA_KEYTYPE_ULONG_INT: + case HA_KEYTYPE_LONGLONG: + case HA_KEYTYPE_ULONGLONG: + case HA_KEYTYPE_INT24: + case HA_KEYTYPE_UINT24: + case HA_KEYTYPE_INT8: + keyseg->flag|= HA_SWAP_KEY; + break; + case HA_KEYTYPE_VARTEXT1: + case HA_KEYTYPE_VARTEXT2: + case HA_KEYTYPE_VARBINARY1: + case HA_KEYTYPE_VARBINARY2: + if (!(keyseg->flag & HA_BLOB_PART)) + { + /* Make a flag that this is a VARCHAR */ + keyseg->flag|= HA_VAR_LENGTH_PART; + /* Store in bit_start number of bytes used to pack the length */ + keyseg->bit_start= 
((keyseg->type == HA_KEYTYPE_VARTEXT1 || + keyseg->type == HA_KEYTYPE_VARBINARY1) ? + 1 : 2); + } + break; + default: + break; + } + if (keyseg->flag & HA_SPACE_PACK) + { + DBUG_ASSERT(!(keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))); + keydef->flag |= HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + length++; /* At least one length uchar */ + if (!keyseg->null_bit) + min_key_length++; + key_length+= keyseg->length; + if (keyseg->length >= 255) + { + /* prefix may be 3 bytes */ + length+= 2; + } + } + else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)) + { + DBUG_ASSERT(!test_all_bits(keyseg->flag, + (HA_VAR_LENGTH_PART | HA_BLOB_PART))); + keydef->flag|=HA_VAR_LENGTH_KEY; + length++; /* At least one length uchar */ + if (!keyseg->null_bit) + min_key_length++; + options|=HA_OPTION_PACK_KEYS; /* Using packed keys */ + key_length+= keyseg->length; + if (keyseg->length >= 255) + { + /* prefix may be 3 bytes */ + length+= 2; + } + } + else + { + key_length+= keyseg->length; + if (!keyseg->null_bit) + min_key_length+= keyseg->length; + } + if (keyseg->null_bit) + { + key_length++; + /* min key part is 1 byte */ + min_key_length++; + options|=HA_OPTION_PACK_KEYS; + keyseg->flag|=HA_NULL_PART; + keydef->flag|=HA_VAR_LENGTH_KEY | HA_NULL_PART_KEY; + } + } + } /* if HA_FULLTEXT */ + key_segs+=keydef->keysegs; + if (keydef->keysegs > HA_MAX_KEY_SEG) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + /* + key_segs may be 0 in the case when we only want to be able to + add on row into the table. This can happen with some DISTINCT queries + in MySQL + */ + if ((keydef->flag & (HA_NOSAME | HA_NULL_PART_KEY)) == HA_NOSAME && + key_segs) + share.state.rec_per_key_part[key_segs-1]=1L; + length+=key_length; + /* + A key can't be longer than than half a index block (as we have + to be able to put at least 2 keys on an index block for the key + algorithms to work). 
+ */ + if (length > _ma_max_key_length()) + { + my_errno=HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + keydef->block_length= (uint16) maria_block_size; + keydef->keylength= (uint16) key_length; + keydef->minlength= (uint16) min_key_length; + keydef->maxlength= (uint16) length; + + if (length > max_key_length) + max_key_length= length; + + tot_length= update_tot_length(tot_length, max_rows, length); + } + + unique_key_parts=0; + for (i=0, uniquedef=uniquedefs ; i < uniques ; i++ , uniquedef++) + { + uniquedef->key=keys+i; + unique_key_parts+=uniquedef->keysegs; + share.state.key_root[keys+i]= HA_OFFSET_ERROR; + + tot_length= update_tot_length(tot_length, max_rows, MARIA_UNIQUE_HASH_LENGTH + pointer); + } + keys+=uniques; /* Each unique has 1 key */ + key_segs+=uniques; /* Each unique has 1 key seg */ + + base_pos=(MARIA_STATE_INFO_SIZE + keys * MARIA_STATE_KEY_SIZE + + key_segs * MARIA_STATE_KEYSEG_SIZE); + info_length= base_pos+(uint) (MARIA_BASE_INFO_SIZE+ + keys * MARIA_KEYDEF_SIZE+ + uniques * MARIA_UNIQUEDEF_SIZE + + (key_segs + unique_key_parts)*HA_KEYSEG_SIZE+ + columns*(MARIA_COLUMNDEF_SIZE + 2)); + + if (encrypted) + { + share.base.extra_options|= MA_EXTRA_OPTIONS_ENCRYPTED; + + /* store crypt data in info */ + info_length+= ma_crypt_get_file_length(); + } + + if (insert_order) + { + share.base.extra_options|= MA_EXTRA_OPTIONS_INSERT_ORDER; + } + + share.state.state.key_file_length= MY_ALIGN(info_length, maria_block_size); + DBUG_PRINT("info", ("info_length: %u", info_length)); + /* There are only 16 bits for the total header length. 
*/ + if (share.state.state.key_file_length > 65535) + { + my_printf_error(HA_WRONG_CREATE_OPTION, + "Aria table '%s' has too many columns and/or " + "indexes and/or unique constraints.", + MYF(0), name + dirname_length(name)); + my_errno= HA_WRONG_CREATE_OPTION; + goto err_no_lock; + } + + bmove(share.state.header.file_version, maria_file_magic, 4); + ci->old_options=options | (ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD ? + HA_OPTION_COMPRESS_RECORD | + HA_OPTION_TEMP_COMPRESS_RECORD: 0); + mi_int2store(share.state.header.options,ci->old_options); + mi_int2store(share.state.header.header_length,info_length); + mi_int2store(share.state.header.state_info_length,MARIA_STATE_INFO_SIZE); + mi_int2store(share.state.header.base_info_length,MARIA_BASE_INFO_SIZE); + mi_int2store(share.state.header.base_pos,base_pos); + share.state.header.data_file_type= share.data_file_type= datafile_type; + share.state.header.org_data_file_type= org_datafile_type; + share.state.header.not_used= 0; + + share.state.dellink = HA_OFFSET_ERROR; + share.state.first_bitmap_with_space= 0; +#ifdef MARIA_EXTERNAL_LOCKING + share.state.process= (ulong) getpid(); +#endif + share.state.version= (ulong) time((time_t*) 0); + share.state.sortkey= (ushort) ~0; + share.state.auto_increment=ci->auto_increment; + share.options=options; + share.base.rec_reflength=pointer; + share.base.block_size= maria_block_size; + share.base.language= (ci->language ? ci->language : + default_charset_info->number); + + /* + Get estimate for index file length (this may be wrong for FT keys) + This is used for pointers to other key pages. 
+ */ + tmp= (tot_length / maria_block_size + keys * MARIA_INDEX_BLOCK_MARGIN); + + /* + use maximum of key_file_length we calculated and key_file_length value we + got from MAI file header (see also mariapack.c:save_state) + */ + share.base.key_reflength= + maria_get_pointer_length(MY_MAX(ci->key_file_length,tmp),3); + share.base.keys= share.state.header.keys= keys; + share.state.header.uniques= uniques; + share.state.header.fulltext_keys= fulltext_keys; + mi_int2store(share.state.header.key_parts,key_segs); + mi_int2store(share.state.header.unique_key_parts,unique_key_parts); + + maria_set_all_keys_active(share.state.key_map, keys); + + share.base.keystart = share.state.state.key_file_length; + share.base.max_key_block_length= maria_block_size; + share.base.max_key_length=ALIGN_SIZE(max_key_length+4); + share.base.records=ci->max_rows; + share.base.reloc= ci->reloc_rows; + share.base.reclength=real_reclength; + share.base.pack_reclength= reclength + MY_TEST(options & HA_OPTION_CHECKSUM); + share.base.max_pack_length=pack_reclength; + share.base.min_pack_length=min_pack_length; + share.base.pack_bytes= pack_bytes; + share.base.fields= columns; + share.base.pack_fields= packed; + + if (share.data_file_type == BLOCK_RECORD) + { + /* + we are going to create a first bitmap page, set data_file_length + to reflect this, before the state goes to disk + */ + share.state.state.data_file_length= maria_block_size; + /* Add length of packed fields + length */ + share.base.pack_reclength+= share.base.max_field_lengths+3; + share.base.max_pack_length= share.base.pack_reclength; + + /* Adjust max_pack_length, to be used if we have short rows */ + if (share.base.max_pack_length < maria_block_size) + { + share.base.max_pack_length+= FLAG_SIZE; + if (ci->transactional) + share.base.max_pack_length+= TRANSID_SIZE * 2; + } + } + + /* max_data_file_length and max_key_file_length are recalculated on open */ + if (tmp_table) + share.base.max_data_file_length= (my_off_t) 
ci->data_file_length; + else if (ci->transactional && translog_status == TRANSLOG_OK && + !maria_in_recovery) + { + /* + we have checked translog_inited above, because maria_chk may call us + (via maria_recreate_table()) and it does not have a log. + */ + sync_dir= MY_SYNC_DIR; + /* + If crash between _ma_state_info_write_sub() and + _ma_update_state__lsns_sub(), table should be ignored by Recovery (or + old REDOs would fail), so we cannot let LSNs be 0: + */ + share.state.skip_redo_lsn= share.state.is_of_horizon= + share.state.create_rename_lsn= LSN_MAX; + /* + We have to mark the table as not movable as the table will contain the + maria_uuid and create_rename_lsn + */ + share.state.changed|= STATE_NOT_MOVABLE; + } + + if (datafile_type == DYNAMIC_RECORD) + { + share.base.min_block_length= + (share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH && + ! share.base.blobs) ? + MY_MAX(share.base.pack_reclength,MARIA_MIN_BLOCK_LENGTH) : + MARIA_EXTEND_BLOCK_LENGTH; + } + else if (datafile_type == STATIC_RECORD) + share.base.min_block_length= share.base.pack_reclength; + + if (! (flags & HA_DONT_TOUCH_DATA)) + share.state.create_time= time((time_t*) 0); + + if (!internal_table) + mysql_mutex_lock(&THR_LOCK_maria); + + /* + NOTE: For test_if_reopen() we need a real path name. Hence we need + MY_RETURN_REAL_PATH for every fn_format(filename, ...). + */ + if (ci->index_file_name) + { + char *iext= strrchr(ci->index_file_name, '.'); + int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT); + if (tmp_table) + { + char *path; + /* chop off the table name, tempory tables use generated name */ + if ((path= strrchr(ci->index_file_name, FN_LIBCHAR))) + *path= '\0'; + fn_format(kfilename, name, ci->index_file_name, MARIA_NAME_IEXT, + MY_REPLACE_DIR | MY_UNPACK_FILENAME | + MY_RETURN_REAL_PATH | MY_APPEND_EXT); + } + else + { + fn_format(kfilename, ci->index_file_name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH | + (have_iext ? 
MY_REPLACE_EXT : MY_APPEND_EXT)); + } + fn_format(klinkname, name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME|MY_APPEND_EXT); + klinkname_ptr= klinkname; + /* + Don't create the table if the link or file exists to ensure that one + doesn't accidently destroy another table. + Don't sync dir now if the data file has the same path. + */ + create_flag= + (ci->data_file_name && + !strcmp(ci->index_file_name, ci->data_file_name)) ? 0 : sync_dir; + } + else + { + char *iext= strrchr(name, '.'); + int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT); + fn_format(kfilename, name, "", MARIA_NAME_IEXT, MY_UNPACK_FILENAME | + (internal_table ? 0 : MY_RETURN_REAL_PATH) | + (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT)); + klinkname_ptr= NullS; + /* + Replace the current file. + Don't sync dir now if the data file has the same path. + */ + create_flag= (flags & HA_CREATE_KEEP_FILES) ? 0 : MY_DELETE_OLD; + create_flag|= (!ci->data_file_name ? 0 : sync_dir); + } + + /* + If a MRG_MARIA table is in use, the mapped MARIA tables are open, + but no entry is made in the table cache for them. + A TRUNCATE command checks for the table in the cache only and could + be fooled to believe, the table is not open. + Pull the emergency brake in this situation. (Bug #8306) + + + NOTE: The filename is compared against unique_file_name of every + open table. Hence we need a real path here. + */ + if (!internal_table && _ma_test_if_reopen(kfilename)) + { + my_printf_error(HA_ERR_TABLE_EXIST, "Aria table '%s' is in use " + "(most likely by a MERGE table). 
Try FLUSH TABLES.", + MYF(0), name + dirname_length(name)); + my_errno= HA_ERR_TABLE_EXIST; + goto err; + } + + if ((file= mysql_file_create_with_symlink(key_file_kfile, klinkname_ptr, + kfilename, 0, create_mode, + MYF(common_flag|create_flag))) < 0) + goto err; + errpos=1; + + DBUG_PRINT("info", ("write state info and base info")); + if (_ma_state_info_write_sub(file, &share.state, + MA_STATE_INFO_WRITE_FULL_INFO) || + _ma_base_info_write(file, &share.base)) + goto err; + DBUG_PRINT("info", ("base_pos: %d base_info_size: %d", + base_pos, MARIA_BASE_INFO_SIZE)); + DBUG_ASSERT(mysql_file_tell(file,MYF(0)) == base_pos+ MARIA_BASE_INFO_SIZE); + + /* Write key and keyseg definitions */ + DBUG_PRINT("info", ("write key and keyseg definitions")); + for (i=0 ; i < share.base.keys - uniques; i++) + { + uint sp_segs=(keydefs[i].flag & HA_SPATIAL) ? 2*SPDIMS : 0; + + if (_ma_keydef_write(file, &keydefs[i])) + goto err; + for (j=0 ; j < keydefs[i].keysegs-sp_segs ; j++) + if (_ma_keyseg_write(file, &keydefs[i].seg[j])) + goto err; +#ifdef HAVE_SPATIAL + for (j=0 ; j < sp_segs ; j++) + { + HA_KEYSEG sseg; + sseg.type=SPTYPE; + sseg.language= 7; /* Binary */ + sseg.null_bit=0; + sseg.bit_start=0; + sseg.bit_length= 0; + sseg.bit_pos= 0; + sseg.length=SPLEN; + sseg.null_pos=0; + sseg.start=j*SPLEN; + sseg.flag= HA_SWAP_KEY; + if (_ma_keyseg_write(file, &sseg)) + goto err; + } +#endif + } + /* Create extra keys for unique definitions */ + offset= real_reclength - uniques*MARIA_UNIQUE_HASH_LENGTH; + bzero((char*) &tmp_keydef,sizeof(tmp_keydef)); + bzero((char*) &tmp_keyseg,sizeof(tmp_keyseg)); + for (i=0; i < uniques ; i++) + { + tmp_keydef.keysegs=1; + tmp_keydef.block_length= (uint16) maria_block_size; + tmp_keydef.keylength= MARIA_UNIQUE_HASH_LENGTH + pointer; + tmp_keydef.minlength=tmp_keydef.maxlength=tmp_keydef.keylength; + tmp_keyseg.type= MARIA_UNIQUE_HASH_TYPE; + tmp_keyseg.length= MARIA_UNIQUE_HASH_LENGTH; + tmp_keyseg.start= offset; + offset+= MARIA_UNIQUE_HASH_LENGTH; 
+ if (_ma_keydef_write(file,&tmp_keydef) || + _ma_keyseg_write(file,(&tmp_keyseg))) + goto err; + } + + /* Save unique definition */ + DBUG_PRINT("info", ("write unique definitions")); + for (i=0 ; i < share.state.header.uniques ; i++) + { + HA_KEYSEG *keyseg_end; + keyseg= uniquedefs[i].seg; + if (_ma_uniquedef_write(file, &uniquedefs[i])) + goto err; + for (keyseg= uniquedefs[i].seg, keyseg_end= keyseg+ uniquedefs[i].keysegs; + keyseg < keyseg_end; + keyseg++) + { + switch (keyseg->type) { + case HA_KEYTYPE_VARTEXT1: + case HA_KEYTYPE_VARTEXT2: + case HA_KEYTYPE_VARBINARY1: + case HA_KEYTYPE_VARBINARY2: + if (!(keyseg->flag & HA_BLOB_PART)) + { + keyseg->flag|= HA_VAR_LENGTH_PART; + keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 || + keyseg->type == HA_KEYTYPE_VARBINARY1) ? + 1 : 2); + } + break; + default: + DBUG_ASSERT((keyseg->flag & HA_VAR_LENGTH_PART) == 0); + break; + } + if (_ma_keyseg_write(file, keyseg)) + goto err; + } + } + DBUG_PRINT("info", ("write field definitions")); + if (datafile_type == BLOCK_RECORD) + { + /* Store columns in a more efficent order */ + MARIA_COLUMNDEF **col_order, **pos; + if (!(col_order= (MARIA_COLUMNDEF**) my_malloc(PSI_INSTRUMENT_ME, + share.base.fields * + sizeof(MARIA_COLUMNDEF*), + common_flag))) + goto err; + for (column= columndef, pos= col_order ; + column != end_column ; + column++, pos++) + *pos= column; + qsort(col_order, share.base.fields, sizeof(*col_order), + (qsort_cmp) compare_columns); + for (i=0 ; i < share.base.fields ; i++) + { + column_array[col_order[i]->column_nr]= i; + if (_ma_columndef_write(file, col_order[i])) + { + my_free(col_order); + goto err; + } + } + my_free(col_order); + } + else + { + for (i=0 ; i < share.base.fields ; i++) + { + column_array[i]= (uint16) i; + if (_ma_columndef_write(file, &columndef[i])) + goto err; + } + } + if (_ma_column_nr_write(file, column_array, columns)) + goto err; + + if (encrypted) + { + DBUG_ASSERT(share.data_file_name.length == 0); + 
share.data_file_name.str= (char*) name; /* For error reporting */ + if (ma_crypt_create(&share) || + ma_crypt_write(&share, file)) + goto err; + } + + if ((kfile_size_before_extension= mysql_file_tell(file,MYF(0))) == MY_FILEPOS_ERROR) + goto err; +#ifndef DBUG_OFF + if (kfile_size_before_extension != info_length) + DBUG_PRINT("warning",("info_length: %u != used_length: %u", + info_length, (uint)kfile_size_before_extension)); +#endif + + if (sync_dir) + { + /* + we log the first bytes and then the size to which we extend; this is + a log of about 1 KB of mostly zeroes if this is a small table. + */ + char empty_string[]= ""; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + translog_size_t total_rec_length= 0; + uint k; + LSN lsn; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 1 + 2 + 2 + + (uint) kfile_size_before_extension; + /* we are needing maybe 64 kB, so don't use the stack */ + log_data= my_malloc(PSI_INSTRUMENT_ME, + log_array[TRANSLOG_INTERNAL_PARTS + 1].length, MYF(0)); + if ((log_data == NULL) || + mysql_file_pread(file, 1 + 2 + 2 + log_data, + (size_t) kfile_size_before_extension, 0, MYF(MY_NABP))) + goto err; + /* + remember if the data file was created or not, to know if Recovery can + do it or not, in the future + */ + log_data[0]= MY_TEST(flags & HA_DONT_TOUCH_DATA); + int2store(log_data + 1, kfile_size_before_extension); + int2store(log_data + 1 + 2, share.base.keystart); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar *)name; + /* we store the end-zero, for Recovery to just pass it to my_create() */ + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_data; + /* symlink description is also needed for re-creation by Recovery: */ + { + const char *s= ci->data_file_name ? ci->data_file_name : empty_string; + log_array[TRANSLOG_INTERNAL_PARTS + 2].str= (uchar*)s; + log_array[TRANSLOG_INTERNAL_PARTS + 2].length= strlen(s) + 1; + s= ci->index_file_name ? 
ci->index_file_name : empty_string; + log_array[TRANSLOG_INTERNAL_PARTS + 3].str= (uchar*)s; + log_array[TRANSLOG_INTERNAL_PARTS + 3].length= strlen(s) + 1; + } + for (k= TRANSLOG_INTERNAL_PARTS; + k < (sizeof(log_array)/sizeof(log_array[0])); k++) + total_rec_length+= (translog_size_t) log_array[k].length; + /** + For this record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe, which it is not now (that would require + work using the ddl_log of sql/sql_table.cc); when it is, we should + reconsider the moment of writing this log record (before or after op, + under THR_LOCK_maria or not...), how to use it in Recovery. + For now this record can serve when we apply logs to a backup, + so we sync it. This happens before the data file is created. If the + data file was created before, and we crashed before writing the log + record, at restart the table may be used, so we would not have a + trustable history in the log (impossible to apply this log to a + backup). The way we do it, if we crash before writing the log record + then there is no data file and the table cannot be used. + @todo Note that in case of TRUNCATE TABLE we also come here; for + Recovery to be able to finish TRUNCATE TABLE, instead of leaving a + half-truncated table, we should log the record at start of + maria_create(); for that we shouldn't write to the index file but to a + buffer (DYNAMIC_STRING), put the buffer into the record, then put the + buffer into the index file (so, change _ma_keydef_write() etc). That + would also enable Recovery to finish a CREATE TABLE. The final result + would be that we would be able to finish what the SQL layer has asked + for: it would be atomic. + When in CREATE/TRUNCATE (or DROP or RENAME or REPAIR) we have not + called external_lock(), so have no TRN. It does not matter, as all + these operations are non-transactional and sync their files. 
+ */ + trnman_init_tmp_trn_for_logging_trid(&tmp_transaction_object); + if (unlikely(translog_write_record(&lsn, + LOGREC_REDO_CREATE_TABLE, + &tmp_transaction_object, NULL, + total_rec_length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) || + translog_flush(lsn))) + goto err; + share.kfile.file= file; + DBUG_EXECUTE_IF("maria_flush_whole_log", + { + DBUG_PRINT("maria_flush_whole_log", ("now")); + translog_flush(translog_get_horizon()); + }); + DBUG_EXECUTE_IF("maria_crash_create_table", + { + DBUG_PRINT("maria_crash_create_table", ("now")); + DBUG_SUICIDE(); + }); + /* + store LSN into file, needed for Recovery to not be confused if a + DROP+CREATE happened (applying REDOs to the wrong table). + */ + if (_ma_update_state_lsns_sub(&share, lsn, tmp_transaction_object.trid, + FALSE, TRUE)) + goto err; + my_free(log_data); + log_data= 0; + } + + if (!(flags & HA_DONT_TOUCH_DATA)) + { + if (ci->data_file_name) + { + char *dext= strrchr(ci->data_file_name, '.'); + int have_dext= dext && !strcmp(dext, MARIA_NAME_DEXT); + + if (tmp_table) + { + char *path; + /* chop off the table name, tempory tables use generated name */ + if ((path= strrchr(ci->data_file_name, FN_LIBCHAR))) + *path= '\0'; + fn_format(dfilename, name, ci->data_file_name, MARIA_NAME_DEXT, + MY_REPLACE_DIR | MY_UNPACK_FILENAME | MY_APPEND_EXT); + } + else + { + fn_format(dfilename, ci->data_file_name, "", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | + (have_dext ? MY_REPLACE_EXT : MY_APPEND_EXT)); + } + fn_format(dlinkname, name, "",MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + dlinkname_ptr= dlinkname; + create_flag=0; + } + else + { + fn_format(dfilename,name,"", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + create_flag= (flags & HA_CREATE_KEEP_FILES) ? 
0 : MY_DELETE_OLD; + } + ma_debug_crash_here("storage_engine_middle_of_create"); + if ((dfile= + mysql_file_create_with_symlink(key_file_dfile, dlinkname_ptr, + dfilename, 0, create_mode, + MYF(common_flag | create_flag | sync_dir))) < 0) + goto err; + errpos=3; + + if (_ma_initialize_data_file(&share, dfile)) + goto err; + } + + /* Enlarge files */ + DBUG_PRINT("info", ("enlarge to keystart: %lu", + (ulong) share.base.keystart)); + if (mysql_file_chsize(file,(ulong) share.base.keystart,0,MYF(0))) + goto err; + + if (!internal_table && sync_dir && mysql_file_sync(file, MYF(0))) + goto err; + + if (! (flags & HA_DONT_TOUCH_DATA)) + { +#ifdef USE_RELOC + if (mysql_file_chsize(key_file_dfile, dfile, + share.base.min_pack_length*ci->reloc_rows,0,MYF(0))) + goto err; +#endif + if (!internal_table && sync_dir && mysql_file_sync(dfile, MYF(0))) + goto err; + if (mysql_file_close(dfile,MYF(0))) + goto err; + } + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_maria); + res= 0; + my_free((char*) rec_per_key_part); + ma_crypt_free(&share); + errpos=0; + if (mysql_file_close(file,MYF(0))) + res= my_errno; + DBUG_RETURN(res); + +err: + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_maria); + +err_no_lock: + save_errno=my_errno; + switch (errpos) { + case 3: + mysql_file_close(dfile, MYF(0)); + if (! (flags & HA_DONT_TOUCH_DATA)) + { + mysql_file_delete(key_file_dfile, dfilename, MYF(sync_dir)); + if (dlinkname_ptr) + mysql_file_delete(key_file_dfile, dlinkname_ptr, MYF(sync_dir)); + } + /* fall through */ + case 1: + mysql_file_close(file, MYF(0)); + if (! 
(flags & HA_DONT_TOUCH_DATA)) + { + mysql_file_delete(key_file_kfile, kfilename, MYF(sync_dir)); + if (klinkname_ptr) + mysql_file_delete(key_file_kfile, klinkname_ptr, MYF(sync_dir)); + } + } + ma_crypt_free(&share); + my_free(log_data); + my_free(rec_per_key_part); + DBUG_RETURN(my_errno=save_errno); /* return the fatal errno */ +} + + +uint maria_get_pointer_length(ulonglong file_length, uint def) +{ + DBUG_ASSERT(def >= 2 && def <= 7); + if (file_length) /* If not default */ + { +#ifdef NOT_YET_READY_FOR_8_BYTE_POINTERS + if (file_length >= (1ULL << 56)) + def=8; + else +#endif + if (file_length >= (1ULL << 48)) + def=7; + else if (file_length >= (1ULL << 40)) + def=6; + else if (file_length >= (1ULL << 32)) + def=5; + else if (file_length >= (1ULL << 24)) + def=4; + else if (file_length >= (1ULL << 16)) + def=3; + else + def=2; + } + return def; +} + + +/* + Sort columns for records-in-block + + IMPLEMENTATION + Sort columns in following order: + + Fixed size, not null columns + Fixed length, null fields + Numbers (zero fill fields) + Variable length fields (CHAR, VARCHAR) according to length + Blobs + + For same kind of fields, keep fields in original order +*/ + +static inline int sign(long a) +{ + return a < 0 ? -1 : (a > 0 ? 1 : 0); +} + + +static int compare_columns(MARIA_COLUMNDEF **a_ptr, MARIA_COLUMNDEF **b_ptr) +{ + MARIA_COLUMNDEF *a= *a_ptr, *b= *b_ptr; + enum en_fieldtype a_type, b_type; + + a_type= (a->type == FIELD_CHECK) ? FIELD_NORMAL : a->type; + b_type= (b->type == FIELD_CHECK) ? 
FIELD_NORMAL : b->type; + + if (a_type == FIELD_NORMAL && !a->null_bit) + { + if (b_type != FIELD_NORMAL || b->null_bit) + return -1; + return sign((long) a->offset - (long) b->offset); + } + if (b_type == FIELD_NORMAL && !b->null_bit) + return 1; + if (a_type == b_type) + return sign((long) a->offset - (long) b->offset); + if (a_type == FIELD_NORMAL) + return -1; + if (b_type == FIELD_NORMAL) + return 1; + if (a_type == FIELD_SKIP_ZERO) + return -1; + if (b_type == FIELD_SKIP_ZERO) + return 1; + if (a->type != FIELD_BLOB && b->type != FIELD_BLOB) + if (a->length != b->length) + return sign((long) a->length - (long) b->length); + if (a_type == FIELD_BLOB) + return 1; + if (b_type == FIELD_BLOB) + return -1; + return sign((long) a->offset - (long) b->offset); +} + + +/** + @brief Initialize data file + + @note + In BLOCK_RECORD, a freshly created datafile is one page long; while in + other formats it is 0-byte long. + */ + +int _ma_initialize_data_file(MARIA_SHARE *share, File dfile) +{ + if (share->data_file_type == BLOCK_RECORD) + { + share->bitmap.block_size= share->base.block_size; + share->bitmap.file.file = dfile; + return _ma_bitmap_create_first(share); + } + return 0; +} + + +/** + @brief Writes create_rename_lsn, skip_redo_lsn and is_of_horizon to disk, + can force. + + This is for special cases where: + - we don't want to write the full state to disk (so, not call + _ma_state_info_write()) because some parts of the state may be + currently inconsistent, or because it would be overkill + - we must sync these LSNs immediately for correctness. + It acquires intern_lock to protect the LSNs and state write. 
+ + @param share table's share + @param lsn LSN to write to log files + @param create_trid Trid to be used as state.create_trid + @param do_sync if the write should be forced to disk + @param update_create_rename_lsn if this LSN should be updated or not + + @return Operation status + @retval 0 ok + @retval 1 error (disk problem) +*/ + +int _ma_update_state_lsns(MARIA_SHARE *share, LSN lsn, TrID create_trid, + my_bool do_sync, my_bool update_create_rename_lsn) +{ + int res; + DBUG_ENTER("_ma_update_state_lsns"); + mysql_mutex_lock(&share->intern_lock); + res= _ma_update_state_lsns_sub(share, lsn, create_trid, do_sync, + update_create_rename_lsn); + mysql_mutex_unlock(&share->intern_lock); + DBUG_RETURN(res); +} + + +/** + @brief Writes create_rename_lsn, skip_redo_lsn and is_of_horizon to disk, + can force. + + Shortcut of _ma_update_state_lsns() when we know that intern_lock is not + needed (when creating a table or opening it for the first time). + + @param share table's share + @param lsn LSN to write to state; if LSN_IMPOSSIBLE, write + a LOGREC_IMPORTED_TABLE and use its LSN as lsn. + @param create_trid Trid to be used as state.create_trid + @param do_sync if the write should be forced to disk + @param update_create_rename_lsn if this LSN should be updated or not + + @return Operation status + @retval 0 ok + @retval 1 error (disk problem) +*/ + +#if defined(_MSC_VER) && (_MSC_VER == 1310) +/* + Visual Studio 2003 compiler produces internal compiler error + in this function. Disable optimizations to workaround. 
+*/ +#pragma optimize("",off) +#endif +int _ma_update_state_lsns_sub(MARIA_SHARE *share, LSN lsn, TrID create_trid, + my_bool do_sync, + my_bool update_create_rename_lsn) +{ + uchar buf[LSN_STORE_SIZE * 3], *ptr; + uchar trid_buff[8]; + File file= share->kfile.file; + DBUG_ENTER("_ma_update_state_lsns_sub"); + DBUG_ASSERT(file >= 0); + CRASH_IF_S3_TABLE(share); + + if (lsn == LSN_IMPOSSIBLE) + { + int res; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + /* table name is logged only for information */ + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= + (uchar *)(share->open_file_name.str); + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= + share->open_file_name.length + 1; + if ((res= translog_write_record(&lsn, LOGREC_IMPORTED_TABLE, + &dummy_transaction_object, NULL, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL))) + DBUG_RETURN(res); + } + + for (ptr= buf; ptr < (buf + sizeof(buf)); ptr+= LSN_STORE_SIZE) + lsn_store(ptr, lsn); + share->state.skip_redo_lsn= share->state.is_of_horizon= lsn; + share->state.create_trid= create_trid; + mi_int8store(trid_buff, create_trid); + + /* + Update create_rename_lsn if update was requested or if the old one had an + impossible value. + */ + if (update_create_rename_lsn || + (share->state.create_rename_lsn > lsn && lsn != LSN_IMPOSSIBLE)) + { + share->state.create_rename_lsn= lsn; + if (share->id != 0) + { + /* + If OP is the operation which is calling us, if table is later written, + we could see in the log: + FILE_ID ... REDO_OP ... REDO_INSERT. + (that can happen in real life at least with OP=REPAIR). + As FILE_ID will be ignored by Recovery because it is < + create_rename_lsn, REDO_INSERT would be ignored too, wrongly. 
+ To avoid that, we force a LOGREC_FILE_ID to be logged at next write: + */ + translog_deassign_id_from_share(share); + } + } + else + lsn_store(buf, share->state.create_rename_lsn); + DBUG_RETURN(my_pwrite(file, buf, sizeof(buf), + sizeof(share->state.header) + + MARIA_FILE_CREATE_RENAME_LSN_OFFSET, MYF(MY_NABP)) || + my_pwrite(file, trid_buff, sizeof(trid_buff), + sizeof(share->state.header) + + MARIA_FILE_CREATE_TRID_OFFSET, MYF(MY_NABP)) || + (do_sync && mysql_file_sync(file, MYF(0)))); +} +#if defined(_MSC_VER) && (_MSC_VER == 1310) +#pragma optimize("",on) +#endif /*VS2003 compiler bug workaround*/ diff --git a/storage/maria/ma_crypt.c b/storage/maria/ma_crypt.c new file mode 100644 index 00000000..1714fc6e --- /dev/null +++ b/storage/maria/ma_crypt.c @@ -0,0 +1,548 @@ +/* + Copyright (c) 2013 Google Inc. + Copyright (c) 2014, 2015 MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "ma_blockrec.h" +#include <my_crypt.h> + +#define CRYPT_SCHEME_1 1 +#define CRYPT_SCHEME_1_ID_LEN 4 /* 4 bytes for counter-block */ +#define CRYPT_SCHEME_1_IV_LEN 16 +#define CRYPT_SCHEME_1_KEY_VERSION_SIZE 4 + +#ifdef HAVE_PSI_INTERFACE +PSI_mutex_key key_CRYPT_DATA_lock; +#endif + +struct st_crypt_key +{ + uint key_version; + uchar key[CRYPT_SCHEME_1_IV_LEN]; +}; + +struct st_maria_crypt_data +{ + struct st_encryption_scheme scheme; + uint space; + mysql_mutex_t lock; /* protecting keys */ +}; + +/** + determine what key id to use for Aria encryption + + Same logic as for tempfiles: if key id 2 exists - use it, + otherwise use key id 1. + + Key id 1 is system, it always exists. Key id 2 is optional, + it allows to specify fast low-grade encryption for temporary data. 
+*/ +static uint get_encryption_key_id(MARIA_SHARE *share) +{ + if (share->options & HA_OPTION_TMP_TABLE && + encryption_key_id_exists(ENCRYPTION_KEY_TEMPORARY_DATA)) + return ENCRYPTION_KEY_TEMPORARY_DATA; + else + return ENCRYPTION_KEY_SYSTEM_DATA; +} + +uint +ma_crypt_get_data_page_header_space() +{ + return CRYPT_SCHEME_1_KEY_VERSION_SIZE; +} + +uint +ma_crypt_get_index_page_header_space(MARIA_SHARE *share) +{ + if (share->base.born_transactional) + { + return CRYPT_SCHEME_1_KEY_VERSION_SIZE; + } + else + { + /* if the index is not transactional, we add 7 bytes LSN anyway + to be used for counter block + */ + return LSN_STORE_SIZE + CRYPT_SCHEME_1_KEY_VERSION_SIZE; + } +} + +uint +ma_crypt_get_file_length() +{ + return 2 + CRYPT_SCHEME_1_IV_LEN + CRYPT_SCHEME_1_ID_LEN; +} + +static void crypt_data_scheme_locker(struct st_encryption_scheme *scheme, + int unlock) +{ + MARIA_CRYPT_DATA *crypt_data = (MARIA_CRYPT_DATA*)scheme; + if (unlock) + mysql_mutex_unlock(&crypt_data->lock); + else + mysql_mutex_lock(&crypt_data->lock); +} + +int +ma_crypt_create(MARIA_SHARE* share) +{ + uint key_version; + MARIA_CRYPT_DATA *crypt_data= + (MARIA_CRYPT_DATA*)my_malloc(PSI_INSTRUMENT_ME, sizeof(MARIA_CRYPT_DATA), MYF(MY_ZEROFILL)); + crypt_data->scheme.type= CRYPT_SCHEME_1; + crypt_data->scheme.locker= crypt_data_scheme_locker; + mysql_mutex_init(key_CRYPT_DATA_lock, &crypt_data->lock, MY_MUTEX_INIT_FAST); + crypt_data->scheme.key_id= get_encryption_key_id(share); + my_random_bytes(crypt_data->scheme.iv, sizeof(crypt_data->scheme.iv)); + my_random_bytes((uchar*)&crypt_data->space, sizeof(crypt_data->space)); + share->crypt_data= crypt_data; + share->crypt_page_header_space= CRYPT_SCHEME_1_KEY_VERSION_SIZE; + + key_version = encryption_key_get_latest_version(crypt_data->scheme.key_id); + if (unlikely(key_version == ENCRYPTION_KEY_VERSION_INVALID)) + { + my_errno= HA_ERR_NO_ENCRYPTION; + my_printf_error(HA_ERR_NO_ENCRYPTION, + "Initialization of encryption failed for %s", MYF(0), 
+ share->data_file_name.str); + return 1; + } + return 0; +} + +void +ma_crypt_free(MARIA_SHARE* share) +{ + if (share->crypt_data != NULL) + { + mysql_mutex_destroy(&share->crypt_data->lock); + my_free(share->crypt_data); + share->crypt_data= NULL; + } +} + +int +ma_crypt_write(MARIA_SHARE* share, File file) +{ + MARIA_CRYPT_DATA *crypt_data= share->crypt_data; + uchar buff[2 + 4 + sizeof(crypt_data->scheme.iv)]; + if (crypt_data == 0) + return 0; + + buff[0]= crypt_data->scheme.type; + buff[1]= sizeof(buff) - 2; + + int4store(buff + 2, crypt_data->space); + memcpy(buff + 6, crypt_data->scheme.iv, sizeof(crypt_data->scheme.iv)); + + if (mysql_file_write(file, buff, sizeof(buff), MYF(MY_NABP))) + return 1; + + return 0; +} + +uchar* +ma_crypt_read(MARIA_SHARE* share, uchar *buff, my_bool silent) +{ + uchar type= buff[0]; + uchar iv_length= buff[1]; + + /* currently only supported type */ + if (type != CRYPT_SCHEME_1 || + iv_length != sizeof(((MARIA_CRYPT_DATA*)1)->scheme.iv) + 4) + { + my_printf_error(HA_ERR_UNSUPPORTED, + "Unsupported crypt scheme type: %d iv_length: %d\n", + MYF(ME_ERROR_LOG | (silent ? 
ME_WARNING : ME_FATAL)), + type, iv_length); + return 0; + } + + if (share->crypt_data == NULL) + { + /* opening a table */ + MARIA_CRYPT_DATA *crypt_data= + (MARIA_CRYPT_DATA*)my_malloc(PSI_INSTRUMENT_ME, sizeof(MARIA_CRYPT_DATA), MYF(MY_ZEROFILL)); + uint key_version; + + crypt_data->scheme.type= type; + mysql_mutex_init(key_CRYPT_DATA_lock, &crypt_data->lock, + MY_MUTEX_INIT_FAST); + crypt_data->scheme.locker= crypt_data_scheme_locker; + crypt_data->scheme.key_id= get_encryption_key_id(share); + crypt_data->space= uint4korr(buff + 2); + memcpy(crypt_data->scheme.iv, buff + 6, sizeof(crypt_data->scheme.iv)); + share->crypt_data= crypt_data; + + key_version= encryption_key_get_latest_version(crypt_data->scheme.key_id); + if (unlikely(key_version == ENCRYPTION_KEY_VERSION_INVALID)) + { + my_errno= HA_ERR_NO_ENCRYPTION; + my_printf_error(HA_ERR_NO_ENCRYPTION, + "Initialization of encryption failed for %s", + MYF(ME_ERROR_LOG | (silent ? ME_WARNING : ME_FATAL)), + share->data_file_name.str); + return 0; + } + } + + share->crypt_page_header_space= CRYPT_SCHEME_1_KEY_VERSION_SIZE; + return buff + 2 + iv_length; +} + +static int ma_encrypt(MARIA_SHARE *, MARIA_CRYPT_DATA *, const uchar *, + uchar *, uint, uint, LSN, uint *); +static int ma_decrypt(MARIA_SHARE *, MARIA_CRYPT_DATA *, const uchar *, + uchar *, uint, uint, LSN, uint); + +static my_bool ma_crypt_pre_read_hook(PAGECACHE_IO_HOOK_ARGS *args) +{ + MARIA_SHARE *share= (MARIA_SHARE*) args->data; + uchar *crypt_buf= my_malloc(PSI_INSTRUMENT_ME, share->block_size, MYF(0)); + if (crypt_buf == NULL) + { + args->crypt_buf= NULL; /* for post-hook */ + return 1; + } + + /* swap pointers to read into crypt_buf */ + args->crypt_buf= args->page; + args->page= crypt_buf; + + return 0; +} + +static my_bool ma_crypt_data_post_read_hook(int res, + PAGECACHE_IO_HOOK_ARGS *args) +{ + MARIA_SHARE *share= (MARIA_SHARE*) args->data; + const uint size= share->block_size; + const uchar page_type= args->page[PAGE_TYPE_OFFSET] & 
PAGE_TYPE_MASK; + const uint32 key_version_offset= (page_type <= TAIL_PAGE) ? + KEY_VERSION_OFFSET : FULL_PAGE_KEY_VERSION_OFFSET; + + if (res == 0) + { + const uchar *src= args->page; + uchar* dst= args->crypt_buf; + uint pageno= (uint)args->pageno; + LSN lsn= lsn_korr(src); + const uint head= (page_type <= TAIL_PAGE) ? + PAGE_HEADER_SIZE(share) : FULL_PAGE_HEADER_SIZE(share); + const uint tail= CRC_SIZE; + const uint32 key_version= uint4korr(src + key_version_offset); + + /* 1 - copy head */ + memcpy(dst, src, head); + /* 2 - decrypt page */ + res= ma_decrypt(share, share->crypt_data, + src + head, dst + head, size - (head + tail), pageno, lsn, + key_version); + /* 3 - copy tail */ + memcpy(dst + size - tail, src + size - tail, tail); + /* 4 clear key version to get correct crc */ + int4store(dst + key_version_offset, 0); + } + + if (args->crypt_buf != NULL) + { + uchar *tmp= args->page; + args->page= args->crypt_buf; + args->crypt_buf= NULL; + my_free(tmp); + } + + return maria_page_crc_check_data(res, args); +} + +static void store_rand_lsn(uchar * page) +{ + LSN lsn= 0; + lsn+= rand(); + lsn<<= 32; + lsn+= rand(); + lsn_store(page, lsn); +} + +static my_bool ma_crypt_data_pre_write_hook(PAGECACHE_IO_HOOK_ARGS *args) +{ + MARIA_SHARE *share= (MARIA_SHARE*) args->data; + const uint size= share->block_size; + uint key_version; + uchar *crypt_buf= my_malloc(PSI_INSTRUMENT_ME, share->block_size, MYF(0)); + + if (crypt_buf == NULL) + { + args->crypt_buf= NULL; /* for post-hook */ + return 1; + } + + if (!share->base.born_transactional) + { + /* store a random number instead of LSN (for counter block) */ + store_rand_lsn(args->page); + } + + maria_page_crc_set_normal(args); + + { + const uchar *src= args->page; + uchar* dst= crypt_buf; + uint pageno= (uint)args->pageno; + LSN lsn= lsn_korr(src); + const uchar page_type= src[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK; + const uint head= (page_type <= TAIL_PAGE) ? 
+ PAGE_HEADER_SIZE(share) : FULL_PAGE_HEADER_SIZE(share); + const uint tail= CRC_SIZE; + const uint32 key_version_offset= (page_type <= TAIL_PAGE) ? + KEY_VERSION_OFFSET : FULL_PAGE_KEY_VERSION_OFFSET; + + DBUG_ASSERT(page_type < MAX_PAGE_TYPE); + + /* 1 - copy head */ + memcpy(dst, src, head); + /* 2 - encrypt page */ + if (ma_encrypt(share, share->crypt_data, + src + head, dst + head, size - (head + tail), pageno, lsn, + &key_version)) + return 1; + /* 3 - copy tail */ + memcpy(dst + size - tail, src + size - tail, tail); + /* 4 - store key version */ + int4store(dst + key_version_offset, key_version); + } + + /* swap pointers to instead write out the encrypted block */ + args->crypt_buf= args->page; + args->page= crypt_buf; + + return 0; +} + +static void ma_crypt_post_write_hook(int res, + PAGECACHE_IO_HOOK_ARGS *args) +{ + if (args->crypt_buf != NULL) + { + uchar *tmp= args->page; + args->page= args->crypt_buf; + args->crypt_buf= NULL; + my_free(tmp); + } + + maria_page_write_failure(res, args); +} + +void ma_crypt_set_data_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share + __attribute__((unused))) +{ + /* Only use encryption if we have defined it */ + if (encryption_key_id_exists(get_encryption_key_id(share))) + { + file->pre_read_hook= ma_crypt_pre_read_hook; + file->post_read_hook= ma_crypt_data_post_read_hook; + file->pre_write_hook= ma_crypt_data_pre_write_hook; + file->post_write_hook= ma_crypt_post_write_hook; + } +} + +static my_bool ma_crypt_index_post_read_hook(int res, + PAGECACHE_IO_HOOK_ARGS *args) +{ + MARIA_SHARE *share= (MARIA_SHARE*) args->data; + const uint block_size= share->block_size; + const uint page_used= _ma_get_page_used(share, args->page); + + if (res || + page_used < share->keypage_header || + page_used >= block_size - CRC_SIZE) + { + res= 1; + my_errno= HA_ERR_DECRYPTION_FAILED; + } + else + { + const uchar *src= args->page; + uchar* dst= args->crypt_buf; + uint pageno= (uint)args->pageno; + LSN lsn= lsn_korr(src); + 
const uint head= share->keypage_header; + const uint tail= CRC_SIZE; + const uint32 key_version= _ma_get_key_version(share, src); + /* page_used includes header (but not trailer) */ + const uint size= page_used - head; + + /* 1 - copy head */ + memcpy(dst, src, head); + /* 2 - decrypt page */ + res= ma_decrypt(share, share->crypt_data, + src + head, dst + head, size, pageno, lsn, key_version); + /* 3 - copy tail */ + memcpy(dst + block_size - tail, src + block_size - tail, tail); + /* 4 clear key version to get correct crc */ + _ma_store_key_version(share, dst, 0); + } + + if (args->crypt_buf != NULL) + { + uchar *tmp= args->page; + args->page= args->crypt_buf; + args->crypt_buf= NULL; + my_free(tmp); + } + + return maria_page_crc_check_index(res, args); +} + +static my_bool ma_crypt_index_pre_write_hook(PAGECACHE_IO_HOOK_ARGS *args) +{ + MARIA_SHARE *share= (MARIA_SHARE*) args->data; + const uint block_size= share->block_size; + const uint page_used= _ma_get_page_used(share, args->page); + uint key_version; + uchar *crypt_buf= my_malloc(PSI_INSTRUMENT_ME, block_size, MYF(0)); + if (crypt_buf == NULL) + { + args->crypt_buf= NULL; /* for post-hook */ + return 1; + } + + if (!share->base.born_transactional) + { + /* store a random number instead of LSN (for counter block) */ + store_rand_lsn(args->page); + } + + maria_page_crc_set_index(args); + + { + const uchar *src= args->page; + uchar* dst= crypt_buf; + uint pageno= (uint)args->pageno; + LSN lsn= lsn_korr(src); + const uint head= share->keypage_header; + const uint tail= CRC_SIZE; + /* page_used includes header (but not trailer) */ + const uint size= page_used - head; + + /* 1 - copy head */ + memcpy(dst, src, head); + /* 2 - encrypt page */ + if (ma_encrypt(share, share->crypt_data, + src + head, dst + head, size, pageno, lsn, &key_version)) + { + my_free(crypt_buf); + return 1; + } + /* 3 - copy tail */ + memcpy(dst + block_size - tail, src + block_size - tail, tail); + /* 4 - store key version */ + 
_ma_store_key_version(share, dst, key_version); +#ifdef HAVE_valgrind + /* 5 - keep valgrind happy by zeroing not used bytes */ + bzero(dst+head+size, block_size - size - tail - head); +#endif + } + + /* swap pointers to instead write out the encrypted block */ + args->crypt_buf= args->page; + args->page= crypt_buf; + + return 0; +} + +void ma_crypt_set_index_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share + __attribute__((unused))) +{ + file->pre_read_hook= ma_crypt_pre_read_hook; + file->post_read_hook= ma_crypt_index_post_read_hook; + file->pre_write_hook= ma_crypt_index_pre_write_hook; + file->post_write_hook= ma_crypt_post_write_hook; +} + +static int ma_encrypt(MARIA_SHARE *share, MARIA_CRYPT_DATA *crypt_data, + const uchar *src, uchar *dst, uint size, + uint pageno, LSN lsn, + uint *key_version) +{ + int rc; + uint32 dstlen= 0; /* Must be set because of error message */ + + *key_version = encryption_key_get_latest_version(crypt_data->scheme.key_id); + if (unlikely(*key_version == ENCRYPTION_KEY_VERSION_INVALID)) + { + /* + We use this error for both encryption and decryption, as in normal + cases it should be impossible to get an error here. + */ + my_errno= HA_ERR_DECRYPTION_FAILED; + my_printf_error(HA_ERR_DECRYPTION_FAILED, + "Unknown encryption key id %u for %s. Can't continue!", + MYF(ME_FATAL|ME_ERROR_LOG), + crypt_data->scheme.key_id, + share->open_file_name.str); + return 1; + } + + rc= encryption_scheme_encrypt(src, size, dst, &dstlen, + &crypt_data->scheme, *key_version, + crypt_data->space, pageno, lsn); + + /* The following can only fail if the encryption key is wrong */ + DBUG_ASSERT(!my_assert_on_error || rc == MY_AES_OK); + DBUG_ASSERT(!my_assert_on_error || dstlen == size); + if (! 
(rc == MY_AES_OK && dstlen == size)) + { + my_errno= HA_ERR_DECRYPTION_FAILED; + my_printf_error(HA_ERR_DECRYPTION_FAILED, + "failed to encrypt '%s' rc: %d dstlen: %u size: %u\n", + MYF(ME_FATAL|ME_ERROR_LOG), + share->open_file_name.str, rc, dstlen, size); + return 1; + } + + return 0; +} + +static int ma_decrypt(MARIA_SHARE *share, MARIA_CRYPT_DATA *crypt_data, + const uchar *src, uchar *dst, uint size, + uint pageno, LSN lsn, + uint key_version) +{ + int rc; + uint32 dstlen= 0; /* Must be set because of error message */ + + rc= encryption_scheme_decrypt(src, size, dst, &dstlen, + &crypt_data->scheme, key_version, + crypt_data->space, pageno, lsn); + + DBUG_ASSERT(!my_assert_on_error || rc == MY_AES_OK); + DBUG_ASSERT(!my_assert_on_error || dstlen == size); + if (! (rc == MY_AES_OK && dstlen == size)) + { + my_errno= HA_ERR_DECRYPTION_FAILED; + if (!share->silence_encryption_errors) + my_printf_error(HA_ERR_DECRYPTION_FAILED, + "failed to decrypt '%s' rc: %d dstlen: %u size: %u\n", + MYF(ME_FATAL|ME_ERROR_LOG), + share->open_file_name.str, rc, dstlen, size); + return 1; + } + return 0; +} diff --git a/storage/maria/ma_crypt.h b/storage/maria/ma_crypt.h new file mode 100644 index 00000000..acaf36ee --- /dev/null +++ b/storage/maria/ma_crypt.h @@ -0,0 +1,40 @@ +/* + Copyright (c) 2013 Google Inc. + Copyright (c) 2014, 2015 MariaDB Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef STORAGE_MARIA_MA_CRYPT_INCLUDED +#define STORAGE_MARIA_MA_CRYPT_INCLUDED + +struct st_maria_share; +struct st_pagecache_file; + +uint ma_crypt_get_data_page_header_space();/* bytes in data/index page header */ +uint ma_crypt_get_index_page_header_space(struct st_maria_share *); +uint ma_crypt_get_file_length(); /* bytes needed in file */ +int ma_crypt_create(struct st_maria_share *); /* create encryption data */ +int ma_crypt_write(struct st_maria_share *, File); /* write encryption data */ +uchar* ma_crypt_read(struct st_maria_share *, uchar *buff, + my_bool silent); /* read crypt data*/ + +void ma_crypt_set_data_pagecache_callbacks(struct st_pagecache_file *file, + struct st_maria_share *share); + +void ma_crypt_set_index_pagecache_callbacks(struct st_pagecache_file *file, + struct st_maria_share *share); + +void ma_crypt_free(struct st_maria_share *share); + +#endif diff --git a/storage/maria/ma_dbug.c b/storage/maria/ma_dbug.c new file mode 100644 index 00000000..63bbc9ed --- /dev/null +++ b/storage/maria/ma_dbug.c @@ -0,0 +1,201 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Support routines which are used with dbug */ + +#include "maria_def.h" + +void _ma_print_key(FILE *stream, MARIA_KEY *key) +{ + _ma_print_keydata(stream, key->keyinfo->seg, key->data, key->data_length); +} + + +/* Print a key in a user understandable format */ + +void _ma_print_keydata(FILE *stream, register HA_KEYSEG *keyseg, + const uchar *key, uint length) +{ + int flag; + short int s_1; + long int l_1; + float f_1; + double d_1; + const uchar *end; + const uchar *key_end= key + length; + + fputs("Key: \"",stream); + flag=0; + for (; keyseg->type && key < key_end ;keyseg++) + { + if (flag++) + putc('-',stream); + end= key+ keyseg->length; + if (keyseg->flag & HA_NULL_PART) + { + /* A NULL value is encoded by a 1-byte flag. Zero means NULL. */ + if (! *(key++)) + { + fprintf(stream,"NULL"); + continue; + } + end++; + } + + switch (keyseg->type) { + case HA_KEYTYPE_BINARY: + if (!(keyseg->flag & HA_SPACE_PACK) && keyseg->length == 1) + { /* packed binary digit */ + fprintf(stream,"%d",(uint) *key++); + break; + } + /* fall through */ + case HA_KEYTYPE_TEXT: + case HA_KEYTYPE_NUM: + if (keyseg->flag & HA_SPACE_PACK) + { + fprintf(stream,"%.*s",(int) *key,key+1); + key+= (int) *key+1; + } + else + { + fprintf(stream,"%.*s",(int) keyseg->length,key); + key=end; + } + break; + case HA_KEYTYPE_INT8: + fprintf(stream,"%d",(int) *((const signed char*) key)); + key=end; + break; + case HA_KEYTYPE_SHORT_INT: + s_1= mi_sint2korr(key); + fprintf(stream,"%d",(int) s_1); + key=end; + break; + case HA_KEYTYPE_USHORT_INT: + { + ushort u_1; + u_1= mi_uint2korr(key); + fprintf(stream,"%u",(uint) u_1); + key=end; + break; + } + case HA_KEYTYPE_LONG_INT: + l_1=mi_sint4korr(key); + fprintf(stream,"%ld",l_1); + key=end; + break; + case HA_KEYTYPE_ULONG_INT: + 
l_1=mi_uint4korr(key); + fprintf(stream,"%lu",(ulong) l_1); + key=end; + break; + case HA_KEYTYPE_INT24: + fprintf(stream,"%ld",(long) mi_sint3korr(key)); + key=end; + break; + case HA_KEYTYPE_UINT24: + fprintf(stream,"%lu",(ulong) mi_uint3korr(key)); + key=end; + break; + case HA_KEYTYPE_FLOAT: + mi_float4get(f_1,key); + fprintf(stream,"%g",(double) f_1); + key=end; + break; + case HA_KEYTYPE_DOUBLE: + mi_float8get(d_1,key); + fprintf(stream,"%g",d_1); + key=end; + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + { + char buff[21]; + longlong10_to_str(mi_sint8korr(key),buff,-10); + fprintf(stream,"%s",buff); + key=end; + break; + } + case HA_KEYTYPE_ULONGLONG: + { + char buff[21]; + longlong10_to_str(mi_sint8korr(key),buff,10); + fprintf(stream,"%s",buff); + key=end; + break; + } +#endif + case HA_KEYTYPE_BIT: + { + uint i; + fputs("0x",stream); + for (i=0 ; i < keyseg->length ; i++) + fprintf(stream, "%02x", (uint) *key++); + key= end; + break; + } + case HA_KEYTYPE_VARTEXT1: /* VARCHAR and TEXT */ + case HA_KEYTYPE_VARTEXT2: /* VARCHAR and TEXT */ + case HA_KEYTYPE_VARBINARY1: /* VARBINARY and BLOB */ + case HA_KEYTYPE_VARBINARY2: /* VARBINARY and BLOB */ + { + uint tmp_length; + get_key_length(tmp_length,key); + /* + The following command sometimes gives a warning from valgrind. 
+ Not yet sure if the bug is in valgrind, glibc or mysqld + */ + fprintf(stream,"%.*s",(int) tmp_length,key); + key+=tmp_length; + break; + } + default: break; /* This never happens */ + } + } + fputs("\"\n",stream); + return; +} /* print_key */ + + +#ifdef EXTRA_DEBUG + +my_bool _ma_check_table_is_closed(const char *name, const char *where) +{ + char filename[FN_REFLEN]; + LIST *pos; + DBUG_ENTER("_ma_check_table_is_closed"); + + (void) fn_format(filename,name,"",MARIA_NAME_IEXT,4+16+32); + mysql_mutex_lock(&THR_LOCK_maria); + for (pos=maria_open_list ; pos ; pos=pos->next) + { + MARIA_HA *info=(MARIA_HA*) pos->data; + MARIA_SHARE *share= info->s; + if (!strcmp(share->unique_file_name.str, filename)) + { + if (share->last_version) + { + fprintf(stderr,"Warning: Table: %s is open on %s\n", name,where); + DBUG_PRINT("warning",("Table: %s is open on %s", name,where)); + mysql_mutex_unlock(&THR_LOCK_maria); + DBUG_RETURN(1); + } + } + } + mysql_mutex_unlock(&THR_LOCK_maria); + DBUG_RETURN(0); +} +#endif /* EXTRA_DEBUG */ diff --git a/storage/maria/ma_delete.c b/storage/maria/ma_delete.c new file mode 100644 index 00000000..b24cfcc6 --- /dev/null +++ b/storage/maria/ma_delete.c @@ -0,0 +1,1699 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (C) 2009-2010 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "ma_fulltext.h" +#include "ma_rt_index.h" +#include "trnman.h" +#include "ma_key_recover.h" + +static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag, + MARIA_PAGE *page); +static int del(MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page, + uchar *keypos, my_off_t next_block, uchar *ret_key_buff); +static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page, + uchar *keypos); +static uint remove_key(MARIA_KEYDEF *keyinfo, uint page_flag, uint nod_flag, + uchar *keypos, uchar *lastkey, uchar *page_end, + my_off_t *next_block, MARIA_KEY_PARAM *s_temp); + +/* @brief Remove a row from a MARIA table */ + +int maria_delete(MARIA_HA *info,const uchar *record) +{ + uint i; + uchar *old_key; + int save_errno; + char lastpos[8]; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_delete"); + + /* Test if record is in datafile */ + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage", + _ma_print_error(info, HA_ERR_CRASHED, 0); + DBUG_RETURN(my_errno= HA_ERR_CRASHED);); + DBUG_EXECUTE_IF("my_error_test_undefined_error", + _ma_print_error(info, INT_MAX, 0); + DBUG_RETURN(my_errno= INT_MAX);); + if (!(info->update & HA_STATE_AKTIV)) + { + DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); /* No database read */ + } + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + if ((*share->compare_record)(info,record)) + goto err; /* Error on read-check */ + + if (_ma_mark_file_changed(share)) + goto err; + + /* Ensure we don't change the autoincrement value */ + info->last_auto_increment= ~(ulonglong) 0; + /* Remove all keys from the index file */ + + old_key= 
info->lastkey_buff2; + + for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++) + { + if (maria_is_key_active(share->state.key_map, i)) + { + keyinfo->version++; + if (keyinfo->flag & HA_FULLTEXT) + { + if (_ma_ft_del(info, i, old_key, record, info->cur_row.lastpos)) + goto err; + } + else + { + MARIA_KEY key; + if (keyinfo->ck_delete(info, + (*keyinfo->make_key)(info, &key, i, old_key, + record, + info->cur_row.lastpos, + info->cur_row.trid))) + goto err; + } + /* The above changed info->lastkey2. Inform maria_rnext_same(). */ + info->update&= ~HA_STATE_RNEXT_SAME; + } + } + + if (share->calc_checksum) + { + /* + We can't use the row based checksum as this doesn't have enough + precision. + */ + info->cur_row.checksum= (*share->calc_checksum)(info, record); + } + + if ((*share->delete_record)(info, record)) + goto err; /* Remove record from database */ + + info->state->checksum-= info->cur_row.checksum; + info->state->records--; + info->update= HA_STATE_CHANGED+HA_STATE_DELETED+HA_STATE_ROW_CHANGED; + info->row_changes++; + share->state.changed|= (STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_MOVABLE | + STATE_NOT_ZEROFILLED); + info->state->changed=1; + + mi_sizestore(lastpos, info->cur_row.lastpos); + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); + if (info->invalidator != 0) + { + DBUG_PRINT("info", ("invalidator... 
'%s' (delete)", + share->open_file_name.str)); + (*info->invalidator)(share->open_file_name.str); + info->invalidator=0; + } + DBUG_RETURN(0); + +err: + save_errno= my_errno; + DBUG_ASSERT(save_errno); + if (!save_errno) + save_errno= HA_ERR_INTERNAL_ERROR; /* Should never happen */ + + mi_sizestore(lastpos, info->cur_row.lastpos); + (void) _ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE); + info->update|=HA_STATE_WRITTEN; /* Buffer changed */ + if (save_errno != HA_ERR_RECORD_CHANGED) + { + _ma_set_fatal_error(info, HA_ERR_CRASHED); + save_errno= HA_ERR_CRASHED; + } + DBUG_RETURN(my_errno= save_errno); +} /* maria_delete */ + + +/* + Remove a key from the btree index + + TODO: + Change ma_ck_real_delete() to use another buffer for changed keys instead + of key->data. This would allows us to remove the copying of the key here. +*/ + +my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key) +{ + MARIA_SHARE *share= info->s; + int res; + my_bool buff_alloced; + LSN lsn= LSN_IMPOSSIBLE; + my_off_t new_root= share->state.key_root[key->keyinfo->key_nr]; + uchar *key_buff, *save_key_data; + MARIA_KEY org_key; + DBUG_ENTER("_ma_ck_delete"); + + LINT_INIT_STRUCT(org_key); + + alloc_on_stack(*info->stack_end_ptr, key_buff, buff_alloced, + key->keyinfo->max_store_length); + if (!key_buff) + DBUG_RETURN(1); + + save_key_data= key->data; + if (share->now_transactional) + { + /* Save original value as the key may change */ + memcpy(key_buff, key->data, key->data_length + key->ref_length); + org_key= *key; + key->data= key_buff; + } + + if ((res= _ma_ck_real_delete(info, key, &new_root))) + { + /* We have to mark the table crashed before unpin_all_pages() */ + maria_mark_crashed(info); + } + + key->data= save_key_data; + if (!res && share->now_transactional) + res= _ma_write_undo_key_delete(info, &org_key, new_root, &lsn); + else + { + share->state.key_root[key->keyinfo->key_nr]= new_root; + _ma_fast_unlock_key_del(info); + } + _ma_unpin_all_pages_and_finalize_row(info, lsn); + + 
stack_alloc_free(key_buff, buff_alloced); + DBUG_RETURN(res != 0); +} /* _ma_ck_delete */ + + +my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key, + my_off_t *root) +{ + int error; + my_bool result= 0, buff_alloced; + my_off_t old_root; + uchar *root_buff; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("_ma_ck_real_delete"); + + if ((old_root=*root) == HA_OFFSET_ERROR) + { + _ma_set_fatal_error(info, HA_ERR_CRASHED); + DBUG_RETURN(1); + } + + alloc_on_stack(*info->stack_end_ptr, root_buff, buff_alloced, + (keyinfo->block_length + keyinfo->max_store_length*2)); + if (!root_buff) + DBUG_RETURN(1); + + DBUG_PRINT("info",("root_page: %lu", + (ulong) (old_root / keyinfo->block_length))); + if (_ma_fetch_keypage(&page, info, keyinfo, old_root, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, root_buff, 0)) + { + result= 1; + goto err; + } + if ((error= d_search(info, key, (keyinfo->flag & HA_FULLTEXT ? + SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT: + SEARCH_SAME), + &page))) + { + if (error < 0) + result= 1; + else if (error == 2) + { + DBUG_PRINT("test",("Enlarging of root when deleting")); + if (_ma_enlarge_root(info, key, root)) + result= 1; + } + else /* error == 1 */ + { + MARIA_SHARE *share= info->s; + + page_mark_changed(info, &page); + + if (page.size <= page.node + share->keypage_header + 1) + { + DBUG_ASSERT(page.size == page.node + share->keypage_header); + if (page.node) + *root= _ma_kpos(page.node, root_buff +share->keypage_header + + page.node); + else + *root=HA_OFFSET_ERROR; + if (_ma_dispose(info, old_root, 0)) + result= 1; + } + else if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + result= 1; + } + } +err: + stack_alloc_free(root_buff, buff_alloced); + DBUG_PRINT("exit",("Return: %d",result)); + DBUG_RETURN(result); +} /* _ma_ck_real_delete */ + + +/** + @brief Remove key below key root + + @param key Key to delete. 
Will contain new key if block was enlarged + + @return + @retval 0 ok (anc_page is not changed) + @retval 1 If data on page is too small; In this case anc_buff is not saved + @retval 2 If data on page is too big + @retval -1 On errors +*/ + +static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag, + MARIA_PAGE *anc_page) +{ + int flag,ret_value,save_flag; + uint nod_flag, page_flag; + my_bool last_key, buff_alloced= 0, lastkey_alloced; + uchar *leaf_buff=0, *keypos, *lastkey; + MARIA_KEY_PARAM s_temp; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE leaf_page; + DBUG_ENTER("d_search"); + DBUG_DUMP("page", anc_page->buff, anc_page->size); + + alloc_on_stack(*info->stack_end_ptr, lastkey, lastkey_alloced, + keyinfo->max_store_length); + if (!lastkey) + DBUG_RETURN(1); + + flag=(*keyinfo->bin_search)(key, anc_page, comp_flag, &keypos, lastkey, + &last_key); + if (flag == MARIA_FOUND_WRONG_KEY) + { + DBUG_PRINT("error",("Found wrong key")); + goto err; + } + page_flag= anc_page->flag; + nod_flag= anc_page->node; + + if (!flag && (keyinfo->flag & HA_FULLTEXT)) + { + uint off; + int subkeys; + + get_key_full_length_rdonly(off, lastkey); + subkeys=ft_sintXkorr(lastkey+off); + DBUG_ASSERT(info->ft1_to_ft2==0 || subkeys >=0); + comp_flag=SEARCH_SAME; + if (subkeys >= 0) + { + /* normal word, one-level tree structure */ + if (info->ft1_to_ft2) + { + /* we're in ft1->ft2 conversion mode. Saving key data */ + insert_dynamic(info->ft1_to_ft2, (lastkey+off)); + } + else + { + /* we need exact match only if not in ft1->ft2 conversion mode */ + flag=(*keyinfo->bin_search)(key, anc_page, comp_flag, &keypos, + lastkey, &last_key); + } + /* fall through to normal delete */ + } + else + { + /* popular word. two-level tree. 
going down */ + uint tmp_key_length; + my_off_t root; + uchar *kpos=keypos; + MARIA_KEY tmp_key; + + tmp_key.data= lastkey; + tmp_key.keyinfo= keyinfo; + + if (!(tmp_key_length=(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, + &kpos))) + { + _ma_set_fatal_error(info, HA_ERR_CRASHED); + goto err; + } + root= _ma_row_pos_from_key(&tmp_key); + if (subkeys == -1) + { + /* the last entry in sub-tree */ + if (_ma_dispose(info, root, 1)) + goto err; + /* fall through to normal delete */ + } + else + { + MARIA_KEY word_key; + keyinfo=&share->ft2_keyinfo; + /* we'll modify key entry 'in vivo' */ + kpos-=keyinfo->keylength+nod_flag; + get_key_full_length_rdonly(off, key->data); + + word_key.data= key->data + off; + word_key.keyinfo= &share->ft2_keyinfo; + word_key.data_length= HA_FT_WLEN; + word_key.ref_length= 0; + word_key.flag= 0; + ret_value= _ma_ck_real_delete(info, &word_key, &root); + _ma_dpointer(share, kpos+HA_FT_WLEN, root); + subkeys++; + ft_intXstore(kpos, subkeys); + if (!ret_value) + { + page_mark_changed(info, anc_page); + ret_value= _ma_write_keypage(anc_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS); + } + goto end; + } + } + } + if (nod_flag) + { + /* Read left child page */ + leaf_page.pos= _ma_kpos(nod_flag,keypos); + + alloc_on_stack(*info->stack_end_ptr, leaf_buff, buff_alloced, + (keyinfo->block_length + keyinfo->max_store_length*2)); + if (!leaf_buff) + goto err; + + if (_ma_fetch_keypage(&leaf_page, info,keyinfo, leaf_page.pos, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, leaf_buff, + 0)) + goto err; + } + + if (flag != 0) + { + if (!nod_flag) + { + /* This should newer happend */ + DBUG_PRINT("error",("Didn't find key")); + _ma_set_fatal_error(info, HA_ERR_CRASHED); + goto err; + } + save_flag=0; + ret_value= d_search(info, key, comp_flag, &leaf_page); + } + else + { /* Found key */ + uint tmp; + uint anc_buff_length= anc_page->size; + uint anc_page_flag= anc_page->flag; + my_off_t next_block; + + if (!(tmp= remove_key(keyinfo, 
anc_page_flag, nod_flag, keypos, lastkey, + anc_page->buff + anc_buff_length, + &next_block, &s_temp))) + goto err; + + page_mark_changed(info, anc_page); + anc_buff_length-= tmp; + anc_page->size= anc_buff_length; + page_store_size(share, anc_page); + + /* + Log initial changes on pages + If there is an underflow, there will be more changes logged to the + page + */ + if (share->now_transactional && + _ma_log_delete(anc_page, s_temp.key_pos, + s_temp.changed_length, s_temp.move_length, + 0, KEY_OP_DEBUG_LOG_DEL_CHANGE_1)) + goto err; + + if (!nod_flag) + { /* On leaf page */ + if (anc_buff_length <= (info->quick_mode ? + MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length)) + { + /* Page will be written by caller if we return 1 */ + ret_value= 1; + goto end; + } + if (_ma_write_keypage(anc_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + + ret_value= 0; /* Return ok */ + goto end; + } + save_flag=1; /* Mark that anc_buff is changed */ + ret_value= del(info, key, anc_page, &leaf_page, + keypos, next_block, lastkey); + } + if (ret_value >0) + { + save_flag= 2; + if (ret_value == 1) + ret_value= underflow(info, keyinfo, anc_page, &leaf_page, keypos); + else + { + /* This can only happen with variable length keys */ + MARIA_KEY last_key; + DBUG_PRINT("test",("Enlarging of key when deleting")); + + last_key.data= lastkey; + last_key.keyinfo= keyinfo; + if (!_ma_get_last_key(&last_key, anc_page, keypos)) + goto err; + ret_value= _ma_insert(info, key, anc_page, keypos, + last_key.data, + (MARIA_PAGE*) 0, (uchar*) 0, (my_bool) 0); + + if (_ma_write_keypage(&leaf_page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + ret_value= -1; + } + } + if (ret_value == 0 && anc_page->size > share->max_index_block_size) + { + /* + parent buffer got too big ; We have to split the page. 
+ The | 2 is there to force write of anc page below + */ + save_flag= 3; + ret_value= _ma_split_page(info, key, anc_page, + share->max_index_block_size, + (uchar*) 0, 0, 0, lastkey, 0) | 2; + DBUG_ASSERT(anc_page->org_size == anc_page->size); + } + if (save_flag && ret_value != 1) + { + page_mark_changed(info, anc_page); + if (_ma_write_keypage(anc_page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + ret_value= -1; + } + else + { + DBUG_DUMP("page", anc_page->buff, anc_page->size); + } + +end: + stack_alloc_free(leaf_buff, buff_alloced); + stack_alloc_free(lastkey, lastkey_alloced); + DBUG_PRINT("exit",("Return: %d",ret_value)); + DBUG_RETURN(ret_value); + +err: + stack_alloc_free(leaf_buff, buff_alloced); + stack_alloc_free(lastkey, lastkey_alloced); + DBUG_PRINT("exit",("Error: %d",my_errno)); + DBUG_RETURN (-1); +} /* d_search */ + + +/** + @brief Remove a key that has a page-reference + + @param info Maria handler + @param key Buffer for key to be inserted at upper level + @param anc_page Page address for page where deleted key was + @param anc_buff Page buffer (nod) where deleted key was + @param leaf_page Page address for nod before the deleted key + @param leaf_buff Buffer for leaf_page + @param leaf_buff_link Pinned page link for leaf_buff + @param keypos Pos to where deleted key was on anc_buff + @param next_block Page adress for nod after deleted key + @param ret_key_buff Key before keypos in anc_buff + + @notes + leaf_page must be written to disk if retval > 0 + anc_page is not updated on disk. Caller should do this + + @return + @retval < 0 Error + @retval 0 OK. 
leaf_buff is written to disk + + @retval 1 key contains key to upper level (from balance page) + leaf_buff has underflow + @retval 2 key contains key to upper level (from split space) +*/ + +static int del(MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page, + uchar *keypos, my_off_t next_block, uchar *ret_key_buff) +{ + int ret_value,length; + uint a_length, page_flag, nod_flag, leaf_length, new_leaf_length; + uchar *keybuff,*endpos,*next_buff,*key_start, *prev_key; + uchar *anc_buff; + my_bool buff_alloced= 0, keybuff_alloced; + MARIA_KEY_PARAM s_temp; + MARIA_KEY tmp_key; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_KEY ret_key; + MARIA_PAGE next_page; + DBUG_ENTER("del"); + DBUG_PRINT("enter",("leaf_page: %lu keypos: %p", + (ulong) (leaf_page->pos / share->block_size), + keypos)); + DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size); + + alloc_on_stack(*info->stack_end_ptr, keybuff, keybuff_alloced, + keyinfo->max_store_length); + if (!keybuff) + DBUG_RETURN(1); + + page_flag= leaf_page->flag; + leaf_length= leaf_page->size; + nod_flag= leaf_page->node; + + endpos= leaf_page->buff + leaf_length; + tmp_key.keyinfo= keyinfo; + tmp_key.data= keybuff; + next_buff= 0; + + if (!(key_start= _ma_get_last_key(&tmp_key, leaf_page, endpos))) + goto err; + + if (nod_flag) + { + next_page.pos= _ma_kpos(nod_flag,endpos); + + alloc_on_stack(*info->stack_end_ptr, next_buff, buff_alloced, + (keyinfo->block_length + keyinfo->max_store_length*2)); + if (!next_buff) + goto err; + + if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, next_buff, 0)) + ret_value= -1; + else + { + DBUG_DUMP("next_page", next_page.buff, next_page.size); + if ((ret_value= del(info, key, anc_page, &next_page, + keypos, next_block, ret_key_buff)) >0) + { + /* Get new length after key was deleted */ + endpos= leaf_page->buff+ leaf_page->size; + if (ret_value == 1) + { + /* underflow writes 
"next_page" to disk */ + ret_value= underflow(info, keyinfo, leaf_page, &next_page, + endpos); + if (ret_value < 0) + goto err; + if (leaf_page->size > share->max_index_block_size) + { + DBUG_ASSERT(ret_value == 0); + ret_value= (_ma_split_page(info, key, leaf_page, + share->max_index_block_size, + (uchar*) 0, 0, 0, + ret_key_buff, 0) | 2); + } + } + else + { + if (_ma_write_keypage(&next_page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + DBUG_PRINT("test",("Inserting of key when deleting")); + if (!_ma_get_last_key(&tmp_key, leaf_page, endpos)) + goto err; + ret_value= _ma_insert(info, key, leaf_page, endpos, + tmp_key.data, (MARIA_PAGE *) 0, (uchar*) 0, + 0); + } + } + page_mark_changed(info, leaf_page); + /* + If ret_value <> 0, then leaf_page underflowed and caller will have + to handle underflow and write leaf_page to disk. + We can't write it here, as if leaf_page is empty we get an assert + in _ma_write_keypage. + */ + if (ret_value == 0 && _ma_write_keypage(leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + stack_alloc_free(next_buff, buff_alloced); + stack_alloc_free(keybuff, keybuff_alloced); + DBUG_ASSERT(leaf_page->size <= share->max_index_block_size); + DBUG_RETURN(ret_value); + } + + /* + Remove last key from leaf page + Note that leaf_page page may only have had one key (can normally only + happen in quick mode), in which ase it will now temporary have 0 keys + on it. This will be corrected by the caller as we will return 0. + */ + new_leaf_length= (uint) (key_start - leaf_page->buff); + leaf_page->size= new_leaf_length; + page_store_size(share, leaf_page); + + if (share->now_transactional && + _ma_log_suffix(leaf_page, leaf_length, new_leaf_length)) + goto err; + + page_mark_changed(info, leaf_page); /* Safety */ + if (new_leaf_length <= (info->quick_mode ? 
MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length)) + { + /* Underflow, leaf_page will be written by caller */ + ret_value= 1; + } + else + { + ret_value= 0; + if (_ma_write_keypage(leaf_page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + + /* Place last key in ancestor page on deleted key position */ + a_length= anc_page->size; + anc_buff= anc_page->buff; + endpos= anc_buff + a_length; + + ret_key.keyinfo= keyinfo; + ret_key.data= ret_key_buff; + + prev_key= 0; + if (keypos != anc_buff+share->keypage_header + share->base.key_reflength) + { + if (!_ma_get_last_key(&ret_key, anc_page, keypos)) + goto err; + prev_key= ret_key.data; + } + length= (*keyinfo->pack_key)(&tmp_key, share->base.key_reflength, + keypos == endpos ? (uchar*) 0 : keypos, + prev_key, prev_key, + &s_temp); + if (length > 0) + bmove_upp(endpos+length,endpos,(uint) (endpos-keypos)); + else + bmove(keypos,keypos-length, (int) (endpos-keypos)+length); + (*keyinfo->store_key)(keyinfo,keypos,&s_temp); + key_start= keypos; + if (tmp_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + _ma_mark_page_with_transid(share, anc_page); + + /* Save pointer to next leaf on parent page */ + if (!(*keyinfo->get_key)(&ret_key, page_flag, share->base.key_reflength, + &keypos)) + goto err; + _ma_kpointer(info,keypos - share->base.key_reflength,next_block); + anc_page->size= a_length + length; + page_store_size(share, anc_page); + + if (share->now_transactional && + _ma_log_add(anc_page, a_length, + key_start, s_temp.changed_length, s_temp.move_length, 1, + KEY_OP_DEBUG_LOG_ADD_2)) + goto err; + + DBUG_ASSERT(leaf_page->size <= share->max_index_block_size); + stack_alloc_free(next_buff, buff_alloced); + stack_alloc_free(keybuff, keybuff_alloced); + DBUG_RETURN(new_leaf_length <= + (info->quick_mode ? 
MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length)); + +err: + stack_alloc_free(next_buff, buff_alloced); + stack_alloc_free(keybuff, keybuff_alloced); + DBUG_RETURN(-1); +} /* del */ + + +/** + @brief Balances adjacent pages if underflow occours + + @fn underflow() + @param anc_buff Anchestor page data + @param leaf_page Leaf page (page that underflowed) + @param leaf_page_link Pointer to pin information about leaf page + @param keypos Position after current key in anc_buff + + @note + This function writes redo entries for all changes + leaf_page is saved to disk + Caller must save anc_buff + + For the algoritm to work, we have to ensure for packed keys that + key_length + (underflow_length + max_block_length + key_length) / 2 + <= block_length. + From which follows that underflow_length <= block_length - key_length *3 + For not packed keys we have: + (underflow_length + max_block_length + key_length) / 2 <= block_length + From which follows that underflow_length < block_length - key_length + This is ensured by setting of underflow_block_length. 
+ + @return + @retval 0 ok + @retval 1 ok, but anc_page did underflow + @retval -1 error + */ + +static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page, + uchar *keypos) +{ + int t_length; + uint anc_length,buff_length,leaf_length,p_length,s_length,nod_flag; + uint next_buff_length, new_buff_length, key_reflength; + uint unchanged_leaf_length, new_leaf_length, new_anc_length; + uint anc_page_flag, page_flag; + uchar *anc_key_buff, *leaf_key_buff; + uchar *endpos, *next_keypos, *anc_pos, *half_pos, *prev_key; + uchar *anc_buff, *leaf_buff; + uchar *after_key, *anc_end_pos; + MARIA_KEY_PARAM key_deleted, key_inserted; + MARIA_SHARE *share= info->s; + my_bool first_key, buff_alloced; + MARIA_KEY tmp_key, anc_key, leaf_key; + MARIA_PAGE next_page; + DBUG_ENTER("underflow"); + DBUG_PRINT("enter",("leaf_page: %lu keypos: %p", + (ulong) (leaf_page->pos / share->block_size), + keypos)); + DBUG_DUMP("anc_buff", anc_page->buff, anc_page->size); + DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size); + + alloc_on_stack(*info->stack_end_ptr, anc_key_buff, buff_alloced, + keyinfo->max_store_length*2); + if (!anc_key_buff) + DBUG_RETURN(1); + + leaf_key_buff= anc_key_buff+ keyinfo->max_store_length; + + anc_page_flag= anc_page->flag; + anc_buff= anc_page->buff; + leaf_buff= leaf_page->buff; + info->keyread_buff_used=1; + next_keypos=keypos; + nod_flag= leaf_page->node; + p_length= nod_flag+share->keypage_header; + anc_length= anc_page->size; + leaf_length= leaf_page->size; + key_reflength= share->base.key_reflength; + if (share->keyinfo+info->lastinx == keyinfo) + info->page_changed=1; + first_key= keypos == anc_buff + share->keypage_header + key_reflength; + + tmp_key.data= info->buff; + anc_key.data= anc_key_buff; + leaf_key.data= leaf_key_buff; + tmp_key.keyinfo= leaf_key.keyinfo= anc_key.keyinfo= keyinfo; + + if ((keypos < anc_buff + anc_length && (info->state->records & 1)) || + first_key) + { + uint tmp_length; + uint 
next_page_flag; + /* Use page right of anc-page */ + DBUG_PRINT("test",("use right page")); + + /* + Calculate position after the current key. Note that keydata itself is + not used + */ + if (keyinfo->flag & HA_BINARY_PACK_KEY) + { + if (!(next_keypos= _ma_get_key(&tmp_key, anc_page, keypos))) + goto err; + } + else + { + /* Avoid length error check if packed key */ + tmp_key.data[0]= tmp_key.data[1]= 0; + /* Got to end of found key */ + if (!(*keyinfo->get_key)(&tmp_key, anc_page_flag, key_reflength, + &next_keypos)) + goto err; + } + next_page.pos= _ma_kpos(key_reflength, next_keypos); + if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, info->buff, 0)) + goto err; + next_buff_length= next_page.size; + next_page_flag= next_page.flag; + DBUG_DUMP("next", next_page.buff, next_page.size); + + /* find keys to make a big key-page */ + bmove(next_keypos-key_reflength, next_page.buff + share->keypage_header, + key_reflength); + + if (!_ma_get_last_key(&anc_key, anc_page, next_keypos) || + !_ma_get_last_key(&leaf_key, leaf_page, leaf_buff+leaf_length)) + goto err; + + /* merge pages and put parting key from anc_page between */ + prev_key= (leaf_length == p_length ? 
(uchar*) 0 : leaf_key.data); + t_length= (*keyinfo->pack_key)(&anc_key, nod_flag, next_page.buff+p_length, + prev_key, prev_key, &key_inserted); + tmp_length= next_buff_length - p_length; + endpos= next_page.buff + tmp_length + leaf_length + t_length; + /* next_page.buff will always be larger than before !*/ + bmove_upp(endpos, next_page.buff + next_buff_length, tmp_length); + memcpy(next_page.buff, leaf_buff,(size_t) leaf_length); + (*keyinfo->store_key)(keyinfo, next_page.buff+leaf_length, &key_inserted); + buff_length= (uint) (endpos - next_page.buff); + + /* Set page flag from combination of both key pages and parting key */ + page_flag= next_page_flag | leaf_page->flag; + if (anc_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + page_flag|= KEYPAGE_FLAG_HAS_TRANSID; + + next_page.size= buff_length; + next_page.flag= page_flag; + page_store_info(share, &next_page); + + /* remove key from anc_page */ + if (!(s_length=remove_key(keyinfo, anc_page_flag, key_reflength, keypos, + anc_key_buff, anc_buff+anc_length, + (my_off_t *) 0, &key_deleted))) + goto err; + + new_anc_length= anc_length - s_length; + anc_page->size= new_anc_length; + page_store_size(share, anc_page); + + if (buff_length <= share->max_index_block_size) + { + /* All keys fitted into one page */ + page_mark_changed(info, &next_page); + if (_ma_dispose(info, next_page.pos, 0)) + goto err; + + memcpy(leaf_buff, next_page.buff, (size_t) buff_length); + leaf_page->size= next_page.size; + leaf_page->flag= next_page.flag; + + if (share->now_transactional) + { + /* + Log changes to parent page. Note that this page may have been + temporarily bigger than block_size. + */ + if (_ma_log_delete(anc_page, key_deleted.key_pos, + key_deleted.changed_length, + key_deleted.move_length, + anc_length - anc_page->org_size, + KEY_OP_DEBUG_LOG_DEL_CHANGE_2)) + goto err; + /* + Log changes to leaf page. 
Data for leaf page is in leaf_buff + which contains original leaf_buff, parting key and next_buff + */ + if (_ma_log_suffix(leaf_page, leaf_length, buff_length)) + goto err; + } + } + else + { + /* + Balancing didn't free a page, so we have to split 'buff' into two + pages: + - Find key in middle of buffer + - Store everything before key in 'leaf_page' + - Pack key into anc_page at position of deleted key + Note that anc_page may overflow! (is handled by caller) + - Store remaining keys in next_page (buff) + */ + MARIA_KEY_PARAM anc_key_inserted; + + anc_end_pos= anc_buff + new_anc_length; + + DBUG_PRINT("test",("anc_buff:%p anc_end_pos:%p", + anc_buff, anc_end_pos)); + + if (!first_key && !_ma_get_last_key(&anc_key, anc_page, keypos)) + goto err; + if (!(half_pos= _ma_find_half_pos(&leaf_key, &next_page, &after_key))) + goto err; + new_leaf_length= (uint) (half_pos - next_page.buff); + memcpy(leaf_buff, next_page.buff, (size_t) new_leaf_length); + + leaf_page->size= new_leaf_length; + leaf_page->flag= page_flag; + page_store_info(share, leaf_page); + + /* Correct new keypointer to leaf_page */ + half_pos=after_key; + _ma_kpointer(info, + leaf_key.data + leaf_key.data_length + leaf_key.ref_length, + next_page.pos); + + /* Save key in anc_page */ + prev_key= (first_key ? (uchar*) 0 : anc_key.data); + t_length= (*keyinfo->pack_key)(&leaf_key, key_reflength, + (keypos == anc_end_pos ? 
(uchar*) 0 : + keypos), + prev_key, prev_key, &anc_key_inserted); + if (t_length >= 0) + bmove_upp(anc_end_pos+t_length, anc_end_pos, + (uint) (anc_end_pos - keypos)); + else + bmove(keypos,keypos-t_length,(uint) (anc_end_pos-keypos)+t_length); + (*keyinfo->store_key)(keyinfo,keypos, &anc_key_inserted); + new_anc_length+= t_length; + anc_page->size= new_anc_length; + page_store_size(share, anc_page); + + if (leaf_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + _ma_mark_page_with_transid(share, anc_page); + + /* Store key first in new page */ + if (nod_flag) + bmove(next_page.buff + share->keypage_header, half_pos-nod_flag, + (size_t) nod_flag); + if (!(*keyinfo->get_key)(&leaf_key, page_flag, nod_flag, &half_pos)) + goto err; + t_length=(int) (*keyinfo->pack_key)(&leaf_key, nod_flag, (uchar*) 0, + (uchar*) 0, (uchar*) 0, + &key_inserted); + /* t_length will always be > 0 for a new page !*/ + tmp_length= (uint) ((next_page.buff + buff_length) - half_pos); + bmove(next_page.buff + p_length + t_length, half_pos, tmp_length); + (*keyinfo->store_key)(keyinfo, next_page.buff + p_length, &key_inserted); + new_buff_length= tmp_length + t_length + p_length; + next_page.size= new_buff_length; + page_store_size(share, &next_page); + /* keypage flag is already up to date */ + + if (share->now_transactional) + { + /* + Log changes to parent page + This has one key deleted from it and one key inserted to it at + keypos + + ma_log_add ensures that we don't log changes that is outside of + key block size, as the REDO code can't handle that + */ + if (_ma_log_add(anc_page, anc_length, keypos, + anc_key_inserted.move_length + + MY_MAX(anc_key_inserted.changed_length - + anc_key_inserted.move_length, + key_deleted.changed_length), + anc_key_inserted.move_length - + key_deleted.move_length, 1, + KEY_OP_DEBUG_LOG_ADD_3)) + goto err; + + /* + Log changes to leaf page. 
+ This contains original data with new data added at end + */ + DBUG_ASSERT(leaf_length <= new_leaf_length); + if (_ma_log_suffix(leaf_page, leaf_length, new_leaf_length)) + goto err; + /* + Log changes to next page + + This contains original data with some prefix data deleted and + some compressed data at start possible extended + + Data in buff was originally: + org_leaf_buff [leaf_length] + separator_key [buff_key_inserted.move_length] + next_key_changes [buff_key_inserted.changed_length -move_length] + next_page_data [next_buff_length - p_length - + (buff_key_inserted.changed_length -move_length)] + + After changes it's now: + unpacked_key [key_inserted.changed_length] + next_suffix [next_buff_length - key_inserted.changed_length] + + */ + DBUG_ASSERT(new_buff_length <= next_buff_length); + if (_ma_log_prefix(&next_page, key_inserted.changed_length, + (int) (new_buff_length - next_buff_length), + KEY_OP_DEBUG_LOG_PREFIX_1)) + goto err; + } + page_mark_changed(info, &next_page); + if (_ma_write_keypage(&next_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + } + + page_mark_changed(info, leaf_page); + if (_ma_write_keypage(leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + stack_alloc_free(anc_key_buff, buff_alloced); + DBUG_RETURN(new_anc_length <= + ((info->quick_mode ? 
MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length))); + } + + DBUG_PRINT("test",("use left page")); + + keypos= _ma_get_last_key(&anc_key, anc_page, keypos); + if (!keypos) + goto err; + next_page.pos= _ma_kpos(key_reflength,keypos); + if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, info->buff, 0)) + goto err; + buff_length= next_page.size; + endpos= next_page.buff + buff_length; + DBUG_DUMP("prev", next_page.buff, next_page.size); + + /* find keys to make a big key-page */ + bmove(next_keypos - key_reflength, leaf_buff + share->keypage_header, + key_reflength); + next_keypos=keypos; + if (!(*keyinfo->get_key)(&anc_key, anc_page_flag, key_reflength, + &next_keypos)) + goto err; + if (!_ma_get_last_key(&leaf_key, &next_page, endpos)) + goto err; + + /* merge pages and put parting key from anc_page between */ + prev_key= (leaf_length == p_length ? (uchar*) 0 : leaf_key.data); + t_length=(*keyinfo->pack_key)(&anc_key, nod_flag, + (leaf_length == p_length ? 
+ (uchar*) 0 : leaf_buff+p_length), + prev_key, prev_key, + &key_inserted); + if (t_length >= 0) + bmove(endpos+t_length, leaf_buff+p_length, + (size_t) (leaf_length-p_length)); + else /* We gained space */ + bmove(endpos,leaf_buff+((int) p_length-t_length), + (size_t) (leaf_length-p_length+t_length)); + (*keyinfo->store_key)(keyinfo,endpos, &key_inserted); + + /* Remember for logging how many bytes of leaf_buff that are not changed */ + DBUG_ASSERT((int) key_inserted.changed_length >= key_inserted.move_length); + unchanged_leaf_length= (leaf_length - p_length - + (key_inserted.changed_length - + key_inserted.move_length)); + + new_buff_length= buff_length + leaf_length - p_length + t_length; + +#ifdef EXTRA_DEBUG + /* Ensure that unchanged_leaf_length is correct */ + DBUG_ASSERT(bcmp(next_page.buff + new_buff_length - unchanged_leaf_length, + leaf_buff + leaf_length - unchanged_leaf_length, + unchanged_leaf_length) == 0); +#endif + + page_flag= next_page.flag | leaf_page->flag; + if (anc_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + page_flag|= KEYPAGE_FLAG_HAS_TRANSID; + + next_page.size= new_buff_length; + next_page.flag= page_flag; + page_store_info(share, &next_page); + + /* remove key from anc_page */ + if (!(s_length= remove_key(keyinfo, anc_page_flag, key_reflength, keypos, + anc_key_buff, + anc_buff+anc_length, (my_off_t *) 0, + &key_deleted))) + goto err; + + new_anc_length= anc_length - s_length; + anc_page->size= new_anc_length; + page_store_size(share, anc_page); + + if (new_buff_length <= share->max_index_block_size) + { + /* All keys fitted into one page */ + page_mark_changed(info, leaf_page); + if (_ma_dispose(info, leaf_page->pos, 0)) + goto err; + + if (share->now_transactional) + { + /* + Log changes to parent page. Note that this page may have been + temporarily bigger than block_size. 
+ */ + if (_ma_log_delete(anc_page, key_deleted.key_pos, + key_deleted.changed_length, key_deleted.move_length, + anc_length - anc_page->org_size, + KEY_OP_DEBUG_LOG_DEL_CHANGE_3)) + goto err; + /* + Log changes to next page. Data for leaf page is in buff + that contains original leaf_buff, parting key and next_buff + */ + if (_ma_log_suffix(&next_page, buff_length, new_buff_length)) + goto err; + } + } + else + { + /* + Balancing didn't free a page, so we have to split 'next_page' into two + pages + - Find key in middle of buffer (buff) + - Pack key at half_buff into anc_page at position of deleted key + Note that anc_page may overflow! (is handled by caller) + - Move everything after middlekey to 'leaf_buff' + - Shorten buff at 'endpos' + */ + MARIA_KEY_PARAM anc_key_inserted; + size_t tmp_length; + + if (keypos == anc_buff + share->keypage_header + key_reflength) + anc_pos= 0; /* First key */ + else + { + if (!_ma_get_last_key(&anc_key, anc_page, keypos)) + goto err; + anc_pos= anc_key.data; + } + if (!(endpos= _ma_find_half_pos(&leaf_key, &next_page, &half_pos))) + goto err; + + /* Correct new keypointer to leaf_page */ + _ma_kpointer(info,leaf_key.data + leaf_key.data_length + + leaf_key.ref_length, leaf_page->pos); + + /* Save parting key found by _ma_find_half_pos() in anc_page */ + DBUG_DUMP("anc_buff", anc_buff, new_anc_length); + DBUG_DUMP_KEY("key_to_anc", &leaf_key); + anc_end_pos= anc_buff + new_anc_length; + t_length=(*keyinfo->pack_key)(&leaf_key, key_reflength, + keypos == anc_end_pos ? 
(uchar*) 0 + : keypos, + anc_pos, anc_pos, + &anc_key_inserted); + if (t_length >= 0) + bmove_upp(anc_end_pos+t_length, anc_end_pos, + (uint) (anc_end_pos-keypos)); + else + bmove(keypos,keypos-t_length,(uint) (anc_end_pos-keypos)+t_length); + (*keyinfo->store_key)(keyinfo,keypos, &anc_key_inserted); + new_anc_length+= t_length; + anc_page->size= new_anc_length; + page_store_size(share, anc_page); + + if (leaf_key.flag & (SEARCH_USER_KEY_HAS_TRANSID | + SEARCH_PAGE_KEY_HAS_TRANSID)) + _ma_mark_page_with_transid(share, anc_page); + + /* Store first key on new page */ + if (nod_flag) + bmove(leaf_buff + share->keypage_header, half_pos-nod_flag, + (size_t) nod_flag); + if (!(*keyinfo->get_key)(&leaf_key, page_flag, nod_flag, &half_pos)) + goto err; + DBUG_DUMP_KEY("key_to_leaf", &leaf_key); + t_length=(*keyinfo->pack_key)(&leaf_key, nod_flag, (uchar*) 0, + (uchar*) 0, (uchar*) 0, &key_inserted); + /* t_length will always be > 0 for a new page !*/ + tmp_length= (size_t) ((next_page.buff + new_buff_length) - half_pos); + DBUG_PRINT("info",("t_length: %d length: %d",t_length, (int) tmp_length)); + bmove(leaf_buff+p_length+t_length, half_pos, tmp_length); + (*keyinfo->store_key)(keyinfo,leaf_buff+p_length, &key_inserted); + new_leaf_length= (uint)(tmp_length + t_length + p_length); + DBUG_ASSERT(new_leaf_length <= share->max_index_block_size); + + leaf_page->size= new_leaf_length; + leaf_page->flag= page_flag; + page_store_info(share, leaf_page); + + new_buff_length= (uint) (endpos - next_page.buff); + next_page.size= new_buff_length; + page_store_size(share, &next_page); + + if (share->now_transactional) + { + /* + Log changes to parent page + This has one key deleted from it and one key inserted to it at + keypos + + ma_log_add() ensures that we don't log changes that is outside of + key block size, as the REDO code can't handle that + */ + if (_ma_log_add(anc_page, anc_length, keypos, + anc_key_inserted.move_length + + MY_MAX(anc_key_inserted.changed_length - + 
anc_key_inserted.move_length, + key_deleted.changed_length), + anc_key_inserted.move_length - + key_deleted.move_length, 1,KEY_OP_DEBUG_LOG_ADD_4)) + goto err; + + /* + Log changes to leaf page. + This contains original data with new data added first + */ + DBUG_ASSERT(leaf_length <= new_leaf_length); + DBUG_ASSERT(new_leaf_length >= unchanged_leaf_length); + if (_ma_log_prefix(leaf_page, new_leaf_length - unchanged_leaf_length, + (int) (new_leaf_length - leaf_length), + KEY_OP_DEBUG_LOG_PREFIX_2)) + goto err; + /* + Log changes to next page + This contains original data with some suffix data deleted + */ + DBUG_ASSERT(new_buff_length <= buff_length); + if (_ma_log_suffix(&next_page, buff_length, new_buff_length)) + goto err; + } + + page_mark_changed(info, leaf_page); + if (_ma_write_keypage(leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + } + page_mark_changed(info, &next_page); + if (_ma_write_keypage(&next_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + + stack_alloc_free(anc_key_buff, buff_alloced); + DBUG_RETURN(new_anc_length <= + ((info->quick_mode ? 
MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length))); + +err: + stack_alloc_free(anc_key_buff, buff_alloced); + DBUG_RETURN(-1); +} /* underflow */ + + +/** + @brief Remove a key from page + + @fn remove_key() + keyinfo Key handle + page_flag Flags of the key page. If KEYPAGE_FLAG_HAS_TRANSID is set + the key is never handled as a static length key + nod_flag Length of node ptr + keypos Where on page key starts + lastkey Buffer for storing the key to be removed + page_end Pointer to end of page + next_block If <> 0 and node-page, this is set to address of + next page + s_temp Information about what changes were done on the page: + s_temp.key_pos Start of key + s_temp.move_length Number of bytes removed at keypos + s_temp.changed_length Number of bytes changed at keypos + + @todo + The current code doesn't handle the case that the next key may be + packed better against the previous key if there is a case difference + + @return + @retval 0 error + @retval # How many bytes were removed +*/ + +static uint remove_key(MARIA_KEYDEF *keyinfo, uint page_flag, uint nod_flag, + uchar *keypos, uchar *lastkey, + uchar *page_end, my_off_t *next_block, + MARIA_KEY_PARAM *s_temp) +{ + int s_length; + uchar *start; + DBUG_ENTER("remove_key"); + DBUG_PRINT("enter", ("keypos:%p page_end: %p", + keypos, page_end)); + + start= s_temp->key_pos= keypos; + s_temp->changed_length= 0; + if (!(keyinfo->flag & + (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY | + HA_BINARY_PACK_KEY)) && + !(page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + /* Static length key */ + s_length=(int) (keyinfo->keylength+nod_flag); + if (next_block && nod_flag) + *next_block= _ma_kpos(nod_flag,keypos+s_length); + } + else + { + /* Let keypos point at next key */ + MARIA_KEY key; + + /* Calculate length of key */ + key.keyinfo= keyinfo; + key.data= lastkey; + if (!(*keyinfo->get_key)(&key, page_flag, nod_flag, &keypos)) + DBUG_RETURN(0); /* Error */ + + if (next_block && nod_flag) + *next_block= _ma_kpos(nod_flag,keypos); + s_length=(int) (keypos-start); + if (keypos != page_end) + { + if (keyinfo->flag & 
HA_BINARY_PACK_KEY) + { + uchar *old_key= start; + uint next_length,prev_length,prev_pack_length; + + /* keypos points here on start of next key */ + get_key_length(next_length,keypos); + get_key_pack_length(prev_length,prev_pack_length,old_key); + if (next_length > prev_length) + { + uint diff= (next_length-prev_length); + /* We have to copy data from the current key to the next key */ + keypos-= diff + prev_pack_length; + store_key_length(keypos, prev_length); + bmove(keypos + prev_pack_length, lastkey + prev_length, diff); + s_length=(int) (keypos-start); + s_temp->changed_length= diff + prev_pack_length; + } + } + else + { + /* Check if a variable length first key part */ + if ((keyinfo->seg->flag & HA_PACK_KEY) && *keypos & 128) + { + /* Next key is packed against the current one */ + uint next_length,prev_length,prev_pack_length,lastkey_length, + rest_length; + if (keyinfo->seg[0].length >= 127) + { + if (!(prev_length=mi_uint2korr(start) & 32767)) + goto end; + next_length=mi_uint2korr(keypos) & 32767; + keypos+=2; + prev_pack_length=2; + } + else + { + if (!(prev_length= *start & 127)) + goto end; /* Same key as previous*/ + next_length= *keypos & 127; + keypos++; + prev_pack_length=1; + } + if (!(*start & 128)) + prev_length=0; /* prev key not packed */ + if (keyinfo->seg[0].flag & HA_NULL_PART) + lastkey++; /* Skip null marker */ + get_key_length(lastkey_length,lastkey); + if (!next_length) /* Same key after */ + { + next_length=lastkey_length; + rest_length=0; + } + else + get_key_length(rest_length,keypos); + + if (next_length >= prev_length) + { + /* Next key is based on deleted key */ + uint pack_length; + uint diff= (next_length-prev_length); + + /* keypos points to data of next key (after key length) */ + bmove(keypos - diff, lastkey + prev_length, diff); + rest_length+= diff; + pack_length= prev_length ? 
get_pack_length(rest_length): 0; + keypos-= diff + pack_length + prev_pack_length; + s_length=(int) (keypos-start); + if (prev_length) /* Pack against prev key */ + { + *keypos++= start[0]; + if (prev_pack_length == 2) + *keypos++= start[1]; + store_key_length(keypos,rest_length); + } + else + { + /* Next key is not packed anymore */ + if (keyinfo->seg[0].flag & HA_NULL_PART) + { + rest_length++; /* Mark not null */ + } + if (prev_pack_length == 2) + { + mi_int2store(keypos,rest_length); + } + else + *keypos= rest_length; + } + s_temp->changed_length= diff + pack_length + prev_pack_length; + } + } + } + } + } + end: + bmove(start, start+s_length, (uint) (page_end-start-s_length)); + s_temp->move_length= s_length; + DBUG_RETURN((uint) s_length); +} /* remove_key */ + + +/**************************************************************************** + Logging of redos +****************************************************************************/ + +/** + @brief + Log entry where some parts are deleted and some things are changed + and some data could be added last. + + @fn _ma_log_delete() + @param ma_page Changed key page; the page number, page buffer + and page sizes are taken from it + @param key_pos Start of change area + @param changed_length How many bytes were changed at key_pos + @param move_length How many bytes were deleted at key_pos + @param append_length Length of data added last + This is taken from end of ma_page->buff + @param debug_marker Stored in the log record if + EXTRA_DEBUG_KEY_CHANGES is defined + + @return + @retval 0 ok + @retval 1 error when writing the log record + + This is mainly used when a key is deleted. 
The append happens + when we delete a key from a page with data > block_size kept in + memory and we have to add back the data that was stored > block_size +*/ + +my_bool _ma_log_delete(MARIA_PAGE *ma_page, const uchar *key_pos, + uint changed_length, uint move_length, + uint append_length __attribute__((unused)), + enum en_key_debug debug_marker __attribute__((unused))) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 5+ 2 + 3 + 3 + 6 + 3 + 7]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 7]; + uint translog_parts, current_size, extra_length; + uint offset= (uint) (key_pos - ma_page->buff); + MARIA_HA *info= ma_page->info; + MARIA_SHARE *share= info->s; + my_off_t page= ma_page->pos / share->block_size; + DBUG_ENTER("_ma_log_delete"); + DBUG_PRINT("enter", ("page: %lu offset: %u changed_length: %u move_length: %u append_length: %u page_size: %u", + (ulong) page, offset, changed_length, move_length, + append_length, ma_page->size)); + DBUG_ASSERT(share->now_transactional && move_length); + DBUG_ASSERT(offset + changed_length <= ma_page->size); + DBUG_ASSERT(ma_page->org_size - move_length + append_length == ma_page->size); + DBUG_ASSERT(move_length <= ma_page->org_size - share->keypage_header); + + /* Store page number of the changed page */ + page_store(log_data + FILEID_STORE_SIZE, page); + log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE; + current_size= ma_page->org_size; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + *log_pos++= KEY_OP_DEBUG; + *log_pos++= debug_marker; + + *log_pos++= KEY_OP_DEBUG_2; + int2store(log_pos, ma_page->org_size); + int2store(log_pos+2, ma_page->size); + log_pos+=4; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= _ma_get_keypage_flag(info->s, ma_page->buff); + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + translog_parts= TRANSLOG_INTERNAL_PARTS + 1; + extra_length= 0; + + if (changed_length) + { + if (offset + changed_length >= 
share->max_index_block_size) + { + changed_length= share->max_index_block_size - offset; + move_length= 0; /* Nothing to move */ + current_size= share->max_index_block_size; + } + + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, changed_length); + log_pos+= 3; + log_array[translog_parts].str= ma_page->buff + offset; + log_array[translog_parts].length= changed_length; + translog_parts++; + + /* We only have to move things after offset+changed_length */ + offset+= changed_length; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data); + + if (move_length) + { + uint log_length; + if (offset + move_length < share->max_index_block_size) + { + /* + Move down things that are on the page. + page_offset in apply_redo_index() will be at original offset + + changed_length. + */ + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, - (int) move_length); + log_length= 3; + current_size-= move_length; + } + else + { + /* Delete to end of page */ + uint tmp= current_size - offset; + current_size= offset; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, tmp); + log_length= 3; + } + log_array[translog_parts].str= log_pos; + log_array[translog_parts].length= log_length; + translog_parts++; + log_pos+= log_length; + extra_length+= log_length; + } + + if (current_size != ma_page->size && + current_size != share->max_index_block_size) + { + /* Append data that didn't fit on the page before */ + uint length= (MY_MIN(ma_page->size, share->max_index_block_size) - + current_size); + uchar *data= ma_page->buff + current_size; + + DBUG_ASSERT(length <= append_length); + + log_pos[0]= KEY_OP_ADD_SUFFIX; + int2store(log_pos+1, length); + log_array[translog_parts].str= log_pos; + log_array[translog_parts].length= 3; + log_array[translog_parts + 1].str= data; + log_array[translog_parts + 1].length= length; + log_pos+= 3; + translog_parts+= 2; + current_size+= length; + extra_length+= 3 + length; + } + + 
_ma_log_key_changes(ma_page, + log_array + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entries for same page */ + ma_page->org_size= current_size; + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS].length + + changed_length + extra_length, translog_parts, + log_array, log_data, NULL)) + DBUG_RETURN(1); + + DBUG_RETURN(0); +} + + +/**************************************************************************** + Logging of undos +****************************************************************************/ + +/** + @brief Write an undo log record for a deleted key + + @param info Maria handler + @param key Deleted key. key->data (data_length + ref_length bytes) + is stored in the record + @param new_root Root page position for the key tree, or HA_OFFSET_ERROR. + If it differs from the current key_root the record type + is LOGREC_UNDO_KEY_DELETE_WITH_ROOT and the page number + of the root is stored too + @param res_lsn Passed to translog_write_record() as the place to store + the LSN of the record + + @return + @retval 0 ok + @retval -1 error when writing the log record +*/ + +my_bool _ma_write_undo_key_delete(MARIA_HA *info, const MARIA_KEY *key, + my_off_t new_root, LSN *res_lsn) +{ + MARIA_SHARE *share= info->s; + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE], *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + struct st_msg_to_write_hook_for_undo_key msg; + enum translog_record_type log_type= LOGREC_UNDO_KEY_DELETE; + uint keynr= key->keyinfo->key_nr; + + lsn_store(log_data, info->trn->undo_lsn); + key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, keynr); + log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE; + + /** + @todo BUG if we had concurrent insert/deletes, reading state's key_root + like this would be unsafe. + */ + if (new_root != share->state.key_root[keynr]) + { + my_off_t page; + page= ((new_root == HA_OFFSET_ERROR) ? 
IMPOSSIBLE_PAGE_NO : + new_root / share->block_size); + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + log_type= LOGREC_UNDO_KEY_DELETE_WITH_ROOT; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key->data; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= (key->data_length + + key->ref_length); + + msg.root= &share->state.key_root[keynr]; + msg.value= new_root; + /* + set autoincrement to 1 if this is an auto_increment key + This is only used if we are now in a rollback of a duplicate key + */ + msg.auto_increment= share->base.auto_key == keynr + 1; + + return translog_write_record(res_lsn, log_type, + info->trn, info, + (translog_size_t) + (log_array[TRANSLOG_INTERNAL_PARTS + 0].length + + log_array[TRANSLOG_INTERNAL_PARTS + 1].length), + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data + LSN_STORE_SIZE, &msg) ? -1 : 0; +} diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c new file mode 100644 index 00000000..f355d0da --- /dev/null +++ b/storage/maria/ma_delete_all.c @@ -0,0 +1,211 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Remove all rows from a MARIA table */ +/* This clears the status information and truncates files */ + +#include "maria_def.h" +#include "trnman.h" + +/** + @brief deletes all rows from a table + + @param info Maria handler + + @note It is important that this function does not rely on the state + information, as it may be called by ma_apply_undo_bulk_insert() on an + inconsistent table left by a crash. + + @return Operation status + @retval 0 ok + @retval 1 error +*/ + +int maria_delete_all_rows(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + my_bool log_record; + LSN lsn; +#ifdef HAVE_MMAP + my_bool mmap_file= share->file_map != 0; +#endif + DBUG_ENTER("maria_delete_all_rows"); + + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + /** + @todo LOCK take X-lock on table here. + When we have versioning, if some other thread is looking at this table, + we cannot shrink the file like this. + */ + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + log_record= share->now_transactional && !share->temporary; + + if (log_record) + { + /* + This record will be used by Recovery to finish the deletion if it + crashed. We force it to have a complete history in the log. + */ + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[FILEID_STORE_SIZE]; + my_bool error; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DELETE_ALL, + info->trn, info, 0, + sizeof(log_array)/sizeof(log_array[0]), + log_array, log_data, NULL) || + translog_flush(lsn))) + goto err; + /* + If we fail in this function after this point, log and table will be + inconsistent. 
+ */ + if (_ma_mark_file_changed(share)) + goto err; + + /* + Because LOGREC_REDO_DELETE_ALL does not operate on pages, it has the + following problem: + delete_all; inserts (redo_insert); all pages get flushed; checkpoint: + the dirty pages list will be empty. In recovery, delete_all is executed, + but redo_insert are skipped (dirty pages list is empty). + To avoid this, we need to set skip_redo_lsn now, and thus need to sync + files. + Also fixes the problem of: + bulk insert; insert; delete_all; crash: + "bulk insert" is skipped (no REDOs), so if "insert" would not be skipped + (if we didn't update skip_redo_lsn below) then "insert" would be tried + and fail, saying that it sees that the first page has to be created + though the inserted row has rownr>0. + + We use lsn-1 below to ensure that the above redo will be executed + */ + error= _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_LOCK) || + _ma_update_state_lsns(share, lsn-1, info->trn->trid, FALSE, FALSE) || + _ma_sync_table_files(info); + info->trn->rec_lsn= LSN_IMPOSSIBLE; + if (error) + goto err; + } + else + { + if (_ma_mark_file_changed(share)) + goto err; + /* Other branch called function below when writing log record, in hook */ + _ma_reset_status(info); + } + /* Remove old history as the table is now empty for everyone */ + _ma_reset_state(info); + share->state.changed= 0; + + /* + If we are using delayed keys or if the user has done changes to the tables + since it was locked then there may be key blocks in the page cache. Or + there may be data blocks there. We need to throw them away or they may + re-enter the emptied table or another table later. 
+ */ + +#ifdef HAVE_MMAP + if (mmap_file) + _ma_unmap_file(info); +#endif + + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX, + FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) || + mysql_file_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) || + mysql_file_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME))) + goto err; + + if (_ma_initialize_data_file(share, info->dfile.file)) + goto err; + + if (log_record) + { + /* Update lsn to signal that the above redo does not have to be executed anymore */ + if ( _ma_update_state_lsns(share, lsn, info->trn->trid, FALSE, FALSE) || + _ma_sync_table_files(info)) + goto err; + } + + if (info->opt_flag & WRITE_CACHE_USED) + reinit_io_cache(&info->rec_cache, WRITE_CACHE, 0, 1, 1); + + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); +#ifdef HAVE_MMAP + /* Map again */ + if (mmap_file) + _ma_dynmap_file(info, (my_off_t) 0); +#endif + DBUG_RETURN(0); + +err: + { + int save_errno=my_errno; + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); + info->update|=HA_STATE_WRITTEN; /* Buffer changed */ + DBUG_RETURN(my_errno=save_errno); + } +} /* maria_delete_all_rows */ + + +/* + Reset status information + + SYNOPSIS + _ma_reset_status() + info Maria handler + + DESCRIPTION + Resets data and index file information as if the file would be empty. + All key roots and the delete links are set to HA_OFFSET_ERROR and + share->changed is set to mark that the state must be written. + Files are not touched. 
+*/ + +void _ma_reset_status(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + MARIA_STATE_INFO *state= &share->state; + uint i; + DBUG_ENTER("_ma_reset_status"); + + state->split= 0; + state->state.records= state->state.del= 0; + state->changed= 0; /* File is optimized */ + state->dellink= HA_OFFSET_ERROR; + state->sortkey= (ushort) ~0; + /* Index file length is set to keystart and data file length to 0 */ + state->state.key_file_length= share->base.keystart; + state->state.data_file_length= 0; + state->state.empty= state->state.key_empty= 0; + state->state.checksum= 0; + share->state.open_count= 0; + share->global_changed= 0; + + share->changed= 1; /* We must write state */ + + /* Copy the reset state to the state this handler points at */ + *info->state= state->state; + + /* Drop the delete key chain. */ + state->key_del= HA_OFFSET_ERROR; + /* Clear all keys */ + for (i=0 ; i < share->base.keys ; i++) + state->key_root[i]= HA_OFFSET_ERROR; + DBUG_VOID_RETURN; +} diff --git a/storage/maria/ma_delete_table.c b/storage/maria/ma_delete_table.c new file mode 100644 index 00000000..ebd94cdb --- /dev/null +++ b/storage/maria/ma_delete_table.c @@ -0,0 +1,118 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "ma_fulltext.h" +#include "trnman_public.h" + +/** + @brief drops (deletes) a table + + @param name table's name + + @return Operation status + @retval 0 ok + @retval 1 error +*/ + +int maria_delete_table(const char *name) +{ + MARIA_HA *info; + myf sync_dir; + int got_error= 0, error; + DBUG_ENTER("maria_delete_table"); + +#ifdef EXTRA_DEBUG + _ma_check_table_is_closed(name,"delete"); +#endif + /** @todo LOCK take X-lock on table */ + /* + We need to know if this table is transactional. + Unfortunately it is necessary to open the table just to check this. We use + 'open_for_repair' to be able to open even a crashed table. + */ + my_errno= 0; + if (!(info= maria_open(name, O_RDONLY, + (HA_OPEN_FOR_DROP | HA_OPEN_FOR_REPAIR), 0))) + { + sync_dir= 0; + /* Ignore not found errors and wrong symlink errors */ + if (my_errno != ENOENT && my_errno != HA_WRONG_CREATE_OPTION && + my_errno != HA_ERR_NO_ENCRYPTION) + got_error= my_errno; + } + else + { + sync_dir= (info->s->now_transactional && !info->s->temporary && + !maria_in_recovery) ? + MY_SYNC_DIR : 0; + /* Remove history for table */ + _ma_reset_state(info); + maria_close(info); + } + + if (sync_dir) + { + /* + For this log record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe in DDLs. + For now this record can serve when we apply logs to a backup, so we sync + it. 
+ */ + LSN lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar*)name; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1; + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DROP_TABLE, + &dummy_transaction_object, NULL, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) || + translog_flush(lsn))) + DBUG_RETURN(1); + } + + if (!(error= maria_delete_table_files(name, 0, sync_dir | MY_WME))) + error= got_error; + DBUG_RETURN(error); +} + +/** + Delete all files related to an Aria table + + @param name Table name; the index and data files are deleted by + passing MARIA_NAME_IEXT and MARIA_NAME_DEXT as extensions + @param temporary If 0, also try to delete possible left-over temporary + files (DATA_TMP_EXT and, if SUPPORT_ARIA_PACK is defined, + ".OLD") + @param flags Flags passed to mysql_file_delete_with_symlink() for the + index and data file + + @return + @retval 0 Index and data file were deleted + @retval # my_errno from the last failed delete of the index or data file +*/ + +int maria_delete_table_files(const char *name, my_bool temporary, myf flags) +{ + int error= 0; + DBUG_ENTER("maria_delete_table_files"); + + if (mysql_file_delete_with_symlink(key_file_kfile, name, MARIA_NAME_IEXT, + flags)) + error= my_errno; + if (mysql_file_delete_with_symlink(key_file_dfile, name, MARIA_NAME_DEXT, + flags)) + error= my_errno; + if (!temporary) + { + /* Delete a possible temporary aria_chk file; the result is ignored */ + mysql_file_delete_with_symlink(key_file_dfile, name, DATA_TMP_EXT, MYF(0)); +#ifdef SUPPORT_ARIA_PACK + /* Delete a possible temporary aria_pack file; the result is ignored */ + mysql_file_delete_with_symlink(key_file_dfile, name, ".OLD", MYF(0)); +#endif + } + DBUG_RETURN(error); +} diff --git a/storage/maria/ma_dynrec.c b/storage/maria/ma_dynrec.c new file mode 100644 index 00000000..33f238d9 --- /dev/null +++ b/storage/maria/ma_dynrec.c @@ -0,0 +1,2109 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Functions to handle space-packed-records and blobs + + A row may be stored in one or more linked blocks. + The block size is between MARIA_MIN_BLOCK_LENGTH and MARIA_MAX_BLOCK_LENGTH. + Each block is aligned on MARIA_DYN_ALIGN_SIZE. + The reason for the max block size is to not have too many different types + of blocks. For the different block types, look at _ma_get_block_info() +*/ + +#include "maria_def.h" + +static my_bool write_dynamic_record(MARIA_HA *info,const uchar *record, + ulong reclength); +static int _ma_find_writepos(MARIA_HA *info,ulong reclength,my_off_t *filepos, + ulong *length); +static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos, + uchar *record, ulong reclength); +static my_bool delete_dynamic_record(MARIA_HA *info,MARIA_RECORD_POS filepos, + uint second_read); +static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos, + uint length); + + /* Interface function from MARIA_HA */ + +#ifdef HAVE_MMAP + +/* + Create mmaped area for MARIA handler + + SYNOPSIS + _ma_dynmap_file() + info MARIA handler + size Size of the data file area to map; MEMMAP_EXTRA_MARGIN + bytes are added to this when mapping + + RETURN + 0 ok + 1 error (size is too large for mmap or my_mmap() failed). +*/ + +my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size) +{ + DBUG_ENTER("_ma_dynmap_file"); + if (size > (my_off_t) (~((size_t) 0)) - MEMMAP_EXTRA_MARGIN) + { + DBUG_PRINT("warning", ("File is too large for mmap")); + DBUG_RETURN(1); + } + /* + Ingo wonders if it is good to use MAP_NORESERVE. From the Linux man page: + MAP_NORESERVE + Do not reserve swap space for this mapping. 
When swap space is + reserved, one has the guarantee that it is possible to modify the + mapping. When swap space is not reserved one might get SIGSEGV + upon a write if no physical memory is available. + */ + info->s->file_map= (uchar*) + my_mmap(0, (size_t)(size + MEMMAP_EXTRA_MARGIN), + info->s->mode==O_RDONLY ? PROT_READ : + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_NORESERVE, + info->dfile.file, 0L); + if (info->s->file_map == (uchar*) MAP_FAILED) + { + info->s->file_map= NULL; + DBUG_RETURN(1); + } +#if defined(HAVE_MADVISE) + madvise((char*) info->s->file_map, size, MADV_RANDOM); +#endif + info->s->mmaped_length= size; + DBUG_RETURN(0); +} + + +/* + Resize mmaped area for MARIA handler + + SYNOPSIS + _ma_remap_file() + info MARIA handler + size New size of the area to map + + NOTES + Does nothing if the file is not currently mapped. + If the new mapping can't be created, file_map is left as NULL and + mmaped_length as 0. + + RETURN + none +*/ + +void _ma_remap_file(MARIA_HA *info, my_off_t size) +{ + if (info->s->file_map) + { + my_munmap((char*) info->s->file_map, + (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN); + /* + The old mapping is gone. Forget it before mapping again so that a + failing _ma_dynmap_file() cannot leave a stale file_map or + mmaped_length behind for _ma_mmap_pread()/_ma_mmap_pwrite() to use. + */ + info->s->file_map= NULL; + info->s->mmaped_length= 0; + _ma_dynmap_file(info, size); + } +} +#endif + + +/* + Read bytes from MARIA handler, using mmap or pread + + SYNOPSIS + _ma_mmap_pread() + info MARIA handler + Buffer Input buffer + Count Count of bytes for read + offset Start position + MyFlags Flags for mysql_file_pread() if the mmap area can't be used + + RETURN + 0 ok (data was copied from the mmap area) + # Otherwise the return value of mysql_file_pread() +*/ + +size_t _ma_mmap_pread(MARIA_HA *info, uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags) +{ + DBUG_PRINT("info", ("maria_read with mmap %d\n", info->dfile.file)); + if (info->s->lock_key_trees) + mysql_rwlock_rdlock(&info->s->mmap_lock); + + /* + The following test may fail in the following cases: + - We failed to remap a memory area (fragmented memory?) + - This thread has done some writes, but not yet extended the + memory mapped area. 
+ */ + + if (info->s->mmaped_length >= offset + Count) + { + /* The whole request is inside the mapped area; copy from it */ + memcpy(Buffer, info->s->file_map + offset, Count); + if (info->s->lock_key_trees) + mysql_rwlock_unlock(&info->s->mmap_lock); + return 0; + } + else + { + /* Not covered by the mapped area; read from the data file instead */ + if (info->s->lock_key_trees) + mysql_rwlock_unlock(&info->s->mmap_lock); + return mysql_file_pread(info->dfile.file, Buffer, Count, offset, MyFlags); + } +} + + + /* Wrapper for mysql_file_pread() on the data file, for when mmap isn't used */ + +size_t _ma_nommap_pread(MARIA_HA *info, uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags) +{ + return mysql_file_pread(info->dfile.file, Buffer, Count, offset, MyFlags); +} + + +/* + Write bytes to MARIA handler, using mmap or pwrite + + SYNOPSIS + _ma_mmap_pwrite() + info MARIA handler + Buffer Data to write + Count Count of bytes for write + offset Start position + MyFlags Flags for my_pwrite() if the mmap area can't be used + + RETURN + 0 ok + !=0 error. In this case return error from pwrite +*/ + +size_t _ma_mmap_pwrite(MARIA_HA *info, const uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags) +{ + DBUG_PRINT("info", ("maria_write with mmap %d\n", info->dfile.file)); + if (info->s->lock_key_trees) + mysql_rwlock_rdlock(&info->s->mmap_lock); + + /* + The following test may fail in the following cases: + - We failed to remap a memory area (fragmented memory?) + - This thread has done some writes, but not yet extended the + memory mapped area. 
+ */ + + if (info->s->mmaped_length >= offset + Count) + { + memcpy(info->s->file_map + offset, Buffer, Count); + if (info->s->lock_key_trees) + mysql_rwlock_unlock(&info->s->mmap_lock); + return 0; + } + else + { + /* Count writes that were done outside of the mapped area */ + info->s->nonmmaped_inserts++; + if (info->s->lock_key_trees) + mysql_rwlock_unlock(&info->s->mmap_lock); + return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags); + } + +} + + + /* Wrapper for my_pwrite() on the data file, for when mmap isn't used */ + +size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags) +{ + return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags); +} + + +/* + Pack a record into info->rec_buff and write it with write_dynamic_record() + + RETURN + 0 ok + 1 error (_ma_rec_pack() returned 0 or the write failed) +*/ + +my_bool _ma_write_dynamic_record(MARIA_HA *info, const uchar *record) +{ + ulong reclength= _ma_rec_pack(info,info->rec_buff + MARIA_REC_BUFF_OFFSET, + record); + if (!reclength) + return 1; + return (write_dynamic_record(info,info->rec_buff + MARIA_REC_BUFF_OFFSET, + reclength)); +} + +/* + Pack a record into info->rec_buff and store it at 'pos' with + update_dynamic_record(). 'oldrec' is not used. + + RETURN + 0 ok + 1 error (_ma_rec_pack() returned 0 or the update failed) +*/ + +my_bool _ma_update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec __attribute__ ((unused)), + const uchar *record) +{ + uint length= _ma_rec_pack(info, info->rec_buff + MARIA_REC_BUFF_OFFSET, + record); + if (!length) + return 1; + return (update_dynamic_record(info, pos, + info->rec_buff + MARIA_REC_BUFF_OFFSET, + length)); +} + + +/* + Write a record with blobs + + The record is packed into a temporary buffer big enough for the packed + record, all blob data and block header space, and is then written with + write_dynamic_record(). + + RETURN + 0 ok + 1 error; my_errno is set to HA_ERR_OUT_OF_MEM if the buffer could not + be allocated +*/ + +my_bool _ma_write_blob_record(MARIA_HA *info, const uchar *record) +{ + uchar *rec_buff; + int error; + ulong reclength,reclength2,extra; + my_bool buff_alloced; + + extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+ + MARIA_DYN_DELETE_BLOCK_HEADER+1); + reclength= (info->s->base.pack_reclength + + _ma_calc_total_blob_length(info,record)+ extra); + + alloc_on_stack(*info->stack_end_ptr, rec_buff, buff_alloced, reclength); + if (!rec_buff) + { + my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */ + return(1); + } + + reclength2= _ma_rec_pack(info, + rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + record); + if (!reclength2) + { + error= 1; + goto err; + } + + 
DBUG_PRINT("info",("reclength: %lu reclength2: %lu", + reclength, reclength2)); + DBUG_ASSERT(reclength2 <= reclength); + error= write_dynamic_record(info, + rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + reclength2); +err: + stack_alloc_free(rec_buff, buff_alloced); + return(error != 0); +} + + +my_bool _ma_update_blob_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec __attribute__ ((unused)), + const uchar *record) +{ + uchar *rec_buff; + int error; + ulong reclength,reclength2,extra; + my_bool buff_alloced; + + extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+ + MARIA_DYN_DELETE_BLOCK_HEADER); + reclength= (info->s->base.pack_reclength+ + _ma_calc_total_blob_length(info,record)+ extra); +#ifdef NOT_USED /* We now support big rows */ + if (reclength > MARIA_DYN_MAX_ROW_LENGTH) + { + my_errno=HA_ERR_TO_BIG_ROW; + return 1; + } +#endif + + alloc_on_stack(*info->stack_end_ptr, rec_buff, buff_alloced, reclength); + if (!rec_buff) + { + my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */ + return(1); + } + + reclength2= _ma_rec_pack(info, rec_buff+ + ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + record); + if (!reclength2) + { + error= 1; + goto err; + } + DBUG_ASSERT(reclength2 <= reclength); + error=update_dynamic_record(info,pos, + rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER), + reclength2); +err: + stack_alloc_free(rec_buff, buff_alloced); + return(error != 0); +} + + +my_bool _ma_delete_dynamic_record(MARIA_HA *info, + const uchar *record __attribute__ ((unused))) +{ + return delete_dynamic_record(info, info->cur_row.lastpos, 0); +} + + +/** + Write record to data-file. + + @todo it's cheating: it casts "const uchar*" to uchar*. +*/ + +static my_bool write_dynamic_record(MARIA_HA *info, const uchar *record, + ulong reclength) +{ + int flag; + ulong length; + my_off_t filepos; + DBUG_ENTER("write_dynamic_record"); + + flag=0; + + /* + Check if we have enough room for the new record. 
+ First we do simplified check to make usual case faster. + Then we do more precise check for the space left. + Though it still is not absolutely precise, as + we always use MARIA_MAX_DYN_BLOCK_HEADER while it can be + less in the most of the cases. + */ + + if (unlikely(info->s->base.max_data_file_length - + info->state->data_file_length < + reclength + MARIA_MAX_DYN_BLOCK_HEADER)) + { + if (info->s->base.max_data_file_length - info->state->data_file_length + + info->state->empty - info->state->del * MARIA_MAX_DYN_BLOCK_HEADER < + reclength + MARIA_MAX_DYN_BLOCK_HEADER) + { + my_errno=HA_ERR_RECORD_FILE_FULL; + DBUG_RETURN(1); + } + } + + do + { + if (_ma_find_writepos(info,reclength,&filepos,&length)) + goto err; + if (_ma_write_part_record(info,filepos,length, + (info->append_insert_at_end ? + HA_OFFSET_ERROR : info->s->state.dellink), + (uchar**) &record,&reclength,&flag)) + goto err; + } while (reclength); + + DBUG_RETURN(0); +err: + DBUG_RETURN(1); +} + + + /* Get a block for data ; The given data-area must be used !! 
*/ + +static int _ma_find_writepos(MARIA_HA *info, + ulong reclength, /* record length */ + my_off_t *filepos, /* Return file pos */ + ulong *length) /* length of block at filepos */ +{ + MARIA_BLOCK_INFO block_info; + ulong tmp; + DBUG_ENTER("_ma_find_writepos"); + + if (info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end) + { + /* Deleted blocks exists; Get last used block */ + *filepos=info->s->state.dellink; + block_info.second_read=0; + info->rec_cache.seek_not_done=1; + if (!(_ma_get_block_info(info, &block_info, info->dfile.file, + info->s->state.dellink) & + BLOCK_DELETED)) + { + DBUG_PRINT("error",("Delete link crashed")); + _ma_set_fatal_error_with_share(info->s, HA_ERR_WRONG_IN_RECORD); + DBUG_RETURN(-1); + } + info->s->state.dellink=block_info.next_filepos; + info->state->del--; + info->state->empty-= block_info.block_len; + *length= block_info.block_len; + } + else + { + /* No deleted blocks; Allocate a new block */ + *filepos=info->state->data_file_length; + if ((tmp= reclength + 3 + MY_TEST(reclength >= (65520 - 3))) < + info->s->base.min_block_length) + tmp= info->s->base.min_block_length; + else + tmp= ((tmp+MARIA_DYN_ALIGN_SIZE-1) & + (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1))); + if (info->state->data_file_length > + (info->s->base.max_data_file_length - tmp)) + { + my_errno=HA_ERR_RECORD_FILE_FULL; + DBUG_RETURN(-1); + } + if (tmp > MARIA_MAX_BLOCK_LENGTH) + tmp=MARIA_MAX_BLOCK_LENGTH; + *length= tmp; + info->state->data_file_length+= tmp; + info->s->state.split++; + info->update|=HA_STATE_WRITE_AT_END; + } + DBUG_RETURN(0); +} /* _ma_find_writepos */ + + + +/* + Unlink a deleted block from the deleted list. + This block will be combined with the preceding or next block to form + a big block. +*/ + +static my_bool unlink_deleted_block(MARIA_HA *info, + MARIA_BLOCK_INFO *block_info) +{ + DBUG_ENTER("unlink_deleted_block"); + if (block_info->filepos == info->s->state.dellink) + { + /* First deleted block; We can just use this ! 
*/ + info->s->state.dellink=block_info->next_filepos; + } + else + { + MARIA_BLOCK_INFO tmp; + tmp.second_read=0; + /* Unlink block from the previous block */ + if (!(_ma_get_block_info(info, &tmp, info->dfile.file, + block_info->prev_filepos) + & BLOCK_DELETED)) + DBUG_RETURN(1); /* Something is wrong */ + mi_sizestore(tmp.header+4,block_info->next_filepos); + if (info->s->file_write(info, tmp.header+4,8, + block_info->prev_filepos+4, MYF(MY_NABP))) + DBUG_RETURN(1); + /* Unlink block from next block */ + if (block_info->next_filepos != HA_OFFSET_ERROR) + { + if (!(_ma_get_block_info(info, &tmp, info->dfile.file, + block_info->next_filepos) + & BLOCK_DELETED)) + DBUG_RETURN(1); /* Something is wrong */ + mi_sizestore(tmp.header+12,block_info->prev_filepos); + if (info->s->file_write(info, tmp.header+12,8, + block_info->next_filepos+12, + MYF(MY_NABP))) + DBUG_RETURN(1); + } + } + /* We now have one less deleted block */ + info->state->del--; + info->state->empty-= block_info->block_len; + info->s->state.split--; + + /* + If this was a block that we where accessing through table scan + (maria_rrnd() or maria_scan(), then ensure that we skip over this block + when doing next maria_rrnd() or maria_scan(). + */ + if (info->cur_row.nextpos == block_info->filepos) + info->cur_row.nextpos+= block_info->block_len; + DBUG_RETURN(0); +} + + +/* + Add a backward link to delete block + + SYNOPSIS + update_backward_delete_link() + info MARIA handler + delete_block Position to delete block to update. + If this is 'HA_OFFSET_ERROR', nothing will be done + filepos Position to block that 'delete_block' should point to + + RETURN + 0 ok + 1 error. In this case my_error is set. 
+*/ + +static my_bool update_backward_delete_link(MARIA_HA *info, + my_off_t delete_block, + MARIA_RECORD_POS filepos) +{ + MARIA_BLOCK_INFO block_info; + DBUG_ENTER("update_backward_delete_link"); + + if (delete_block != HA_OFFSET_ERROR) + { + block_info.second_read=0; + if (_ma_get_block_info(info, &block_info, info->dfile.file, delete_block) + & BLOCK_DELETED) + { + uchar buff[8]; + mi_sizestore(buff,filepos); + if (info->s->file_write(info,buff, 8, delete_block+12, MYF(MY_NABP))) + DBUG_RETURN(1); /* Error on write */ + } + else + { + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + DBUG_RETURN(1); /* Wrong delete link */ + } + } + DBUG_RETURN(0); +} + +/* Delete datarecord from database */ +/* info->rec_cache.seek_not_done is updated in cmp_record */ + +static my_bool delete_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos, + uint second_read) +{ + uint length,b_type; + MARIA_BLOCK_INFO block_info,del_block; + int error; + my_bool remove_next_block; + DBUG_ENTER("delete_dynamic_record"); + + /* First add a link from the last block to the new one */ + error= update_backward_delete_link(info, info->s->state.dellink, filepos); + + block_info.second_read=second_read; + do + { + /* Remove block at 'filepos' */ + if ((b_type= _ma_get_block_info(info, &block_info, info->dfile.file, + filepos)) + & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR) || + (length=(uint) (block_info.filepos-filepos) +block_info.block_len) < + MARIA_MIN_BLOCK_LENGTH) + { + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + DBUG_RETURN(1); + } + /* Check if next block is a delete block */ + del_block.second_read=0; + remove_next_block=0; + if (_ma_get_block_info(info, &del_block, info->dfile.file, + filepos + length) & + BLOCK_DELETED && del_block.block_len+length < + MARIA_DYN_MAX_BLOCK_LENGTH) + { + /* We can't remove this yet as this block may be the head block */ + remove_next_block=1; + length+=del_block.block_len; + } + + block_info.header[0]=0; + 
mi_int3store(block_info.header+1,length); + mi_sizestore(block_info.header+4,info->s->state.dellink); + if (b_type & BLOCK_LAST) + bfill(block_info.header+12,8,255); + else + mi_sizestore(block_info.header+12,block_info.next_filepos); + if (info->s->file_write(info, block_info.header, 20, filepos, + MYF(MY_NABP))) + DBUG_RETURN(1); + info->s->state.dellink = filepos; + info->state->del++; + info->state->empty+=length; + filepos=block_info.next_filepos; + + /* Now it's safe to unlink the deleted block directly after this one */ + if (remove_next_block && unlink_deleted_block(info,&del_block)) + error=1; + } while (!(b_type & BLOCK_LAST)); + + DBUG_RETURN(error); +} + + + /* Write a block to datafile */ + +int _ma_write_part_record(MARIA_HA *info, + my_off_t filepos, /* points at empty block */ + ulong length, /* length of block */ + my_off_t next_filepos,/* Next empty block */ + uchar **record, /* pointer to record ptr */ + ulong *reclength, /* length of *record */ + int *flag) /* *flag == 0 if header */ +{ + ulong head_length,res_length,extra_length,long_block,del_length; + uchar *pos,*record_end; + my_off_t next_delete_block; + uchar temp[MARIA_SPLIT_LENGTH+MARIA_DYN_DELETE_BLOCK_HEADER]; + DBUG_ENTER("_ma_write_part_record"); + + next_delete_block=HA_OFFSET_ERROR; + + res_length=extra_length=0; + if (length > *reclength + MARIA_SPLIT_LENGTH) + { /* Splitt big block */ + res_length=MY_ALIGN(length- *reclength - MARIA_EXTEND_BLOCK_LENGTH, + MARIA_DYN_ALIGN_SIZE); + length-= res_length; /* Use this for first part */ + } + long_block= (length < 65520L && *reclength < 65520L) ? 
0 : 1; + if (length == *reclength+ 3 + long_block) + { + /* Block is exactly of the right length */ + temp[0]=(uchar) (1+ *flag)+(uchar) long_block; /* Flag is 0 or 6 */ + if (long_block) + { + mi_int3store(temp+1,*reclength); + head_length=4; + } + else + { + mi_int2store(temp+1,*reclength); + head_length=3; + } + } + else if (length-long_block < *reclength+4) + { /* To short block */ + if (next_filepos == HA_OFFSET_ERROR) + next_filepos= (info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end ? + info->s->state.dellink : info->state->data_file_length); + if (*flag == 0) /* First block */ + { + if (*reclength > MARIA_MAX_BLOCK_LENGTH) + { + head_length= 16; + temp[0]=13; + mi_int4store(temp+1,*reclength); + mi_int3store(temp+5,length-head_length); + mi_sizestore(temp+8,next_filepos); + } + else + { + head_length=5+8+long_block*2; + temp[0]=5+(uchar) long_block; + if (long_block) + { + mi_int3store(temp+1,*reclength); + mi_int3store(temp+4,length-head_length); + mi_sizestore(temp+7,next_filepos); + } + else + { + mi_int2store(temp+1,*reclength); + mi_int2store(temp+3,length-head_length); + mi_sizestore(temp+5,next_filepos); + } + } + } + else + { + head_length=3+8+long_block; + temp[0]=11+(uchar) long_block; + if (long_block) + { + mi_int3store(temp+1,length-head_length); + mi_sizestore(temp+4,next_filepos); + } + else + { + mi_int2store(temp+1,length-head_length); + mi_sizestore(temp+3,next_filepos); + } + } + } + else + { /* Block with empty info last */ + head_length=4+long_block; + extra_length= length- *reclength-head_length; + temp[0]= (uchar) (3+ *flag)+(uchar) long_block; /* 3,4 or 9,10 */ + if (long_block) + { + mi_int3store(temp+1,*reclength); + temp[4]= (uchar) (extra_length); + } + else + { + mi_int2store(temp+1,*reclength); + temp[3]= (uchar) (extra_length); + } + length= *reclength+head_length; /* Write only what is needed */ + } + DBUG_DUMP("header", temp, head_length); + + /* Make a long block for one write */ + record_end= 
*record+length-head_length; + del_length=(res_length ? MARIA_DYN_DELETE_BLOCK_HEADER : 0); + bmove((*record-head_length), temp, head_length); + memcpy(temp,record_end,(size_t) (extra_length+del_length)); + bzero(record_end, extra_length); + + if (res_length) + { + /* Check first if we can join this block with the next one */ + MARIA_BLOCK_INFO del_block; + my_off_t next_block=filepos+length+extra_length+res_length; + + del_block.second_read=0; + if (next_block < info->state->data_file_length && + info->s->state.dellink != HA_OFFSET_ERROR) + { + if ((_ma_get_block_info(info, &del_block, info->dfile.file, next_block) + & BLOCK_DELETED) && + res_length + del_block.block_len < MARIA_DYN_MAX_BLOCK_LENGTH) + { + if (unlink_deleted_block(info,&del_block)) + goto err; + res_length+=del_block.block_len; + } + } + + /* Create a delete link of the last part of the block */ + pos=record_end+extra_length; + pos[0]= '\0'; + mi_int3store(pos+1,res_length); + mi_sizestore(pos+4,info->s->state.dellink); + bfill(pos+12,8,255); /* End link */ + next_delete_block=info->s->state.dellink; + info->s->state.dellink= filepos+length+extra_length; + info->state->del++; + info->state->empty+=res_length; + info->s->state.split++; + } + if (info->opt_flag & WRITE_CACHE_USED && + info->update & HA_STATE_WRITE_AT_END) + { + if (info->update & HA_STATE_EXTEND_BLOCK) + { + info->update&= ~HA_STATE_EXTEND_BLOCK; + if (my_block_write(&info->rec_cache, *record-head_length, + length+extra_length+del_length,filepos)) + goto err; + } + else if (my_b_write(&info->rec_cache, *record-head_length, + length+extra_length+del_length)) + goto err; + } + else + { + info->rec_cache.seek_not_done=1; + if (info->s->file_write(info, *record-head_length, + length+extra_length+ + del_length,filepos,info->s->write_flag)) + goto err; + } + memcpy(record_end,temp,(size_t) (extra_length+del_length)); + *record=record_end; + *reclength-=(length-head_length); + *flag=6; + + if (del_length) + { + /* link the next delete block 
to this */ + if (update_backward_delete_link(info, next_delete_block, + info->s->state.dellink)) + goto err; + } + + DBUG_RETURN(0); +err: + DBUG_PRINT("exit",("errno: %d",my_errno)); + DBUG_RETURN(1); +} /* _ma_write_part_record */ + + + /* update record from datafile */ + +static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos, + uchar *record, ulong reclength) +{ + int flag; + uint error; + ulong length; + MARIA_BLOCK_INFO block_info; + DBUG_ENTER("update_dynamic_record"); + + flag=block_info.second_read=0; + /* + Check if we have enough room for the record. + First we do simplified check to make usual case faster. + Then we do more precise check for the space left. + Though it still is not absolutely precise, as + we always use MARIA_MAX_DYN_BLOCK_HEADER while it can be + less in the most of the cases. + */ + + /* + compare with just the reclength as we're going + to get some space from the old replaced record + */ + if (unlikely(info->s->base.max_data_file_length - + info->state->data_file_length < reclength)) + { + /* If new record isn't longer, we can go on safely */ + if (info->cur_row.total_length < reclength) + { + if (info->s->base.max_data_file_length - info->state->data_file_length + + info->state->empty - info->state->del * MARIA_MAX_DYN_BLOCK_HEADER < + reclength - info->cur_row.total_length + MARIA_MAX_DYN_BLOCK_HEADER) + { + my_errno=HA_ERR_RECORD_FILE_FULL; + goto err; + } + } + } + /* Remember length for updated row if it's updated again */ + info->cur_row.total_length= reclength; + + while (reclength > 0) + { + if (filepos != info->s->state.dellink) + { + block_info.next_filepos= HA_OFFSET_ERROR; + if ((error= _ma_get_block_info(info, &block_info, info->dfile.file, + filepos)) + & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + DBUG_PRINT("error",("Got wrong block info")); + if (!(error & BLOCK_FATAL_ERROR)) + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + goto err; + } + length=(ulong) 
(block_info.filepos-filepos) + block_info.block_len; + if (length < reclength) + { + uint tmp=MY_ALIGN(reclength - length + 3 + + MY_TEST(reclength >= 65520L), MARIA_DYN_ALIGN_SIZE); + /* Don't create a block bigger than MARIA_MAX_BLOCK_LENGTH */ + tmp= MY_MIN(length+tmp, MARIA_MAX_BLOCK_LENGTH)-length; + /* Check if we can extend this block */ + if (block_info.filepos + block_info.block_len == + info->state->data_file_length && + info->state->data_file_length < + info->s->base.max_data_file_length-tmp) + { + /* extend file */ + DBUG_PRINT("info",("Extending file with %d bytes",tmp)); + if (info->cur_row.nextpos == info->state->data_file_length) + info->cur_row.nextpos+= tmp; + info->state->data_file_length+= tmp; + info->update|= HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK; + length+=tmp; + } + else if (length < MARIA_MAX_BLOCK_LENGTH - MARIA_MIN_BLOCK_LENGTH) + { + /* + Check if next block is a deleted block + Above we have MARIA_MIN_BLOCK_LENGTH to avoid the problem where + the next block is so small it can't be splited which could + casue problems + */ + + MARIA_BLOCK_INFO del_block; + del_block.second_read=0; + if (_ma_get_block_info(info, &del_block, info->dfile.file, + block_info.filepos + block_info.block_len) & + BLOCK_DELETED) + { + /* Use; Unlink it and extend the current block */ + DBUG_PRINT("info",("Extending current block")); + if (unlink_deleted_block(info,&del_block)) + goto err; + if ((length+=del_block.block_len) > MARIA_MAX_BLOCK_LENGTH) + { + /* + New block was too big, link overflow part back to + delete list + */ + my_off_t next_pos; + ulong rest_length= length-MARIA_MAX_BLOCK_LENGTH; + set_if_bigger(rest_length, MARIA_MIN_BLOCK_LENGTH); + next_pos= del_block.filepos+ del_block.block_len - rest_length; + + if (update_backward_delete_link(info, info->s->state.dellink, + next_pos)) + DBUG_RETURN(1); + + /* create delete link for data that didn't fit into the page */ + del_block.header[0]=0; + mi_int3store(del_block.header+1, rest_length); + 
mi_sizestore(del_block.header+4,info->s->state.dellink); + bfill(del_block.header+12,8,255); + if (info->s->file_write(info, del_block.header, 20, + next_pos, MYF(MY_NABP))) + DBUG_RETURN(1); + info->s->state.dellink= next_pos; + info->s->state.split++; + info->state->del++; + info->state->empty+= rest_length; + length-= rest_length; + } + } + } + } + } + else + { + if (_ma_find_writepos(info,reclength,&filepos,&length)) + goto err; + } + if (_ma_write_part_record(info,filepos,length,block_info.next_filepos, + &record,&reclength,&flag)) + goto err; + if ((filepos=block_info.next_filepos) == HA_OFFSET_ERROR) + { + /* Start writing data on deleted blocks */ + filepos=info->s->state.dellink; + } + } + + if (block_info.next_filepos != HA_OFFSET_ERROR) + if (delete_dynamic_record(info,block_info.next_filepos,1)) + goto err; + + DBUG_RETURN(0); +err: + DBUG_RETURN(1); +} + + +/** + Pack a record. + + @return new reclength + @return 0 in case of wrong data in record +*/ + +uint _ma_rec_pack(MARIA_HA *info, register uchar *to, + register const uchar *from) +{ + uint length,new_length,flag,bit,i; + const uchar *pos,*end; + uchar *startpos,*packpos; + enum en_fieldtype type; + reg3 MARIA_COLUMNDEF *column; + MARIA_BLOB *blob; + DBUG_ENTER("_ma_rec_pack"); + + flag= 0; + bit= 1; + startpos= packpos=to; + to+= info->s->base.pack_bytes; + blob= info->blobs; + column= info->s->columndef; + if (info->s->base.null_bytes) + { + memcpy(to, from, info->s->base.null_bytes); + from+= info->s->base.null_bytes; + to+= info->s->base.null_bytes; + } + + for (i=info->s->base.fields ; i-- > 0; from+= length, column++) + { + length=(uint) column->length; + if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL) + { + if (type == FIELD_BLOB) + { + if (!blob->length) + flag|=bit; + else + { + char *temp_pos; + size_t tmp_length=length-portable_sizeof_char_ptr; + memcpy(to,from,tmp_length); + memcpy(&temp_pos,from+tmp_length,sizeof(char*)); + memcpy(to+tmp_length,temp_pos,(size_t) 
blob->length); + to+=tmp_length+blob->length; + } + blob++; + } + else if (type == FIELD_SKIP_ZERO) + { + if (memcmp(from, maria_zero_string, length) == 0) + flag|=bit; + else + { + memcpy(to, from, (size_t) length); + to+=length; + } + } + else if (type == FIELD_SKIP_ENDSPACE || + type == FIELD_SKIP_PRESPACE) + { + pos= from; end= from + length; + if (type == FIELD_SKIP_ENDSPACE) + { /* Pack trailing spaces */ + while (end > from && *(end-1) == ' ') + end--; + } + else + { /* Pack pref-spaces */ + while (pos < end && *pos == ' ') + pos++; + } + new_length=(uint) (end-pos); + if (new_length + 1 + MY_TEST(column->length > 255 && new_length > 127) + < length) + { + if (column->length > 255 && new_length > 127) + { + to[0]= (uchar) ((new_length & 127) + 128); + to[1]= (uchar) (new_length >> 7); + to+=2; + } + else + *to++= (uchar) new_length; + memcpy(to, pos, (size_t) new_length); to+=new_length; + flag|=bit; + } + else + { + memcpy(to,from,(size_t) length); to+=length; + } + } + else if (type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1); + uint tmp_length; + if (pack_length == 1) + { + tmp_length= (uint) *from; + *to++= *from; + } + else + { + tmp_length= uint2korr(from); + store_key_length_inc(to,tmp_length); + } + if (tmp_length > column->length) + { + my_errno= HA_ERR_WRONG_IN_RECORD; + DBUG_RETURN(0); + } + memcpy(to, from+pack_length,tmp_length); + to+= tmp_length; + continue; + } + else + { + memcpy(to,from,(size_t) length); to+=length; + continue; /* Normal field */ + } + if ((bit= bit << 1) >= 256) + { + *packpos++ = (uchar) flag; + bit=1; flag=0; + } + } + else + { + memcpy(to,from,(size_t) length); to+=length; + } + } + if (bit != 1) + *packpos= (uchar) flag; + if (info->s->calc_checksum) + *to++= (uchar) info->cur_row.checksum; + DBUG_PRINT("exit",("packed length: %d",(int) (to-startpos))); + DBUG_RETURN((uint) (to-startpos)); +} /* _ma_rec_pack */ + + + +/* + Check if a record was correctly packed. 
Used only by maria_chk + Returns 0 if record is ok. +*/ + +my_bool _ma_rec_check(MARIA_HA *info,const uchar *record, uchar *rec_buff, + ulong packed_length, my_bool with_checksum, + ha_checksum checksum) +{ + uint length,new_length,flag,bit,i; + const uchar *pos,*end; + uchar *packpos,*to; + enum en_fieldtype type; + reg3 MARIA_COLUMNDEF *column; + DBUG_ENTER("_ma_rec_check"); + + packpos=rec_buff; to= rec_buff+info->s->base.pack_bytes; + column= info->s->columndef; + flag= *packpos; bit=1; + record+= info->s->base.null_bytes; + to+= info->s->base.null_bytes; + + for (i=info->s->base.fields ; i-- > 0; record+= length, column++) + { + length=(uint) column->length; + if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL) + { + if (type == FIELD_BLOB) + { + uint blob_length= + _ma_calc_blob_length(length-portable_sizeof_char_ptr,record); + if (!blob_length && !(flag & bit)) + goto err; + if (blob_length) + to+=length - portable_sizeof_char_ptr+ blob_length; + } + else if (type == FIELD_SKIP_ZERO) + { + if (memcmp(record, maria_zero_string, length) == 0) + { + if (!(flag & bit)) + goto err; + } + else + to+=length; + } + else if (type == FIELD_SKIP_ENDSPACE || + type == FIELD_SKIP_PRESPACE) + { + pos= record; end= record + length; + if (type == FIELD_SKIP_ENDSPACE) + { /* Pack trailing spaces */ + while (end > record && *(end-1) == ' ') + end--; + } + else + { /* Pack pre-spaces */ + while (pos < end && *pos == ' ') + pos++; + } + new_length=(uint) (end-pos); + if (new_length + 1 + MY_TEST(column->length > 255 && new_length > 127) + < length) + { + if (!(flag & bit)) + goto err; + if (column->length > 255 && new_length > 127) + { + /* purecov: begin inspected */ + if (to[0] != (uchar) ((new_length & 127) + 128) || + to[1] != (uchar) (new_length >> 7)) + goto err; + to+=2; + /* purecov: end */ + } + else if (*to++ != (uchar) new_length) + goto err; + to+=new_length; + } + else + to+=length; + } + else if (type == FIELD_VARCHAR) + { + uint pack_length= 
HA_VARCHAR_PACKLENGTH(column->length -1); + uint tmp_length; + if (pack_length == 1) + { + tmp_length= (uint) *record; + to+= 1+ tmp_length; + continue; + } + else + { + tmp_length= uint2korr(record); + to+= get_pack_length(tmp_length)+tmp_length; + } + continue; + } + else + { + to+=length; + continue; /* Normal field */ + } + if ((bit= bit << 1) >= 256) + { + flag= *++packpos; + bit=1; + } + } + else + to+= length; + } + if (packed_length != (uint) (to - rec_buff) + + MY_TEST(info->s->calc_checksum) || (bit != 1 && (flag & ~(bit - 1)))) + goto err; + if (with_checksum && ((uchar) checksum != (uchar) *to)) + { + DBUG_PRINT("error",("wrong checksum for row")); + goto err; + } + DBUG_RETURN(0); + +err: + DBUG_RETURN(1); +} + + +/* + @brief Unpacks a record + + @return Recordlength + @retval >0 ok + @retval MY_FILE_ERROR (== -1) Error. + my_errno is set to HA_ERR_WRONG_IN_RECORD +*/ + +size_t _ma_rec_unpack(register MARIA_HA *info, register uchar *to, uchar *from, + size_t found_length) +{ + uint flag,bit,length,min_pack_length, column_length; + enum en_fieldtype type; + uchar *from_end,*to_end,*packpos; + reg3 MARIA_COLUMNDEF *column, *end_column; + DBUG_ENTER("_ma_rec_unpack"); + + to_end=to + info->s->base.reclength; + from_end=from+found_length; + flag= (uchar) *from; bit=1; packpos=from; + if (found_length < info->s->base.min_pack_length) + goto err; + from+= info->s->base.pack_bytes; + min_pack_length= info->s->base.min_pack_length - info->s->base.pack_bytes; + + if ((length= info->s->base.null_bytes)) + { + memcpy(to, from, length); + from+= length; + to+= length; + min_pack_length-= length; + } + + for (column= info->s->columndef, end_column= column + info->s->base.fields; + column < end_column ; to+= column_length, column++) + { + column_length= column->length; + if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL && + (type != FIELD_CHECK)) + { + if (type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(column_length-1); + if 
(pack_length == 1) + { + length= (uint) *(uchar*) from; + if (length > column_length-1) + goto err; + *to= *from++; + } + else + { + get_key_length(length, from); + if (length > column_length-2) + goto err; + int2store(to,length); + } + if (from+length > from_end) + goto err; + memcpy(to+pack_length, from, length); + MEM_UNDEFINED(to+pack_length + length, + column_length - length - pack_length); + from+= length; + min_pack_length--; + continue; + } + if (flag & bit) + { + if (type == FIELD_BLOB || type == FIELD_SKIP_ZERO) + bzero(to, column_length); + else if (type == FIELD_SKIP_ENDSPACE || + type == FIELD_SKIP_PRESPACE) + { + if (column->length > 255 && *from & 128) + { + if (from + 1 >= from_end) + goto err; + length= (*from & 127)+ ((uint) (uchar) *(from+1) << 7); from+=2; + } + else + { + if (from == from_end) + goto err; + length= (uchar) *from++; + } + min_pack_length--; + if (length >= column_length || + min_pack_length + length > (uint) (from_end - from)) + goto err; + if (type == FIELD_SKIP_ENDSPACE) + { + memcpy(to, from, (size_t) length); + bfill(to+length, column_length-length, ' '); + } + else + { + bfill(to, column_length-length, ' '); + memcpy(to+column_length-length, from, (size_t) length); + } + from+=length; + } + } + else if (type == FIELD_BLOB) + { + uint size_length=column_length- portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(size_length,from); + ulong from_left= (ulong) (from_end - from); + if (from_left < size_length || + from_left - size_length < blob_length || + from_left - size_length - blob_length < min_pack_length) + goto err; + memcpy(to, from, (size_t) size_length); + from+=size_length; + memcpy(to+size_length,(uchar*) &from,sizeof(char*)); + from+=blob_length; + } + else + { + if (type == FIELD_SKIP_ENDSPACE || type == FIELD_SKIP_PRESPACE) + min_pack_length--; + if (min_pack_length + column_length > (uint) (from_end - from)) + goto err; + memcpy(to, from, (size_t) column_length); from+=column_length; + } + if 
((bit= bit << 1) >= 256) + { + flag= (uchar) *++packpos; bit=1; + } + } + else + { + if (min_pack_length > (uint) (from_end - from)) + goto err; + min_pack_length-=column_length; + memcpy(to, from, (size_t) column_length); + from+=column_length; + } + } + if (info->s->calc_checksum) + info->cur_row.checksum= (uint) (uchar) *from++; + if (to == to_end && from == from_end && (bit == 1 || !(flag & ~(bit-1)))) + DBUG_RETURN(found_length); + +err: + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + DBUG_PRINT("error",("to_end: %p -> %p from_end: %p -> %p", + to, to_end, from, from_end)); + DBUG_DUMP("from", info->rec_buff, info->s->base.min_pack_length); + DBUG_RETURN(MY_FILE_ERROR); +} /* _ma_rec_unpack */ + + + /* Calc length of blob. Update info in blobs->length */ + +ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record) +{ + ulong length; + MARIA_BLOB *blob,*end; + + for (length=0, blob= info->blobs, end=blob+info->s->base.blobs ; + blob != end; + blob++) + { + blob->length= _ma_calc_blob_length(blob->pack_length, + record + blob->offset); + length+=blob->length; + } + return length; +} + + +ulong _ma_calc_blob_length(uint length, const uchar *pos) +{ + switch (length) { + case 1: + return (uint) (uchar) *pos; + case 2: + return (uint) uint2korr(pos); + case 3: + return uint3korr(pos); + case 4: + return uint4korr(pos); + default: + break; + } + return 0; /* Impossible */ +} + + +void _ma_store_blob_length(uchar *pos,uint pack_length,uint length) +{ + switch (pack_length) { + case 1: + *pos= (uchar) length; + break; + case 2: + int2store(pos,length); + break; + case 3: + int3store(pos,length); + break; + case 4: + int4store(pos,length); + default: + break; + } + return; +} + + +/* + Read record from datafile. + + SYNOPSIS + _ma_read_dynamic_record() + info MARIA_HA pointer to table. + filepos From where to read the record. + buf Destination for record. 
+ + NOTE + If a write buffer is active, it needs to be flushed if its contents + intersects with the record to read. We always check if the position + of the first uchar of the write buffer is lower than the position + past the last uchar to read. In theory this is also true if the write + buffer is completely below the read segment. That is, if there is no + intersection. But this case is unusual. We flush anyway. Only if the + first uchar in the write buffer is above the last uchar to read, we do + not flush. + + A dynamic record may need several reads. So this check must be done + before every read. Reading a dynamic record starts with reading the + block header. If the record does not fit into the free space of the + header, the block may be longer than the header. In this case a + second read is necessary. These one or two reads repeat for every + part of the record. + + RETURN + 0 OK + # Error number +*/ + +int _ma_read_dynamic_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos) +{ + int block_of_record; + uint b_type; + MARIA_BLOCK_INFO block_info; + File file; + uchar *UNINIT_VAR(to); + uint UNINIT_VAR(left_length); + MARIA_SHARE *share= info->s; + myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0); + DBUG_ENTER("_ma_read_dynamic_record"); + + if (filepos == HA_OFFSET_ERROR) + goto err; + + file= info->dfile.file; + block_of_record= 0; /* First block of record is numbered as zero. */ + block_info.second_read= 0; + do + { + /* A corrupted table can have wrong pointers. 
(Bug# 19835) */ + if (filepos == HA_OFFSET_ERROR) + goto panic; + if (info->opt_flag & WRITE_CACHE_USED && + (info->rec_cache.pos_in_file < filepos + + MARIA_BLOCK_INFO_HEADER_LENGTH) && + flush_io_cache(&info->rec_cache)) + goto err; + info->rec_cache.seek_not_done=1; + if ((b_type= _ma_get_block_info(info, &block_info, file, filepos)) & + (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED)) + my_errno=HA_ERR_RECORD_DELETED; + goto err; + } + if (block_of_record++ == 0) /* First block */ + { + info->cur_row.total_length= block_info.rec_len; + if (block_info.rec_len > (uint) share->base.max_pack_length) + goto panic; + if (share->base.blobs) + { + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + block_info.rec_len + + share->base.extra_rec_buff_size, flag)) + goto err; + } + to= info->rec_buff; + left_length=block_info.rec_len; + } + if (left_length < block_info.data_len || ! block_info.data_len) + goto panic; /* Wrong linked record */ + /* copy information that is already read */ + { + uint offset= (uint) (block_info.filepos - filepos); + uint prefetch_len= (sizeof(block_info.header) - offset); + filepos+= sizeof(block_info.header); + + if (prefetch_len > block_info.data_len) + prefetch_len= block_info.data_len; + if (prefetch_len) + { + memcpy(to, block_info.header + offset, prefetch_len); + block_info.data_len-= prefetch_len; + left_length-= prefetch_len; + to+= prefetch_len; + } + } + /* read rest of record from file */ + if (block_info.data_len) + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file < filepos + block_info.data_len && + flush_io_cache(&info->rec_cache)) + goto err; + /* + What a pity that this method is not called 'file_pread' and that + there is no equivalent without seeking. We are at the right + position already. 
:( + */ + if (share->file_read(info, to, block_info.data_len, + filepos, MYF(MY_NABP))) + goto panic; + left_length-=block_info.data_len; + to+=block_info.data_len; + } + filepos= block_info.next_filepos; + } while (left_length); + + info->update|= HA_STATE_AKTIV; /* We have a aktive record */ + fast_ma_writeinfo(info); + DBUG_RETURN(_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) != + MY_FILE_ERROR ? 0 : my_errno); + +err: + fast_ma_writeinfo(info); + DBUG_RETURN(my_errno); + +panic: + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + goto err; +} + + /* compare unique constraint between stored rows */ + +my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos) +{ + uchar *old_rec_buff,*old_record; + size_t old_rec_buff_size; + my_bool error, buff_alloced; + DBUG_ENTER("_ma_cmp_dynamic_unique"); + + alloc_on_stack(*info->stack_end_ptr, old_record, buff_alloced, + info->s->base.reclength); + if (!old_record) + DBUG_RETURN(1); + + /* Don't let the compare destroy blobs that may be in use */ + old_rec_buff= info->rec_buff; + old_rec_buff_size= info->rec_buff_size; + + if (info->s->base.blobs) + { + info->rec_buff= 0; + info->rec_buff_size= 0; + } + error= _ma_read_dynamic_record(info, old_record, pos) != 0; + if (!error) + error=_ma_unique_comp(def, record, old_record, def->null_are_equal) != 0; + if (info->s->base.blobs) + { + my_free(info->rec_buff); + info->rec_buff= old_rec_buff; + info->rec_buff_size= old_rec_buff_size; + } + stack_alloc_free(old_record, buff_alloced); + DBUG_RETURN(error); +} + + + /* Compare of record on disk with packed record in memory */ + +my_bool _ma_cmp_dynamic_record(register MARIA_HA *info, + register const uchar *record) +{ + uint flag, reclength, b_type,cmp_length; + my_off_t filepos; + uchar *buffer; + MARIA_BLOCK_INFO block_info; + my_bool error= 1, buff_alloced= 0; + size_t UNINIT_VAR(buffer_length); + DBUG_ENTER("_ma_cmp_dynamic_record"); + + if 
(info->opt_flag & WRITE_CACHE_USED) + { + info->update&= ~(HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK); + if (flush_io_cache(&info->rec_cache)) + DBUG_RETURN(1); + } + info->rec_cache.seek_not_done=1; + + /* If nobody have touched the database we don't have to test rec */ + + buffer=info->rec_buff; + if ((info->opt_flag & READ_CHECK_USED)) + { /* If check isn't disabled */ + if (info->s->base.blobs) + { + buffer_length= (info->s->base.pack_reclength + + _ma_calc_total_blob_length(info,record)); + + alloc_on_stack(*info->stack_end_ptr, buffer, buff_alloced, buffer_length); + if (!buffer) + DBUG_RETURN(1); + } + if (!(reclength= _ma_rec_pack(info,buffer,record))) + goto err; + + record= buffer; + + filepos= info->cur_row.lastpos; + flag=block_info.second_read=0; + block_info.next_filepos=filepos; + while (reclength > 0) + { + if ((b_type= _ma_get_block_info(info, &block_info, info->dfile.file, + block_info.next_filepos)) + & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED)) + my_errno=HA_ERR_RECORD_CHANGED; + goto err; + } + if (flag == 0) /* First block */ + { + flag=1; + if (reclength != block_info.rec_len) + { + my_errno=HA_ERR_RECORD_CHANGED; + goto err; + } + } else if (reclength < block_info.data_len) + { + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + goto err; + } + reclength-= block_info.data_len; + cmp_length= block_info.data_len; + if (!reclength && info->s->calc_checksum) + cmp_length--; /* 'record' may not contain checksum */ + + if (_ma_cmp_buffer(info->dfile.file, record, block_info.filepos, + cmp_length)) + { + my_errno=HA_ERR_RECORD_CHANGED; + goto err; + } + flag=1; + record+=block_info.data_len; + } + } + my_errno=0; + error= 0; +err: + stack_alloc_free(buffer, buff_alloced); + DBUG_PRINT("exit", ("result: %d", error)); + DBUG_RETURN(error); +} + + + /* Compare file to buffert */ + +static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos, + 
uint length) +{ + uint next_length; + uchar temp_buff[IO_SIZE*2]; + DBUG_ENTER("_ma_cmp_buffer"); + + next_length= IO_SIZE*2 - (uint) (filepos & (IO_SIZE-1)); + + while (length > IO_SIZE*2) + { + if (mysql_file_pread(file,temp_buff,next_length,filepos, MYF(MY_NABP)) || + memcmp(buff, temp_buff, next_length)) + goto err; + filepos+=next_length; + buff+=next_length; + length-= next_length; + next_length=IO_SIZE*2; + } + if (mysql_file_pread(file,temp_buff,length,filepos,MYF(MY_NABP))) + goto err; + DBUG_RETURN(memcmp(buff, temp_buff, length) != 0); +err: + DBUG_RETURN(1); +} + + +/* + Read next record from datafile during table scan. + + SYNOPSIS + _ma_read_rnd_dynamic_record() + info MARIA_HA pointer to table. + buf Destination for record. + filepos From where to read the record. + skip_deleted_blocks If to repeat reading until a non-deleted + record is found. + + NOTE + This is identical to _ma_read_dynamic_record(), except the following + cases: + + - If there is no active row at 'filepos', continue scanning for + an active row. (This is becasue the previous + _ma_read_rnd_dynamic_record() call stored the next block position + in filepos, but this position may not be a start block for a row + - We may have READ_CACHING enabled, in which case we use the cache + to read rows. + + For other comments, check _ma_read_dynamic_record() + + RETURN + 0 OK + != 0 Error number +*/ + +int _ma_read_rnd_dynamic_record(MARIA_HA *info, + uchar *buf, + MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks) +{ + int block_of_record; +#ifdef MARIA_EXTERNAL_LOCKING + int info_read; +#endif + uint left_len,b_type; + uchar *UNINIT_VAR(to); + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + myf flag= MY_WME | (share->temporary ? 
MY_THREAD_SPECIFIC : 0); + DBUG_ENTER("_ma_read_rnd_dynamic_record"); + +#ifdef MARIA_EXTERNAL_LOCKING + info_read=0; +#endif + + if (info->lock_type == F_UNLCK) + { +#ifndef UNSAFE_LOCKING +#else + info->tmp_lock_type=F_RDLCK; +#endif + } +#ifdef MARIA_EXTERNAL_LOCKING + else + info_read=1; /* memory-keyinfoblock is ok */ +#endif + + block_of_record= 0; /* First block of record is numbered as zero. */ + block_info.second_read= 0; + left_len=1; + do + { + if (filepos >= info->state->data_file_length) + { +#ifdef MARIA_EXTERNAL_LOCKING + if (!info_read) + { /* Check if changed */ + info_read=1; + info->rec_cache.seek_not_done=1; + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + goto panic; + } + if (filepos >= info->state->data_file_length) + { + my_errno= HA_ERR_END_OF_FILE; + goto err; + } +#else + my_errno= HA_ERR_END_OF_FILE; + goto err; +#endif + } + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(info, &info->rec_cache, block_info.header, filepos, + sizeof(block_info.header), + (!block_of_record && skip_deleted_blocks ? 
+ READING_NEXT : 0) | READING_HEADER)) + goto panic; + b_type= _ma_get_block_info(info, &block_info,-1,filepos); + } + else + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file < filepos + MARIA_BLOCK_INFO_HEADER_LENGTH && + flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + info->rec_cache.seek_not_done=1; + b_type= _ma_get_block_info(info, &block_info, info->dfile.file, filepos); + } + + if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR | + BLOCK_FATAL_ERROR)) + { + if ((b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + && skip_deleted_blocks) + { + filepos=block_info.filepos+block_info.block_len; + block_info.second_read=0; + continue; /* Search after next_record */ + } + if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR)) + { + my_errno= HA_ERR_RECORD_DELETED; + info->cur_row.lastpos= block_info.filepos; + info->cur_row.nextpos= block_info.filepos+block_info.block_len; + } + goto err; + } + if (block_of_record == 0) /* First block */ + { + info->cur_row.total_length= block_info.rec_len; + if (block_info.rec_len > (uint) share->base.max_pack_length) + goto panic; + info->cur_row.lastpos= filepos; + if (share->base.blobs) + { + if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + block_info.rec_len + + share->base.extra_rec_buff_size, flag)) + goto err; + } + to= info->rec_buff; + left_len=block_info.rec_len; + } + if (left_len < block_info.data_len) + goto panic; /* Wrong linked record */ + + /* copy information that is already read */ + { + uint offset=(uint) (block_info.filepos - filepos); + uint tmp_length= (sizeof(block_info.header) - offset); + filepos=block_info.filepos; + + if (tmp_length > block_info.data_len) + tmp_length= block_info.data_len; + if (tmp_length) + { + memcpy(to, block_info.header+offset, tmp_length); + block_info.data_len-=tmp_length; + left_len-=tmp_length; + to+=tmp_length; + filepos+=tmp_length; + } + } + /* read rest of record from file */ + if (block_info.data_len) + { + if 
(info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(info, &info->rec_cache, to,filepos, + block_info.data_len, + (!block_of_record && skip_deleted_blocks) ? + READING_NEXT : 0)) + goto panic; + } + else + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file < + block_info.filepos + block_info.data_len && + flush_io_cache(&info->rec_cache)) + goto err; + /* VOID(my_seek(info->dfile.file, filepos, MY_SEEK_SET, MYF(0))); */ + if (mysql_file_read(info->dfile.file, to, block_info.data_len, MYF(MY_NABP))) + { + if (my_errno == HA_ERR_FILE_TOO_SHORT) + { + /* Unexpected end of file */ + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + } + goto err; + } + } + } + /* + Increment block-of-record counter. If it was the first block, + remember the position behind the block for the next call. + */ + if (block_of_record++ == 0) + { + info->cur_row.nextpos= block_info.filepos+block_info.block_len; + skip_deleted_blocks=0; + } + left_len-=block_info.data_len; + to+=block_info.data_len; + filepos=block_info.next_filepos; + } while (left_len); + + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + fast_ma_writeinfo(info); + if (_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) != + MY_FILE_ERROR) + DBUG_RETURN(0); + DBUG_RETURN(my_errno); /* Wrong record */ + +panic: + /* Something is fatal wrong */ + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); +err: + fast_ma_writeinfo(info); + DBUG_RETURN(my_errno); +} + + + /* Read and process header from a dynamic-record-file */ + +uint _ma_get_block_info(MARIA_HA *handler, MARIA_BLOCK_INFO *info, File file, + my_off_t filepos) +{ + uint return_val=0; + uchar *header=info->header; + + if (file >= 0) + { + /* + We do not use my_pread() here because we want to have the file + pointer set to the end of the header after this function. + my_pread() may leave the file pointer untouched. 
+ */ + mysql_file_seek(file,filepos,MY_SEEK_SET,MYF(0)); + if (mysql_file_read(file, header, sizeof(info->header),MYF(0)) != + sizeof(info->header)) + { + /* + This is either an error or just reading at end of file. + Don't give a fatal error for this case. + */ + my_errno= HA_ERR_WRONG_IN_RECORD; + return BLOCK_ERROR; + } + } + DBUG_DUMP("header",header,MARIA_BLOCK_INFO_HEADER_LENGTH); + if (info->second_read) + { + if (info->header[0] <= 6 || info->header[0] == 13) + return_val=BLOCK_SYNC_ERROR; + } + else + { + if (info->header[0] > 6 && info->header[0] != 13) + return_val=BLOCK_SYNC_ERROR; + } + info->next_filepos= HA_OFFSET_ERROR; /* Dummy if no next block */ + + switch (info->header[0]) { + case 0: + if ((info->block_len=(uint) mi_uint3korr(header+1)) < + MARIA_MIN_BLOCK_LENGTH || + (info->block_len & (MARIA_DYN_ALIGN_SIZE -1))) + goto err; + info->filepos=filepos; + info->next_filepos=mi_sizekorr(header+4); + info->prev_filepos=mi_sizekorr(header+12); +#if SIZEOF_OFF_T == 4 + if ((mi_uint4korr(header+4) != 0 && + (mi_uint4korr(header+4) != (ulong) ~0 || + info->next_filepos != (ulong) ~0)) || + (mi_uint4korr(header+12) != 0 && + (mi_uint4korr(header+12) != (ulong) ~0 || + info->prev_filepos != (ulong) ~0))) + goto err; +#endif + return return_val | BLOCK_DELETED; /* Deleted block */ + + case 1: + info->rec_len=info->data_len=info->block_len=mi_uint2korr(header+1); + info->filepos=filepos+3; + return return_val | BLOCK_FIRST | BLOCK_LAST; + case 2: + info->rec_len=info->data_len=info->block_len=mi_uint3korr(header+1); + info->filepos=filepos+4; + return return_val | BLOCK_FIRST | BLOCK_LAST; + + case 13: + info->rec_len=mi_uint4korr(header+1); + info->block_len=info->data_len=mi_uint3korr(header+5); + info->next_filepos=mi_sizekorr(header+8); + info->second_read=1; + info->filepos=filepos+16; + return return_val | BLOCK_FIRST; + + case 3: + info->rec_len=info->data_len=mi_uint2korr(header+1); + info->block_len=info->rec_len+ (uint) header[3]; + 
info->filepos=filepos+4; + return return_val | BLOCK_FIRST | BLOCK_LAST; + case 4: + info->rec_len=info->data_len=mi_uint3korr(header+1); + info->block_len=info->rec_len+ (uint) header[4]; + info->filepos=filepos+5; + return return_val | BLOCK_FIRST | BLOCK_LAST; + + case 5: + info->rec_len=mi_uint2korr(header+1); + info->block_len=info->data_len=mi_uint2korr(header+3); + info->next_filepos=mi_sizekorr(header+5); + info->second_read=1; + info->filepos=filepos+13; + return return_val | BLOCK_FIRST; + case 6: + info->rec_len=mi_uint3korr(header+1); + info->block_len=info->data_len=mi_uint3korr(header+4); + info->next_filepos=mi_sizekorr(header+7); + info->second_read=1; + info->filepos=filepos+15; + return return_val | BLOCK_FIRST; + + /* The following blocks are identical to 1-6 without rec_len */ + case 7: + info->data_len=info->block_len=mi_uint2korr(header+1); + info->filepos=filepos+3; + return return_val | BLOCK_LAST; + case 8: + info->data_len=info->block_len=mi_uint3korr(header+1); + info->filepos=filepos+4; + return return_val | BLOCK_LAST; + + case 9: + info->data_len=mi_uint2korr(header+1); + info->block_len=info->data_len+ (uint) header[3]; + info->filepos=filepos+4; + return return_val | BLOCK_LAST; + case 10: + info->data_len=mi_uint3korr(header+1); + info->block_len=info->data_len+ (uint) header[4]; + info->filepos=filepos+5; + return return_val | BLOCK_LAST; + + case 11: + info->data_len=info->block_len=mi_uint2korr(header+1); + info->next_filepos=mi_sizekorr(header+3); + info->second_read=1; + info->filepos=filepos+11; + return return_val; + case 12: + info->data_len=info->block_len=mi_uint3korr(header+1); + info->next_filepos=mi_sizekorr(header+4); + info->second_read=1; + info->filepos=filepos+12; + return return_val; + } + +err: + if (!handler->in_check_table) + { + /* We may be scanning the table for new rows; Don't give an error */ + _ma_set_fatal_error(handler, HA_ERR_WRONG_IN_RECORD); + } + return BLOCK_ERROR; +} diff --git 
a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c new file mode 100644 index 00000000..425cb421 --- /dev/null +++ b/storage/maria/ma_extra.c @@ -0,0 +1,677 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "ma_blockrec.h" + +static void maria_extra_keyflag(MARIA_HA *info, + enum ha_extra_function function); + +/** + @brief Set options and buffers to optimize table handling + + @param name table's name + @param info open table + @param function operation + @param extra_arg Pointer to extra argument (normally pointer to + ulong); used when function is one of: + HA_EXTRA_WRITE_CACHE + HA_EXTRA_CACHE + + @return Operation status + @retval 0 ok + @retval !=0 error +*/ + +int maria_extra(MARIA_HA *info, enum ha_extra_function function, + void *extra_arg) +{ + int error= 0; + ulong cache_size; + MARIA_SHARE *share= info->s; + my_bool block_records= share->data_file_type == BLOCK_RECORD; + DBUG_ENTER("maria_extra"); + DBUG_PRINT("enter",("function: %d",(int) function)); + + switch (function) { + case HA_EXTRA_RESET_STATE: /* Reset state (don't free buffers) */ + info->lastinx= ~0; /* Detect index changes */ + info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR; + info->page_changed= 1; + /* Next/prev gives first/last */ + if (info->opt_flag & 
READ_CACHE_USED) + { + reinit_io_cache(&info->rec_cache,READ_CACHE,0, + (pbool) (info->lock_type != F_UNLCK), + (pbool) MY_TEST(info->update & HA_STATE_ROW_CHANGED) + ); + } + info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND | + HA_STATE_PREV_FOUND); + break; + case HA_EXTRA_CACHE: + if (block_records) + break; /* Not supported */ + + if (info->lock_type == F_UNLCK && + (share->options & HA_OPTION_PACK_RECORD)) + { + error= 1; /* Not possibly if not locked */ + my_errno= EACCES; + break; + } + if (info->s->file_map) /* Don't use cache if mmap */ + break; +#if defined(HAVE_MMAP) && defined(HAVE_MADVISE) + if ((share->options & HA_OPTION_COMPRESS_RECORD)) + { + mysql_mutex_lock(&share->intern_lock); + if (_ma_memmap_file(info)) + { + /* We don't nead MADV_SEQUENTIAL if small file */ + madvise((char*) share->file_map, share->state.state.data_file_length, + share->state.state.data_file_length <= RECORD_CACHE_SIZE*16 ? + MADV_RANDOM : MADV_SEQUENTIAL); + mysql_mutex_unlock(&share->intern_lock); + break; + } + mysql_mutex_unlock(&share->intern_lock); + } +#endif + if (info->opt_flag & WRITE_CACHE_USED) + { + info->opt_flag&= ~WRITE_CACHE_USED; + if ((error= end_io_cache(&info->rec_cache))) + break; + } + if (!(info->opt_flag & + (READ_CACHE_USED | WRITE_CACHE_USED | MEMMAP_USED))) + { + cache_size= (extra_arg ? 
*(ulong*) extra_arg : + my_default_record_cache_size); + if (!(init_io_cache(&info->rec_cache, info->dfile.file, + (uint) MY_MIN(share->state.state.data_file_length+1, + cache_size), + READ_CACHE,0L,(pbool) (info->lock_type != F_UNLCK), + MYF(share->write_flag & MY_WAIT_IF_FULL)))) + { + info->opt_flag|= READ_CACHE_USED; + info->update&= ~HA_STATE_ROW_CHANGED; + } + if (share->non_transactional_concurrent_insert) + info->rec_cache.end_of_file= info->state->data_file_length; + } + break; + case HA_EXTRA_REINIT_CACHE: + if (info->opt_flag & READ_CACHE_USED) + { + reinit_io_cache(&info->rec_cache, READ_CACHE, info->cur_row.nextpos, + (pbool) (info->lock_type != F_UNLCK), + (pbool) MY_TEST(info->update & HA_STATE_ROW_CHANGED)); + info->update&= ~HA_STATE_ROW_CHANGED; + if (share->non_transactional_concurrent_insert) + info->rec_cache.end_of_file= info->state->data_file_length; + } + break; + case HA_EXTRA_WRITE_CACHE: + if (info->lock_type == F_UNLCK) + { + error= 1; /* Not possibly if not locked */ + break; + } + if (block_records) + break; /* Not supported */ + + cache_size= (extra_arg ? 
*(ulong*) extra_arg : + my_default_record_cache_size); + if (!(info->opt_flag & + (READ_CACHE_USED | WRITE_CACHE_USED | OPT_NO_ROWS)) && + !share->state.header.uniques) + if (!(init_io_cache(&info->rec_cache, info->dfile.file, cache_size, + WRITE_CACHE, info->state->data_file_length, + (pbool) (info->lock_type != F_UNLCK), + MYF(share->write_flag & MY_WAIT_IF_FULL)))) + { + info->opt_flag|= WRITE_CACHE_USED; + info->update&= ~(HA_STATE_ROW_CHANGED | + HA_STATE_WRITE_AT_END | + HA_STATE_EXTEND_BLOCK); + } + break; + case HA_EXTRA_PREPARE_FOR_UPDATE: + if (info->s->data_file_type != DYNAMIC_RECORD) + break; + /* Remove read/write cache if dynamic rows */ + /* fall through */ + case HA_EXTRA_NO_CACHE: + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + error= end_io_cache(&info->rec_cache); + /* Sergei will insert full text index caching here */ + } +#if defined(HAVE_MMAP) && defined(HAVE_MADVISE) + if (info->opt_flag & MEMMAP_USED) + madvise((char*) share->file_map, share->state.state.data_file_length, + MADV_RANDOM); +#endif + break; + case HA_EXTRA_FLUSH_CACHE: + if (info->opt_flag & WRITE_CACHE_USED) + { + if ((error= flush_io_cache(&info->rec_cache))) + { + /* Fatal error found */ + _ma_set_fatal_error(info, HA_ERR_CRASHED); + } + } + break; + case HA_EXTRA_NO_READCHECK: + info->opt_flag&= ~READ_CHECK_USED; /* No readcheck */ + break; + case HA_EXTRA_READCHECK: + info->opt_flag|= READ_CHECK_USED; + break; + case HA_EXTRA_KEYREAD: /* Read only keys to record */ + case HA_EXTRA_REMEMBER_POS: + info->opt_flag|= REMEMBER_OLD_POS; + bmove(info->last_key.data + share->base.max_key_length*2, + info->last_key.data, + info->last_key.data_length + info->last_key.ref_length); + info->save_update= info->update; + info->save_lastinx= info->lastinx; + info->save_lastpos= info->cur_row.lastpos; + info->save_lastkey_data_length= info->last_key.data_length; + info->save_lastkey_ref_length= 
info->last_key.ref_length; + if (function == HA_EXTRA_REMEMBER_POS) + break; + /* fall through */ + case HA_EXTRA_KEYREAD_CHANGE_POS: + info->opt_flag|= KEY_READ_USED; + info->read_record= _ma_read_key_record; + break; + case HA_EXTRA_NO_KEYREAD: + case HA_EXTRA_RESTORE_POS: + if (info->opt_flag & REMEMBER_OLD_POS) + { + bmove(info->last_key.data, + info->last_key.data + share->base.max_key_length*2, + info->save_lastkey_data_length + info->save_lastkey_ref_length); + info->update= info->save_update | HA_STATE_WRITTEN; + if (info->lastinx != info->save_lastinx) /* Index changed */ + { + info->lastinx = info->save_lastinx; + info->last_key.keyinfo= info->s->keyinfo + info->lastinx; + info->last_key.flag= 0; + info->page_changed=1; + } + info->cur_row.lastpos= info->save_lastpos; + info->last_key.data_length= info->save_lastkey_data_length; + info->last_key.ref_length= info->save_lastkey_ref_length; + info->last_key.flag= 0; + } + info->read_record= share->read_record; + info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS); + break; + case HA_EXTRA_NO_USER_CHANGE: /* Database is somehow locked agains changes */ + info->lock_type= F_EXTRA_LCK; /* Simulate as locked */ + break; + case HA_EXTRA_WAIT_LOCK: + info->lock_wait= 0; + break; + case HA_EXTRA_NO_WAIT_LOCK: + info->lock_wait= MY_SHORT_WAIT; + break; + case HA_EXTRA_NO_KEYS: + if (share->s3_path) /* Not supported with S3 */ + break; + + /* we're going to modify pieces of the state, stall Checkpoint */ + mysql_mutex_lock(&share->intern_lock); + if (info->lock_type == F_UNLCK) + { + mysql_mutex_unlock(&share->intern_lock); + error= 1; /* Not possibly if not lock */ + break; + } + if (maria_is_any_key_active(share->state.key_map)) + { + MARIA_KEYDEF *key= share->keyinfo; + uint i; + for (i =0 ; i < share->base.keys ; i++,key++) + { + if (!(key->flag & HA_NOSAME) && info->s->base.auto_key != i+1) + { + maria_clear_key_active(share->state.key_map, i); + info->update|= HA_STATE_CHANGED; + } + } + + if (!share->changed) 
+ { + share->changed= 1; /* Update on close */ + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; + if (!share->global_changed) + { + share->global_changed= 1; + share->state.open_count++; + } + } + if (!share->now_transactional) + share->state.state= *info->state; + /* + That state write to disk must be done, even for transactional tables; + indeed the table's share is going to be lost (there was a + HA_EXTRA_FORCE_REOPEN before, which set share->last_version to + 0), and so the only way it leaves information (share->state.key_map) + for the posterity is by writing it to disk. + */ + DBUG_ASSERT(!maria_in_recovery); + error= _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO); + } + mysql_mutex_unlock(&share->intern_lock); + break; + case HA_EXTRA_FORCE_REOPEN: + /* + MySQL uses this case after it has closed all other instances + of this table. + We however do a flush here for additional safety. + */ + /** @todo consider porting these flush-es to MyISAM */ + error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE); + if (!error && share->changed) + { + mysql_mutex_lock(&share->intern_lock); + error= _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET| + MA_STATE_INFO_WRITE_FULL_INFO); + mysql_mutex_unlock(&share->intern_lock); + } + mysql_mutex_lock(&THR_LOCK_maria); + mysql_mutex_lock(&share->intern_lock); /* protect against Checkpoint */ + /* Safety against assert in checkpoint */ + share->bitmap.changed_not_flushed= 0; + /* this makes the share not be re-used next time the table is opened */ + share->last_version= 0L; /* Impossible version */ + mysql_mutex_unlock(&share->intern_lock); + mysql_mutex_unlock(&THR_LOCK_maria); + break; + case HA_EXTRA_PREPARE_FOR_DROP: + /* Signals about intent to delete this table */ + share->deleting= TRUE; + share->global_changed= FALSE; /* force writing changed flag */ + /* To force repair if 
reopened */ + share->state.open_count= 1; + share->changed= 1; + _ma_mark_file_changed_now(share); + if (share->temporary) + break; + /* fall through */ + case HA_EXTRA_PREPARE_FOR_RENAME: + { + my_bool do_flush= MY_TEST(function != HA_EXTRA_PREPARE_FOR_DROP); + my_bool save_global_changed; + enum flush_type type; + DBUG_ASSERT(!share->temporary); + /* + This share, to have last_version=0, needs to save all its data/index + blocks to disk if this is not for a DROP TABLE. Otherwise they would be + invisible to future openers; and they could even go to disk late and + cancel the work of future openers. + */ + if (info->lock_type != F_UNLCK && !info->was_locked) + { + info->was_locked= info->lock_type; + if (maria_lock_database(info, F_UNLCK)) + error= my_errno; + info->lock_type= F_UNLCK; + } + /* + We don't need to call _mi_decrement_open_count() if we are + dropping the table, as the files will be removed anyway. If we + are aborted before the files is removed, it's better to not + call it as in that case the automatic repair on open will add + the missing index entries + */ + mysql_mutex_lock(&share->intern_lock); + if (share->kfile.file >= 0 && function != HA_EXTRA_PREPARE_FOR_DROP) + _ma_decrement_open_count(info, 0); + if (info->trn) + { + _ma_remove_table_from_trnman(info); + /* Ensure we don't point to the deleted data in trn */ + info->state= info->state_start= &share->state.state; + } + /* Remove history for table */ + _ma_reset_state(info); + + type= do_flush ? 
FLUSH_RELEASE : FLUSH_IGNORE_CHANGED; + save_global_changed= share->global_changed; + share->global_changed= 1; /* Don't increment open count */ + mysql_mutex_unlock(&share->intern_lock); + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + type, type)) + { + error=my_errno; + share->changed= 1; + } + mysql_mutex_lock(&share->intern_lock); + share->global_changed= save_global_changed; + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + if (end_io_cache(&info->rec_cache)) + error= 1; + } + if (share->kfile.file >= 0 && share->s3_path == 0) + { + if (do_flush) + { + /* Save the state so that others can find it from disk. */ + if (share->changed && + (_ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO) || + mysql_file_sync(share->kfile.file, MYF(0)))) + error= my_errno; + } + else + { + /* be sure that state is not tried for write as file may be closed */ + share->changed= 0; + share->global_changed= 0; + share->state.open_count= 0; + } + } + if (share->data_file_type == BLOCK_RECORD && + share->bitmap.file.file >= 0 && share->s3_path == 0) + { + DBUG_ASSERT(share->bitmap.non_flushable == 0 && + share->bitmap.changed == 0); + if (do_flush && my_sync(share->bitmap.file.file, MYF(0))) + error= my_errno; + share->bitmap.changed_not_flushed= 0; + } + /* last_version must be protected by intern_lock; See collect_tables() */ + share->last_version= 0L; /* Impossible version */ + mysql_mutex_unlock(&share->intern_lock); + break; + } + case HA_EXTRA_PREPARE_FOR_FORCED_CLOSE: + if (info->trn) + { + mysql_mutex_lock(&share->intern_lock); + _ma_remove_table_from_trnman(info); + /* Ensure we don't point to the deleted data in trn */ + info->state= info->state_start= &share->state.state; + mysql_mutex_unlock(&share->intern_lock); + } + break; + case HA_EXTRA_FLUSH: + if (!share->temporary) + error= _ma_flush_table_files(info, 
MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_KEEP, FLUSH_KEEP); + + mysql_mutex_lock(&share->intern_lock); + /* Tell maria_lock_database() that we locked the intern_lock mutex */ + info->intern_lock_locked= 1; + _ma_decrement_open_count(info, 1); + info->intern_lock_locked= 0; + if (share->not_flushed) + { + share->not_flushed= 0; + if (_ma_sync_table_files(info)) + error= my_errno; + if (error) + { + /* Fatal error found */ + share->changed= 1; + _ma_set_fatal_error(info, HA_ERR_CRASHED); + } + } + mysql_mutex_unlock(&share->intern_lock); + break; + case HA_EXTRA_NORMAL: /* Theese isn't in use */ + info->quick_mode= 0; + break; + case HA_EXTRA_QUICK: + info->quick_mode= 1; + break; + case HA_EXTRA_NO_ROWS: + if (!share->state.header.uniques) + info->opt_flag|= OPT_NO_ROWS; + break; + case HA_EXTRA_PRELOAD_BUFFER_SIZE: + info->preload_buff_size= *((ulong *) extra_arg); + break; + case HA_EXTRA_CHANGE_KEY_TO_UNIQUE: + case HA_EXTRA_CHANGE_KEY_TO_DUP: + maria_extra_keyflag(info, function); + break; + case HA_EXTRA_MMAP: +#ifdef HAVE_MMAP + if (block_records) + break; /* Not supported */ + mysql_mutex_lock(&share->intern_lock); + /* + Memory map the data file if it is not already mapped. It is safe + to memory map a file while other threads are using file I/O on it. + Assigning a new address to a function pointer is an atomic + operation. intern_lock prevents that two or more mappings are done + at the same time. 
+ */ + if (!share->file_map) + { + if (_ma_dynmap_file(info, share->state.state.data_file_length)) + { + DBUG_PRINT("warning",("mmap failed: errno: %d",errno)); + error= my_errno= errno; + } + else + { + share->file_read= _ma_mmap_pread; + share->file_write= _ma_mmap_pwrite; + } + } + mysql_mutex_unlock(&share->intern_lock); +#endif + break; + case HA_EXTRA_MARK_AS_LOG_TABLE: + mysql_mutex_lock(&share->intern_lock); + share->is_log_table= TRUE; + mysql_mutex_unlock(&share->intern_lock); + break; + case HA_EXTRA_KEY_CACHE: + case HA_EXTRA_NO_KEY_CACHE: + default: + break; + } + DBUG_RETURN(error); +} /* maria_extra */ + + +void ma_set_index_cond_func(MARIA_HA *info, index_cond_func_t func, + void *func_arg) +{ + info->index_cond_func= func; + info->index_cond_func_arg= func_arg; +} + + +/* + Start/Stop Inserting Duplicates Into a Table, WL#1648. +*/ + +static void maria_extra_keyflag(MARIA_HA *info, + enum ha_extra_function function) +{ + uint idx; + + for (idx= 0; idx< info->s->base.keys; idx++) + { + switch (function) { + case HA_EXTRA_CHANGE_KEY_TO_UNIQUE: + info->s->keyinfo[idx].flag|= HA_NOSAME; + break; + case HA_EXTRA_CHANGE_KEY_TO_DUP: + info->s->keyinfo[idx].flag&= ~(HA_NOSAME); + break; + default: + break; + } + } +} + + +int maria_reset(MARIA_HA *info) +{ + int error= 0; + MARIA_SHARE *share= info->s; + myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0); + DBUG_ENTER("maria_reset"); + /* + Free buffers and reset the following flags: + EXTRA_CACHE, EXTRA_WRITE_CACHE, EXTRA_KEYREAD, EXTRA_QUICK + + If the row buffer cache is large (for dynamic tables), reduce it + to save memory. 
+ */ + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + error= end_io_cache(&info->rec_cache); + } + /* Free memory used for keeping blobs */ + if (share->base.blobs) + { + if (info->rec_buff_size > share->base.default_rec_buff_size) + { + info->rec_buff_size= 1; /* Force realloc */ + _ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, + share->base.default_rec_buff_size, flag); + } + if (info->blob_buff_size > MARIA_SMALL_BLOB_BUFFER) + { + info->blob_buff_size= 1; /* Force realloc */ + _ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size, + MARIA_SMALL_BLOB_BUFFER, flag); + } + } +#if defined(HAVE_MMAP) && defined(HAVE_MADVISE) + if (info->opt_flag & MEMMAP_USED) + madvise((char*) share->file_map, share->state.state.data_file_length, + MADV_RANDOM); +#endif + info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS); + info->quick_mode= 0; + info->lastinx= ~0; /* detect index changes */ + info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR; + info->page_changed= 1; + info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND | + HA_STATE_PREV_FOUND); + info->error_count= 0; + DBUG_RETURN(error); +} + + +int _ma_sync_table_files(const MARIA_HA *info) +{ + return (mysql_file_sync(info->dfile.file, MYF(MY_WME)) || + mysql_file_sync(info->s->kfile.file, MYF(MY_WME))); +} + +uint _ma_file_callback_to_id(void *callback_data) +{ + MARIA_SHARE *share= (MARIA_SHARE*) callback_data; + return share ? share->id : 0; +} + + +/** + @brief flushes the data and/or index file of a table + + This is useful when one wants to read a table using OS syscalls (like + my_copy()) and first wants to be sure that MySQL-level caches go down to + the OS so that OS syscalls can see all data. It can flush rec_cache, + bitmap, pagecache of data file, pagecache of index file. 
+ + @param info table + @param flush_data_or_index one or two of these flags: + MARIA_FLUSH_DATA, MARIA_FLUSH_INDEX + @param flush_type_for_data + @param flush_type_for_index + + @note does not sync files (@see _ma_sync_table_files()). + @note Progressively this function will be used in all places where we flush + the index but not the data file (probable bugs). + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index, + enum flush_type flush_type_for_data, + enum flush_type flush_type_for_index) +{ + int error= 0; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_flush_table_files"); + + /* flush data file first because it's more critical */ + if (flush_data_or_index & MARIA_FLUSH_DATA) + { + if ((info->opt_flag & WRITE_CACHE_USED) && + flush_type_for_data != FLUSH_IGNORE_CHANGED && + flush_io_cache(&info->rec_cache)) + error= 1; + if (share->data_file_type == BLOCK_RECORD) + { + if (flush_type_for_data != FLUSH_IGNORE_CHANGED) + { + if (_ma_bitmap_flush(share)) + error= 1; + } + else + { + mysql_mutex_lock(&share->bitmap.bitmap_lock); + share->bitmap.changed= 0; + share->bitmap.changed_not_flushed= 0; + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + } + if (flush_pagecache_blocks(share->pagecache, &info->dfile, + flush_type_for_data)) + error= 1; + } + } + if ((flush_data_or_index & MARIA_FLUSH_INDEX) && + flush_pagecache_blocks(share->pagecache, &share->kfile, + flush_type_for_index)) + error= 1; + if (!error) + DBUG_RETURN(0); + + _ma_set_fatal_error(info, HA_ERR_CRASHED); + DBUG_RETURN(1); +} + + +my_bool ma_killed_standalone(MARIA_HA *info __attribute__((unused))) +{ + return 0; +} diff --git a/storage/maria/ma_ft_boolean_search.c b/storage/maria/ma_ft_boolean_search.c new file mode 100644 index 00000000..a7bc2a7f --- /dev/null +++ b/storage/maria/ma_ft_boolean_search.c @@ -0,0 +1,1052 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is 
free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* TODO: add caching - pre-read several index entries at once */ + +/* + Added optimization for full-text queries with plus-words. It was + implemented by sharing maximal document id (max_docid) variable + inside plus subtree. max_docid could be used by any word in plus + subtree, but it could be updated by plus-word only. + + Fulltext "smarter index merge" optimization assumes that rows + it gets are ordered by doc_id. That is not the case when we + search for a word with truncation operator. It may return + rows in random order. Thus we may not use "smarter index merge" + optimization with "trunc-words". + + The idea is: there is no need to search for docid smaller than + biggest docid inside current plus subtree or any upper plus subtree. + + Examples: + +word1 word2 + share same max_docid + max_docid updated by word1 + +word1 +(word2 word3) + share same max_docid + max_docid updated by word1 + +(word1 -word2) +(+word3 word4) + share same max_docid + max_docid updated by word3 + +word1 word2 (+word3 word4 (+word5 word6)) + three subexpressions (including the top-level one), + every one has its own max_docid, updated by its plus word. 
+ but for the search word6 uses + MY_MAX(word1.max_docid, word3.max_docid, word5.max_docid), + while word4 uses, accordingly, + MY_MAX(word1.max_docid, word3.max_docid). +*/ + +#define FT_CORE +#include "ma_ftdefs.h" + +/* search with boolean queries */ + +static double _wghts[11]= +{ + 0.131687242798354, + 0.197530864197531, + 0.296296296296296, + 0.444444444444444, + 0.666666666666667, + 1.000000000000000, + 1.500000000000000, + 2.250000000000000, + 3.375000000000000, + 5.062500000000000, + 7.593750000000000}; +static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */ + +static double _nwghts[11]= +{ + -0.065843621399177, + -0.098765432098766, + -0.148148148148148, + -0.222222222222222, + -0.333333333333334, + -0.500000000000000, + -0.750000000000000, + -1.125000000000000, + -1.687500000000000, + -2.531250000000000, + -3.796875000000000}; +static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */ + +#define FTB_FLAG_TRUNC 1 +/* At most one of the following flags can be set */ +#define FTB_FLAG_YES 2 +#define FTB_FLAG_NO 4 +#define FTB_FLAG_WONLY 8 + +typedef struct st_ftb_expr FTB_EXPR; +struct st_ftb_expr +{ + FTB_EXPR *up; + uint flags; +/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ + my_off_t docid[2]; + my_off_t max_docid; + float weight; + float cur_weight; + LIST *phrase; /* phrase words */ + LIST *document; /* for phrase search */ + uint yesses; /* number of "yes" words matched */ + uint nos; /* number of "no" words matched */ + uint ythresh; /* number of "yes" words in expr */ + uint yweaks; /* number of "yes" words for scan only */ +}; + +typedef struct st_ftb_word +{ + FTB_EXPR *up; + uint flags; +/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */ + my_off_t docid[2]; /* for index search and for scan */ + my_off_t key_root; + FTB_EXPR *max_docid_expr; + MARIA_KEYDEF *keyinfo; + struct st_ftb_word *prev; + float weight; + uint ndepth; + uint len; + uchar off; + uchar word[1]; +} FTB_WORD; + +typedef struct st_ft_info +{ + struct _ft_vft 
*please; + MARIA_HA *info; + CHARSET_INFO *charset; + FTB_EXPR *root; + FTB_WORD **list; + FTB_WORD *last_word; + MEM_ROOT mem_root; + QUEUE queue; + TREE no_dupes; + my_off_t lastpos; + uint keynr; + uchar with_scan; + enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state; +} FTB; + +static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b) +{ + int i; + + /* if a==curdoc, take it as a < b */ + if (v && a->docid[0] == *v) + return -1; + + /* ORDER BY docid, ndepth DESC */ + i=CMP_NUM(a->docid[0], b->docid[0]); + if (!i) + i=CMP_NUM(b->ndepth,a->ndepth); + return i; +} + +static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b) +{ + /* ORDER BY word, ndepth */ + int i= ha_compare_word(cs, (uchar*) (*a)->word + 1, (*a)->len - 1, + (uchar*) (*b)->word + 1, (*b)->len - 1); + if (!i) + i=CMP_NUM((*a)->ndepth, (*b)->ndepth); + return i; +} + + +typedef struct st_my_ftb_param +{ + FTB *ftb; + FTB_EXPR *ftbe; + uchar *up_quot; + uint depth; +} MY_FTB_PARAM; + + +static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param, + const char *word, int word_len, + MYSQL_FTPARSER_BOOLEAN_INFO *info) +{ + MY_FTB_PARAM *ftb_param= param->mysql_ftparam; + FTB_WORD *ftbw; + FTB_EXPR *ftbe, *tmp_expr; + FT_WORD *phrase_word; + LIST *tmp_element; + int r= info->weight_adjust; + float weight= (float) + (info->wasign ? 
nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)]; + + switch (info->type) { + case FT_TOKEN_WORD: + ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root, + sizeof(FTB_WORD) + HA_MAX_KEY_BUFF); + ftbw->len= word_len + 1; + ftbw->flags= 0; + ftbw->off= 0; + if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES; + if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO; + if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC; + ftbw->weight= weight; + ftbw->up= ftb_param->ftbe; + ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR; + ftbw->ndepth= (info->yesno < 0) + ftb_param->depth; + ftbw->key_root= HA_OFFSET_ERROR; + memcpy(ftbw->word + 1, word, word_len); + ftbw->word[0]= word_len; + if (info->yesno > 0) ftbw->up->ythresh++; + ftb_param->ftb->queue.max_elements++; + ftbw->prev= ftb_param->ftb->last_word; + ftb_param->ftb->last_word= ftbw; + ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC); + for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up) + if (! (tmp_expr->flags & FTB_FLAG_YES)) + break; + ftbw->max_docid_expr= tmp_expr; + /* fall through */ + case FT_TOKEN_STOPWORD: + if (! ftb_param->up_quot) break; + phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); + tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); + phrase_word->pos= (uchar*)word; + phrase_word->len= word_len; + tmp_element->data= (void *)phrase_word; + ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element); + /* Allocate document list at this point. 
+ It allows to avoid huge amount of allocs/frees for each row.*/ + tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST)); + tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD)); + ftb_param->ftbe->document= + list_add(ftb_param->ftbe->document, tmp_element); + break; + case FT_TOKEN_LEFT_PAREN: + ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR)); + ftbe->flags= 0; + if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES; + if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO; + ftbe->weight= weight; + ftbe->up= ftb_param->ftbe; + ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; + ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR; + ftbe->phrase= NULL; + ftbe->document= 0; + if (info->quot) ftb_param->ftb->with_scan|= 2; + if (info->yesno > 0) ftbe->up->ythresh++; + ftb_param->ftbe= ftbe; + ftb_param->depth++; + ftb_param->up_quot= (uchar*)info->quot; + break; + case FT_TOKEN_RIGHT_PAREN: + if (ftb_param->ftbe->document) + { + /* Circuit document list */ + for (tmp_element= ftb_param->ftbe->document; + tmp_element->next; tmp_element= tmp_element->next) /* no-op */; + tmp_element->next= ftb_param->ftbe->document; + ftb_param->ftbe->document->prev= tmp_element; + } + info->quot= 0; + if (ftb_param->ftbe->up) + { + DBUG_ASSERT(ftb_param->depth); + ftb_param->ftbe= ftb_param->ftbe->up; + ftb_param->depth--; + ftb_param->up_quot= 0; + } + break; + case FT_TOKEN_EOF: + default: + break; + } + return(0); +} + + +static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param, + const char *query, int len) +{ + MY_FTB_PARAM *ftb_param= param->mysql_ftparam; + MYSQL_FTPARSER_BOOLEAN_INFO info; + CHARSET_INFO *cs= ftb_param->ftb->charset; + const uchar **start= (const uchar**) &query; + uchar *end= (uchar*) query + len; + FT_WORD w; + + info.prev= ' '; + info.quot= 0; + while (maria_ft_get_word(cs, start, end, &w, &info)) + param->mysql_add_word(param, (char*)w.pos, w.len, &info); + return(0); +} + + +static int 
_ftb_parse_query(FTB *ftb, uchar *query, uint len, + struct st_mysql_ftparser *parser) +{ + MYSQL_FTPARSER_PARAM *param; + MY_FTB_PARAM ftb_param; + DBUG_ENTER("_ftb_parse_query"); + DBUG_ASSERT(parser); + + if (ftb->state != UNINITIALIZED) + DBUG_RETURN(0); + if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0))) + DBUG_RETURN(1); + + ftb_param.ftb= ftb; + ftb_param.depth= 0; + ftb_param.ftbe= ftb->root; + ftb_param.up_quot= 0; + + param->mysql_parse= ftb_parse_query_internal; + param->mysql_add_word= ftb_query_add_word; + param->mysql_ftparam= (void *)&ftb_param; + param->cs= ftb->charset; + param->doc= (char*)query; + param->length= len; + param->flags= 0; + param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO; + DBUG_RETURN(parser->parse(param)); +} + + +static int _ftb_no_dupes_cmp(void* not_used __attribute__((unused)), + const void *a,const void *b) +{ + return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b))); +} + + +/* returns 1 if the search was finished (must-word wasn't found) */ + +static int _ft2_search_no_lock(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) +{ + int r; + int subkeys=1; + my_bool can_go_down; + MARIA_HA *info=ftb->info; + uint UNINIT_VAR(off), extra=HA_FT_WLEN+info->s->base.rec_reflength; + uchar *lastkey_buf= ftbw->word+ftbw->off; + MARIA_KEY key; + + if (ftbw->flags & FTB_FLAG_TRUNC) + lastkey_buf+=ftbw->len; + + if (init_search) + { + ftbw->key_root=info->s->state.key_root[ftb->keynr]; + ftbw->keyinfo=info->s->keyinfo+ftb->keynr; + info->last_key.keyinfo= key.keyinfo= ftbw->keyinfo; + info->lastinx= ~0; /* Safety */ + key.data= ftbw->word; + key.data_length= ftbw->len; + key.ref_length= 0; + key.flag= 0; + + r= _ma_search(info, &key, SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root); + } + else + { + uint sflag= SEARCH_BIGGER; + my_off_t max_docid=0; + FTB_EXPR *tmp; + + for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up) + set_if_bigger(max_docid, tmp->max_docid); + + if (ftbw->docid[0] < max_docid) + { + sflag|= SEARCH_SAME; + 
_ma_dpointer(info->s, (uchar*) (ftbw->word + ftbw->len + HA_FT_WLEN), + max_docid); + } + + info->last_key.keyinfo= key.keyinfo= ftbw->keyinfo; + info->lastinx= ~0; /* Safety */ + key.data= lastkey_buf; + key.data_length= USE_WHOLE_KEY; + key.ref_length= 0; + key.flag= 0; + + r= _ma_search(info, &key, sflag, ftbw->key_root); + } + + can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC))); + /* Skip rows inserted by concurrent insert */ + while (!r) + { + if (can_go_down) + { + /* going down ? */ + off= info->last_key.data_length + info->last_key.ref_length - extra; + subkeys=ft_sintXkorr(info->last_key.data + off); + } + if (subkeys<0 || info->cur_row.lastpos < info->state->data_file_length) + break; + r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, ftbw->key_root); + } + + if (!r && !ftbw->off) + { + r= ha_compare_word_or_prefix(ftb->charset, + info->last_key.data + 1, + info->last_key.data_length + + info->last_key.ref_length - + extra - 1, + (uchar*) ftbw->word + 1, + ftbw->len - 1, + (my_bool) (ftbw->flags & FTB_FLAG_TRUNC)); + } + + if (r) /* not found */ + { + if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC)) + { + ftbw->docid[0]=HA_OFFSET_ERROR; + if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0) + { + /* + This word MUST BE present in every document returned, + so we can stop the search right now + */ + ftb->state=INDEX_DONE; + return 1; /* search is done */ + } + else + return 0; + } + + /* going up to the first-level tree to continue search there */ + _ma_dpointer(info->s, (lastkey_buf+HA_FT_WLEN), ftbw->key_root); + ftbw->key_root=info->s->state.key_root[ftb->keynr]; + ftbw->keyinfo=info->s->keyinfo+ftb->keynr; + ftbw->off=0; + return _ft2_search_no_lock(ftb, ftbw, 0); + } + + /* matching key found */ + memcpy(lastkey_buf, info->last_key.data, + info->last_key.data_length + info->last_key.ref_length); + if (lastkey_buf == ftbw->word) + ftbw->len= info->last_key.data_length + info->last_key.ref_length - extra; + + /* going 
down ? */ + if (subkeys<0) + { + /* + yep, going down, to the second-level tree + TODO here: subkey-based optimization + */ + ftbw->off=off; + ftbw->key_root= info->cur_row.lastpos; + ftbw->keyinfo= info->last_key.keyinfo= & info->s->ft2_keyinfo; + r= _ma_search_first(info, ftbw->keyinfo, ftbw->key_root); + DBUG_ASSERT(r==0); /* found something */ + memcpy(lastkey_buf+off, info->last_key.data, + info->last_key.data_length + info->last_key.ref_length); + } + ftbw->docid[0]= info->cur_row.lastpos; + if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC)) + ftbw->max_docid_expr->max_docid= info->cur_row.lastpos; + return 0; +} + +static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search) +{ + int r; + MARIA_SHARE *share= ftb->info->s; + if (share->lock_key_trees) + mysql_rwlock_rdlock(&share->keyinfo[ftb->keynr].root_lock); + r= _ft2_search_no_lock(ftb, ftbw, init_search); + if (share->lock_key_trees) + mysql_rwlock_unlock(&share->keyinfo[ftb->keynr].root_lock); + return r; +} + + +static void _ftb_init_index_search(FT_INFO *ftb) +{ + int i; + FTB_WORD *ftbw; + + if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY) + return; + ftb->state=INDEX_SEARCH; + + for (i= queue_last_element(&ftb->queue); + (int) i >= (int) queue_first_element(&ftb->queue); + i--) + { + ftbw=(FTB_WORD *)(queue_element(&ftb->queue, i)); + + if (ftbw->flags & FTB_FLAG_TRUNC) + { + /* + special treatment for truncation operator + 1. there are some (besides this) +words + | no need to search in the index, it can never ADD new rows + | to the result, and to remove half-matched rows we do scan anyway + 2. -trunc* + | same as 1. + 3. in 1 and 2, +/- need not be on the same expr. level, + but can be on any upper level, as in +word +(trunc1* trunc2*) + 4. otherwise + | We have to index-search for this prefix. + | It may cause duplicates, as in the index (sorted by <word,docid>) + | <aaaa,row1> + | <aabb,row2> + | <aacc,row1> + | Searching for "aa*" will find row1 twice... 
+ */ + FTB_EXPR *ftbe; + for (ftbe=(FTB_EXPR*)ftbw; + ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC); + ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up) + { + if (ftbe->flags & FTB_FLAG_NO || /* 2 */ + ftbe->up->ythresh - ftbe->up->yweaks > + (uint) MY_TEST(ftbe->flags & FTB_FLAG_YES)) /* 1 */ + { + FTB_EXPR *top_ftbe=ftbe->up; + ftbw->docid[0]=HA_OFFSET_ERROR; + for (ftbe=(FTB_EXPR *)ftbw; + ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO); + ftbe=ftbe->up) + ftbe->up->yweaks++; + ftbe=0; + break; + } + } + if (!ftbe) + continue; + /* 4 */ + if (!is_tree_inited(& ftb->no_dupes)) + init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t), + _ftb_no_dupes_cmp,0,0,0); + else + reset_tree(& ftb->no_dupes); + } + + ftbw->off=0; /* in case of reinit */ + if (_ft2_search(ftb, ftbw, 1)) + return; + } + queue_fix(& ftb->queue); +} + + +FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr, + uchar *query, uint query_len, + CHARSET_INFO *cs) +{ + FTB *ftb; + FTB_EXPR *ftbe; + FTB_WORD *ftbw; + + if (!(ftb=(FTB *)my_malloc(PSI_INSTRUMENT_ME, sizeof(FTB), MYF(MY_WME)))) + return 0; + ftb->please= (struct _ft_vft *) & _ma_ft_vft_boolean; + ftb->state=UNINITIALIZED; + ftb->info=info; + ftb->keynr=keynr; + ftb->charset=cs; + DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset); + ftb->with_scan=0; + ftb->lastpos=HA_OFFSET_ERROR; + bzero(& ftb->no_dupes, sizeof(TREE)); + ftb->last_word= 0; + + init_alloc_root(PSI_INSTRUMENT_ME, &ftb->mem_root, 1024, 1024, 0); + ftb->queue.max_elements= 0; + if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR)))) + goto err; + ftbe->weight=1; + ftbe->flags=FTB_FLAG_YES; + ftbe->nos=1; + ftbe->up=0; + ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0; + ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR; + ftbe->phrase= NULL; + ftbe->document= 0; + ftb->root=ftbe; + if (unlikely(_ftb_parse_query(ftb, query, query_len, + keynr == NO_SUCH_KEY ? 
&ft_default_parser : + info->s->keyinfo[keynr].parser))) + goto err; + /* + Hack: instead of init_queue, we'll use reinit queue to be able + to alloc queue with alloc_root() + */ + if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root, + (ftb->queue.max_elements + 1) * + sizeof(void *)))) + goto err; + reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0, + (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0, 0, 0); + for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev) + queue_insert(&ftb->queue, (uchar *)ftbw); + ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root, + sizeof(FTB_WORD *)*ftb->queue.elements); + memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements); + my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *), + (qsort2_cmp)FTB_WORD_cmp_list, (void*) ftb->charset); + if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC; + ftb->state=READY; + return ftb; +err: + free_root(& ftb->mem_root, MYF(0)); + my_free(ftb); + return 0; +} + + +typedef struct st_my_ftb_phrase_param +{ + LIST *phrase; + LIST *document; + CHARSET_INFO *cs; + uint phrase_length; + uint document_length; + uint match; +} MY_FTB_PHRASE_PARAM; + + +static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param, + const char *word, int word_len, + MYSQL_FTPARSER_BOOLEAN_INFO + *boolean_info __attribute__((unused))) +{ + MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; + FT_WORD *w= (FT_WORD *)phrase_param->document->data; + LIST *phrase, *document; + w->pos= (uchar*)word; + w->len= word_len; + phrase_param->document= phrase_param->document->prev; + if (phrase_param->phrase_length > phrase_param->document_length) + { + phrase_param->document_length++; + return 0; + } + /* TODO: rewrite phrase search to avoid + comparing the same word twice. 
*/ + for (phrase= phrase_param->phrase, document= phrase_param->document->next; + phrase; phrase= phrase->next, document= document->next) + { + FT_WORD *phrase_word= (FT_WORD *)phrase->data; + FT_WORD *document_word= (FT_WORD *)document->data; + if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos, + phrase_word->len, + (uchar*) document_word->pos, document_word->len)) + return 0; + } + phrase_param->match++; + return 0; +} + + +static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param, + const char *document, int len) +{ + FT_WORD word; + MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam; + const uchar *docend= (uchar*)document + len; + while (maria_ft_simple_get_word(phrase_param->cs, (uchar**)&document, + docend, &word, FALSE)) + { + param->mysql_add_word(param, (char*)word.pos, word.len, 0); + if (phrase_param->match) + break; + } + return 0; +} + + +/* + Checks if given buffer matches phrase list. + + SYNOPSIS + _ftb_check_phrase() + s0 start of buffer + e0 end of buffer + phrase broken into list phrase + cs charset info + + RETURN VALUE + 1 is returned if phrase found, 0 else. + -1 is returned if error occurs. +*/ + +static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len, + FTB_EXPR *ftbe, struct st_mysql_ftparser *parser) +{ + MY_FTB_PHRASE_PARAM ftb_param; + MYSQL_FTPARSER_PARAM *param; + DBUG_ENTER("_ftb_check_phrase"); + DBUG_ASSERT(parser); + + if (! 
(param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 1))) + DBUG_RETURN(0); + ftb_param.phrase= ftbe->phrase; + ftb_param.document= ftbe->document; + ftb_param.cs= ftb->charset; + ftb_param.phrase_length= list_length(ftbe->phrase); + ftb_param.document_length= 1; + ftb_param.match= 0; + + param->mysql_parse= ftb_check_phrase_internal; + param->mysql_add_word= ftb_phrase_add_word; + param->mysql_ftparam= (void *)&ftb_param; + param->cs= ftb->charset; + param->doc= (char *)document; + param->length= len; + param->flags= 0; + param->mode= MYSQL_FTPARSER_WITH_STOPWORDS; + if (unlikely(parser->parse(param))) + return -1; + DBUG_RETURN(ftb_param.match ? 1 : 0); +} + + +static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig) +{ + FT_SEG_ITERATOR ftsi; + FTB_EXPR *ftbe; + float weight=ftbw->weight; + int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0); + my_off_t curdoc=ftbw->docid[mode]; + struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? + &ft_default_parser : + ftb->info->s->keyinfo[ftb->keynr].parser; + + for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up) + { + ythresh = ftbe->ythresh - (mode ? 
0 : ftbe->yweaks); + if (ftbe->docid[mode] != curdoc) + { + ftbe->cur_weight=0; + ftbe->yesses=ftbe->nos=0; + ftbe->docid[mode]=curdoc; + } + if (ftbe->nos) + break; + if (yn_flag & FTB_FLAG_YES) + { + weight /= ftbe->ythresh; + ftbe->cur_weight += weight; + if ((int) ++ftbe->yesses == ythresh) + { + yn_flag=ftbe->flags; + weight=ftbe->cur_weight*ftbe->weight; + if (mode && ftbe->phrase) + { + int found= 0; + + memcpy(&ftsi, ftsi_orig, sizeof(ftsi)); + while (_ma_ft_segiterator(&ftsi) && !found) + { + if (!ftsi.pos) + continue; + found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser); + if (unlikely(found < 0)) + return 1; + } + if (!found) + break; + } /* ftbe->quot */ + } + else + break; + } + else + if (yn_flag & FTB_FLAG_NO) + { + /* + NOTE: special sort function of queue assures that all + (yn_flag & FTB_FLAG_NO) != 0 + events for every particular subexpression will + "auto-magically" happen BEFORE all the + (yn_flag & FTB_FLAG_YES) != 0 events. So no + already matched expression can become not-matched again. + */ + ++ftbe->nos; + break; + } + else + { + if (ftbe->ythresh) + weight/=3; + ftbe->cur_weight += weight; + if ((int) ftbe->yesses < ythresh) + break; + if (!(yn_flag & FTB_FLAG_WONLY)) + yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ; + weight*= ftbe->weight; + } + } + return 0; +} + + +int maria_ft_boolean_read_next(FT_INFO *ftb, char *record) +{ + FTB_EXPR *ftbe; + FTB_WORD *ftbw; + MARIA_HA *info=ftb->info; + my_off_t curdoc; + + if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE) + return -1; + + /* black magic ON */ + if ((int) _ma_check_index(info, ftb->keynr) < 0) + return my_errno; + if (_ma_readinfo(info, F_RDLCK, 1)) + return my_errno; + /* black magic OFF */ + + if (!ftb->queue.elements) + return my_errno=HA_ERR_END_OF_FILE; + + /* Attention!!! Address of a local variable is used here! 
See err: label */ + ftb->queue.first_cmp_arg=(void *)&curdoc; + + while (ftb->state == INDEX_SEARCH && + (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) != + HA_OFFSET_ERROR) + { + while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0]) + { + if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0))) + { + my_errno= HA_ERR_OUT_OF_MEM; + goto err; + } + + /* update queue */ + _ft2_search(ftb, ftbw, 0); + queue_replace_top(&ftb->queue); + } + + ftbe=ftb->root; + if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 && + ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos) + { + /* curdoc matched ! */ + if (is_tree_inited(&ftb->no_dupes) && + tree_insert(&ftb->no_dupes, &curdoc, 0, + ftb->no_dupes.custom_arg)->count >1) + /* but it managed already to get past this line once */ + continue; + + info->cur_row.lastpos= curdoc; + /* Clear all states, except that the table was updated */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + if (!(*info->read_record)(info, (uchar *) record, curdoc)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + if (ftb->with_scan && + maria_ft_boolean_find_relevance(ftb, (uchar*)record, 0)==0) + continue; /* no match */ + my_errno=0; + goto err; + } + goto err; + } + } + ftb->state=INDEX_DONE; + my_errno=HA_ERR_END_OF_FILE; +err: + ftb->queue.first_cmp_arg=(void *)0; + return my_errno; +} + + +typedef struct st_my_ftb_find_param +{ + FT_INFO *ftb; + FT_SEG_ITERATOR *ftsi; +} MY_FTB_FIND_PARAM; + + +static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param, + const char *word, int len, + MYSQL_FTPARSER_BOOLEAN_INFO + *boolean_info __attribute__((unused))) +{ + MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; + FT_INFO *ftb= ftb_param->ftb; + FTB_WORD *ftbw; + int a, b, c; + /* + Find right-most element in the array of query words matching this + word from a document. 
+ */ + for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2) + { + ftbw= ftb->list[c]; + if (ha_compare_word_or_prefix(ftb->charset, (uchar*) word, len, + (uchar*) ftbw->word + 1, ftbw->len - 1, + (my_bool) (ftbw->flags & FTB_FLAG_TRUNC)) < 0) + b= c; + else + a= c; + } + /* + If there were no words with truncation operator, we iterate to the + beginning of an array until array element is equal to the word from + a document. This is done mainly because the same word may be + mentioned twice (or more) in the query. + + In case query has words with truncation operator we must iterate + to the beginning of the array. There may be non-matching query words + between matching word with truncation operator and the right-most + matching element. E.g., if we're looking for 'aaa15' in an array of + 'aaa1* aaa14 aaa15 aaa16'. + + Worse of that there still may be match even if the binary search + above didn't find matching element. E.g., if we're looking for + 'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will + stop at 'aaa16'. 
+ */ + for (; c >= 0; c--) + { + ftbw= ftb->list[c]; + if (ha_compare_word_or_prefix(ftb->charset, (uchar*) word, len, + (uchar*)ftbw->word + 1, ftbw->len - 1, + (my_bool) (ftbw->flags & FTB_FLAG_TRUNC))) + { + if (ftb->with_scan & FTB_FLAG_TRUNC) + continue; + else + break; + } + if (ftbw->docid[1] == ftb->info->cur_row.lastpos) + continue; + ftbw->docid[1]= ftb->info->cur_row.lastpos; + if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi))) + return 1; + } + return(0); +} + + +static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param, + const char *doc, int len) +{ + MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam; + FT_INFO *ftb= ftb_param->ftb; + uchar *end= (uchar*) doc + len; + FT_WORD w; + while (maria_ft_simple_get_word(ftb->charset, (uchar**)&doc, end, &w, TRUE)) + param->mysql_add_word(param, (char*)w.pos, w.len, 0); + return(0); +} + + +float maria_ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length) +{ + FTB_EXPR *ftbe; + FT_SEG_ITERATOR ftsi, ftsi2; + MARIA_RECORD_POS docid= ftb->info->cur_row.lastpos; + MY_FTB_FIND_PARAM ftb_param; + MYSQL_FTPARSER_PARAM *param; + struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ? + &ft_default_parser : + ftb->info->s->keyinfo[ftb->keynr].parser; + + if (docid == HA_OFFSET_ERROR) + return -2.0; + if (!ftb->queue.elements) + return 0; + if (! 
(param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0))) + return 0; + + if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos) + { + FTB_EXPR *x; + uint i; + + for (i=0; i < ftb->queue.elements; i++) + { + ftb->list[i]->docid[1]=HA_OFFSET_ERROR; + for (x=ftb->list[i]->up; x; x=x->up) + x->docid[1]=HA_OFFSET_ERROR; + } + } + + ftb->lastpos=docid; + + if (ftb->keynr==NO_SUCH_KEY) + _ma_ft_segiterator_dummy_init(record, length, &ftsi); + else + _ma_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi); + memcpy(&ftsi2, &ftsi, sizeof(ftsi)); + + ftb_param.ftb= ftb; + ftb_param.ftsi= &ftsi2; + param->mysql_parse= ftb_find_relevance_parse; + param->mysql_add_word= ftb_find_relevance_add_word; + param->mysql_ftparam= (void *)&ftb_param; + param->flags= 0; + param->cs= ftb->charset; + param->mode= MYSQL_FTPARSER_SIMPLE_MODE; + + while (_ma_ft_segiterator(&ftsi)) + { + if (!ftsi.pos) + continue; + param->doc= (char *)ftsi.pos; + param->length= ftsi.len; + if (unlikely(parser->parse(param))) + return 0; + } + ftbe=ftb->root; + if (ftbe->docid[1]==docid && ftbe->cur_weight>0 && + ftbe->yesses>=ftbe->ythresh && !ftbe->nos) + { /* row matched ! */ + return ftbe->cur_weight; + } + else + { /* match failed ! 
*/ + return 0.0; + } +} + + +void maria_ft_boolean_close_search(FT_INFO *ftb) +{ + if (is_tree_inited(& ftb->no_dupes)) + { + delete_tree(&ftb->no_dupes, 0); + } + free_root(& ftb->mem_root, MYF(0)); + my_free(ftb); +} + + +float maria_ft_boolean_get_relevance(FT_INFO *ftb) +{ + return ftb->root->cur_weight; +} + + +void maria_ft_boolean_reinit_search(FT_INFO *ftb) +{ + _ftb_init_index_search(ftb); +} diff --git a/storage/maria/ma_ft_eval.c b/storage/maria/ma_ft_eval.c new file mode 100644 index 00000000..22b19b99 --- /dev/null +++ b/storage/maria/ma_ft_eval.c @@ -0,0 +1,254 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code + added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ + +#include "ma_ftdefs.h" +#include "maria_ft_eval.h" +#include <stdarg.h> +#include <my_getopt.h> + +static void print_error(int exit_code, const char *fmt,...); +static void get_options(int argc, char *argv[]); +static int create_record(char *pos, FILE *file); +static void usage(); + +static struct my_option my_long_options[] = +{ + {"", 's', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'q', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '#', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +int main(int argc, char *argv[]) +{ + MARIA_HA *file; + int i,j; + + MY_INIT(argv[0]); + get_options(argc,argv); + bzero((char*)recinfo,sizeof(recinfo)); + + maria_init(); + /* First define 2 columns */ + recinfo[0].type=FIELD_SKIP_ENDSPACE; + recinfo[0].length=docid_length; + recinfo[1].type=FIELD_BLOB; + recinfo[1].length= 4+portable_sizeof_char_ptr; + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].seg[0].type= HA_KEYTYPE_TEXT; + keyinfo[0].seg[0].flag= HA_BLOB_PART; + keyinfo[0].seg[0].start=recinfo[0].length; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit=0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].bit_start=4; + keyinfo[0].seg[0].language=MY_CHARSET_CURRENT; + keyinfo[0].flag = HA_FULLTEXT; + + if (!silent) + printf("- Creating isam-file\n"); + if (maria_create(filename,1,keyinfo,2,recinfo,0,NULL,(MARIA_CREATE_INFO*) 0,0)) + goto err; + if 
(!(file=maria_open(filename,2,0))) + goto err; + if (!silent) + printf("Initializing stopwords\n"); + maria_ft_init_stopwords(stopwordlist); + + if (!silent) + printf("- Writing key:s\n"); + + my_errno=0; + i=0; + while (create_record(record,df)) + { + error=maria_write(file,record); + if (error) + printf("I= %2d maria_write: %d errno: %d\n",i,error,my_errno); + i++; + } + fclose(df); + + if (maria_close(file)) goto err; + if (!silent) + printf("- Reopening file\n"); + if (!(file=maria_open(filename,2,0))) goto err; + if (!silent) + printf("- Reading rows with key\n"); + for (i=1;create_record(record,qf);i++) + { + FT_DOCLIST *result; + double w; + int t, err; + + result=maria_ft_nlq_init_search(file,0,blob_record,(uint) strlen(blob_record),1); + if (!result) + { + printf("Query %d failed with errno %3d\n",i,my_errno); + goto err; + } + if (!silent) + printf("Query %d. Found: %d.\n",i,result->ndocs); + for (j=0;(err=maria_ft_nlq_read_next(result, read_record))==0;j++) + { + t=uint2korr(read_record); + w=maria_ft_nlq_get_relevance(result); + printf("%d %.*s %f\n",i,t,read_record+2,w); + } + if (err != HA_ERR_END_OF_FILE) + { + printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno); + goto err; + } + maria_ft_nlq_close_search(result); + } + + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return (0); + + err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ + +} + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument) +{ + switch (optid) { + case 's': + if (stopwordlist && stopwordlist != maria_ft_precompiled_stopwords) + break; + { + FILE *f; char s[HA_FT_MAXLEN]; int i=0,n=SWL_INIT; + + if (!(stopwordlist=(const char**) malloc(n*sizeof(char *)))) + print_error(1,"malloc(%d)",n*sizeof(char *)); + if (!(f=fopen(argument,"r"))) + print_error(1,"fopen(%s)",argument); + while (!feof(f)) + { + if 
(!(fgets(s,HA_FT_MAXLEN,f))) + print_error(1,"fgets(s,%d,%s)",HA_FT_MAXLEN,argument); + if (!(stopwordlist[i++]=strdup(s))) + print_error(1,"strdup(%s)",s); + if (i >= n) + { + n+=SWL_PLUS; + if (!(stopwordlist=(const char**) realloc((char*) stopwordlist, + n*sizeof(char *)))) + print_error(1,"realloc(%d)",n*sizeof(char *)); + } + } + fclose(f); + stopwordlist[i]=NULL; + break; + } + case 'q': silent=1; break; + case 'S': if (stopwordlist==maria_ft_precompiled_stopwords) stopwordlist=NULL; break; + case '#': + DBUG_PUSH (argument); + break; + case 'V': + case '?': + case 'h': + usage(); + exit(1); + } + return 0; +} + + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + + if (!(d_file=argv[optind])) print_error(1,"No d_file"); + if (!(df=fopen(d_file,"r"))) + print_error(1,"fopen(%s)",d_file); + if (!(q_file=argv[optind+1])) print_error(1,"No q_file"); + if (!(qf=fopen(q_file,"r"))) + print_error(1,"fopen(%s)",q_file); + return; +} /* get options */ + + +static int create_record(char *pos, FILE *file) +{ + uint tmp; char *ptr; + + bzero((char *)pos,MAX_REC_LENGTH); + + /* column 1 - VARCHAR */ + if (!(fgets(pos+2,MAX_REC_LENGTH-32,file))) + { + if (feof(file)) + return 0; + else + print_error(1,"fgets(docid) - 1"); + } + tmp=(uint) strlen(pos+2)-1; + int2store(pos,tmp); + pos+=recinfo[0].length; + + /* column 2 - BLOB */ + + if (!(fgets(blob_record,MAX_BLOB_LENGTH,file))) + print_error(1,"fgets(docid) - 2"); + tmp=(uint) strlen(blob_record); + int4store(pos,tmp); + ptr=blob_record; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + return 1; +} + +/* VARARGS */ + +static void print_error(int exit_code, const char *fmt,...) 
+{ + va_list args; + + va_start(args,fmt); + fprintf(stderr,"%s: error: ",my_progname); + VOID(vfprintf(stderr, fmt, args)); + VOID(fputc('\n',stderr)); + fflush(stderr); + va_end(args); + exit(exit_code); +} + + +static void usage() +{ + printf("%s [options]\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/ma_ft_eval.h b/storage/maria/ma_ft_eval.h new file mode 100644 index 00000000..46017134 --- /dev/null +++ b/storage/maria/ma_ft_eval.h @@ -0,0 +1,41 @@ +/* Copyright (C) 2006 MySQL AB & Sergei A. Golubchik + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +const char **stopwordlist=maria_ft_precompiled_stopwords; + +#define MAX_REC_LENGTH 128 +#define MAX_BLOB_LENGTH 60000 +char record[MAX_REC_LENGTH], read_record[MAX_REC_LENGTH+MAX_BLOB_LENGTH]; +char blob_record[MAX_BLOB_LENGTH+20*20]; + +char *filename= (char*) "EVAL"; + +int silent=0, error=0; + +uint key_length=MAX_BLOB_LENGTH,docid_length=32; +char *d_file, *q_file; +FILE *df,*qf; + +MARIA_COLUMNDEF recinfo[3]; +MARIA_KEYDEF keyinfo[2]; +HA_KEYSEG keyseg[10]; + +#define SWL_INIT 500 +#define SWL_PLUS 50 + +#define MAX_LINE_LENGTH 128 +char line[MAX_LINE_LENGTH]; diff --git a/storage/maria/ma_ft_nlq_search.c b/storage/maria/ma_ft_nlq_search.c new file mode 100644 index 00000000..890de3db --- /dev/null +++ b/storage/maria/ma_ft_nlq_search.c @@ -0,0 +1,388 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +#define FT_CORE +#include "ma_ftdefs.h" + +/* search with natural language queries */ + +typedef struct ft_doc_rec +{ + my_off_t dpos; + double weight; +} FT_DOC; + +struct st_ft_info +{ + struct _ft_vft *please; + MARIA_HA *info; + int ndocs; + int curdoc; + FT_DOC doc[1]; +}; + +typedef struct st_all_in_one +{ + MARIA_HA *info; + uint keynr; + CHARSET_INFO *charset; + uchar *keybuff; + TREE dtree; +} ALL_IN_ONE; + +typedef struct st_ft_superdoc +{ + FT_DOC doc; + FT_WORD *word_ptr; + double tmp_weight; +} FT_SUPERDOC; + + +static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)), + FT_SUPERDOC *p1, FT_SUPERDOC *p2) +{ + if (p1->doc.dpos < p2->doc.dpos) + return -1; + if (p1->doc.dpos == p2->doc.dpos) + return 0; + return 1; +} + +static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio) +{ + FT_WEIGTH subkeys; + int r; + uint doc_cnt; + FT_SUPERDOC sdoc, *sptr; + TREE_ELEMENT *selem; + double gweight=1; + MARIA_HA *info= aio->info; + MARIA_SHARE *share= info->s; + uchar *keybuff= aio->keybuff; + MARIA_KEYDEF *keyinfo= share->keyinfo+aio->keynr; + my_off_t key_root; + uint extra=HA_FT_WLEN+share->rec_reflength; + MARIA_KEY key; + float tmp_weight; + DBUG_ENTER("walk_and_match"); + + word->weight=LWS_FOR_QUERY; + + _ma_ft_make_key(info, &key, aio->keynr, keybuff, word, 0); + key.data_length-= HA_FT_WLEN; + doc_cnt=0; + subkeys.i= 0; + + if (share->lock_key_trees) + mysql_rwlock_rdlock(&share->keyinfo[aio->keynr].root_lock); + + key_root= share->state.key_root[aio->keynr]; + + /* Skip rows inserted by current inserted */ + for (r= _ma_search(info, &key, SEARCH_FIND, key_root) ; + !r && + (subkeys.i= ft_sintXkorr(info->last_key.data + + info->last_key.data_length + + info->last_key.ref_length - extra)) > 0 && + info->cur_row.lastpos >= info->state->data_file_length ; + r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root)) + ; + + if (share->lock_key_trees) + 
mysql_rwlock_unlock(&share->keyinfo[aio->keynr].root_lock); + + info->update|= HA_STATE_AKTIV; /* for _ma_test_if_changed() */ + + /* The following should be safe, even if we compare doubles */ + while (!r && gweight) + { + if (key.data_length && + ha_compare_word(aio->charset, + info->last_key.data + 1, + info->last_key.data_length + + info->last_key.ref_length - extra - 1, + key.data + 1, key.data_length - 1)) + break; + + if (subkeys.i < 0) + { + if (doc_cnt) + DBUG_RETURN(1); /* index is corrupted */ + /* + TODO here: unsafe optimization, should this word + be skipped (based on subkeys) ? + */ + keybuff+= key.data_length; + keyinfo= &share->ft2_keyinfo; + key_root= info->cur_row.lastpos; + key.data_length= 0; + if (share->lock_key_trees) + mysql_rwlock_rdlock(&share->keyinfo[aio->keynr].root_lock); + r= _ma_search_first(info, keyinfo, key_root); + goto do_skip; + } + /* The weight we read was actually a float */ + tmp_weight= subkeys.f; + /* The following should be safe, even if we compare doubles */ + if (tmp_weight==0) + DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */ + + sdoc.doc.dpos= info->cur_row.lastpos; + + /* saving document matched into dtree */ + if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg))) + DBUG_RETURN(1); + + sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem); + + if (selem->count==1) /* document's first match */ + sptr->doc.weight=0; + else + sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight; + + sptr->word_ptr=word; + sptr->tmp_weight=tmp_weight; + + doc_cnt++; + + gweight=word->weight*GWS_IN_USE; + if (gweight < 0 || doc_cnt > 2000000) + gweight=0; + + if (share->lock_key_trees) + mysql_rwlock_rdlock(&share->keyinfo[aio->keynr].root_lock); + + if (_ma_test_if_changed(info) == 0) + r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root); + else + r= _ma_search(info, &info->last_key, SEARCH_BIGGER, key_root); +do_skip: + while ((subkeys.i= ft_sintXkorr(info->last_key.data + + 
info->last_key.data_length + + info->last_key.ref_length - extra)) > 0 && + !r && info->cur_row.lastpos >= info->state->data_file_length) + r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root); + + if (share->lock_key_trees) + mysql_rwlock_unlock(&share->keyinfo[aio->keynr].root_lock); + } + word->weight=gweight; + + DBUG_RETURN(0); +} + + +static int walk_and_copy(FT_SUPERDOC *from, + uint32 count __attribute__((unused)), FT_DOC **to) +{ + DBUG_ENTER("walk_and_copy"); + from->doc.weight+=from->tmp_weight*from->word_ptr->weight; + (*to)->dpos=from->doc.dpos; + (*to)->weight=from->doc.weight; + (*to)++; + DBUG_RETURN(0); +} + +static int walk_and_push(FT_SUPERDOC *from, + uint32 count __attribute__((unused)), QUEUE *best) +{ + DBUG_ENTER("walk_and_copy"); + from->doc.weight+=from->tmp_weight*from->word_ptr->weight; + set_if_smaller(best->elements, ft_query_expansion_limit-1); + queue_insert(best, (uchar *)& from->doc); + DBUG_RETURN(0); +} + + +static int FT_DOC_cmp(void *unused __attribute__((unused)), + FT_DOC *a, FT_DOC *b) +{ + return CMP_NUM(b->weight, a->weight); +} + + +FT_INFO *maria_ft_init_nlq_search(MARIA_HA *info, uint keynr, uchar *query, + uint query_len, uint flags, uchar *record) +{ + TREE wtree; + ALL_IN_ONE aio; + FT_DOC *dptr; + FT_INFO *dlist=NULL; + MARIA_RECORD_POS saved_lastpos= info->cur_row.lastpos; + struct st_mysql_ftparser *parser; + MYSQL_FTPARSER_PARAM *ftparser_param; + DBUG_ENTER("maria_ft_init_nlq_search"); + + /* black magic ON */ + if ((int) (keynr = _ma_check_index(info,keynr)) < 0) + DBUG_RETURN(NULL); + if (_ma_readinfo(info,F_RDLCK,1)) + DBUG_RETURN(NULL); + /* black magic OFF */ + + aio.info=info; + aio.keynr=keynr; + aio.charset=info->s->keyinfo[keynr].seg->charset; + aio.keybuff= info->lastkey_buff2; + parser= info->s->keyinfo[keynr].parser; + if (! 
(ftparser_param= maria_ftparser_call_initializer(info, keynr, 0))) + goto err; + + bzero(&wtree,sizeof(wtree)); + + init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp, + NULL, NULL, MYF(0)); + + maria_ft_parse_init(&wtree, aio.charset); + ftparser_param->flags= 0; + if (maria_ft_parse(&wtree, query, query_len, parser, ftparser_param, + &wtree.mem_root)) + goto err; + + if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio, + left_root_right)) + goto err; + + if (flags & FT_EXPAND && ft_query_expansion_limit) + { + QUEUE best; + init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp, + 0, 0, 0); + tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push, + &best, left_root_right); + while (best.elements) + { + my_off_t docid= ((FT_DOC *)queue_remove_top(&best))->dpos; + if (!(*info->read_record)(info, record, docid)) + { + info->update|= HA_STATE_AKTIV; + ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY; + if (unlikely(_ma_ft_parse(&wtree, info, keynr, record, ftparser_param, + &wtree.mem_root))) + { + delete_queue(&best); + goto err; + } + } + } + delete_queue(&best); + reset_tree(&aio.dtree); + if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio, + left_root_right)) + goto err; + + } + + /* + If ndocs == 0, this will not allocate RAM for FT_INFO.doc[], + so if ndocs == 0, FT_INFO.doc[] must not be accessed. 
+ */ + dlist=(FT_INFO *)my_malloc(PSI_INSTRUMENT_ME, sizeof(FT_INFO)+ + sizeof(FT_DOC)* + (int)(aio.dtree.elements_in_tree-1), + MYF(0)); + if (!dlist) + goto err; + + dlist->please= (struct _ft_vft *) & _ma_ft_vft_nlq; + dlist->ndocs=aio.dtree.elements_in_tree; + dlist->curdoc=-1; + dlist->info=aio.info; + dptr=dlist->doc; + + tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy, + &dptr, left_root_right); + + if (flags & FT_SORTED) + my_qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC), + (qsort2_cmp)&FT_DOC_cmp, 0); + +err: + delete_tree(&aio.dtree, 0); + delete_tree(&wtree, 0); + info->cur_row.lastpos= saved_lastpos; + DBUG_RETURN(dlist); +} + + +int maria_ft_nlq_read_next(FT_INFO *handler, char *record) +{ + MARIA_HA *info= (MARIA_HA *) handler->info; + + if (++handler->curdoc >= handler->ndocs) + { + --handler->curdoc; + return HA_ERR_END_OF_FILE; + } + + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + info->cur_row.lastpos= handler->doc[handler->curdoc].dpos; + if (!(*info->read_record)(info, (uchar *) record, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + return 0; + } + return my_errno; +} + + +float maria_ft_nlq_find_relevance(FT_INFO *handler, + uchar *record __attribute__((unused)), + uint length __attribute__((unused))) +{ + int a,b,c; + FT_DOC *docs=handler->doc; + MARIA_RECORD_POS docid= handler->info->cur_row.lastpos; + + if (docid == HA_POS_ERROR) + return -5.0; + + /* Assuming docs[] is sorted by dpos... 
*/ + + for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2) + { + if (docs[c].dpos > docid) + b=c; + else + a=c; + } + /* bounds check to avoid accessing unallocated handler->doc */ + if (a < handler->ndocs && docs[a].dpos == docid) + return (float) docs[a].weight; + else + return 0.0; +} + + +void maria_ft_nlq_close_search(FT_INFO *handler) +{ + my_free(handler); +} + + +float maria_ft_nlq_get_relevance(FT_INFO *handler) +{ + return (float) handler->doc[handler->curdoc].weight; +} + + +void maria_ft_nlq_reinit_search(FT_INFO *handler) +{ + handler->curdoc=-1; +} + diff --git a/storage/maria/ma_ft_parser.c b/storage/maria/ma_ft_parser.c new file mode 100644 index 00000000..f600873d --- /dev/null +++ b/storage/maria/ma_ft_parser.c @@ -0,0 +1,398 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2020, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +#include "ma_ftdefs.h" + +typedef struct st_maria_ft_docstat { + FT_WORD *list; + uint uniq; + double sum; +} FT_DOCSTAT; + + +typedef struct st_my_maria_ft_parser_param +{ + TREE *wtree; + MEM_ROOT *mem_root; +} MY_FT_PARSER_PARAM; + + +static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2) +{ + return ha_compare_word(cs, (uchar*) w1->pos, w1->len, + (uchar*) w2->pos, w2->len); +} + +static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat) +{ + word->weight=LWS_IN_USE; + docstat->sum+=word->weight; + memcpy((docstat->list)++, word, sizeof(FT_WORD)); + return 0; +} + +/* transforms tree of words into the array, applying normalization */ + +FT_WORD * maria_ft_linearize(TREE *wtree, MEM_ROOT *mem_root) +{ + FT_WORD *wlist,*p; + FT_DOCSTAT docstat; + DBUG_ENTER("maria_ft_linearize"); + + if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)* + (1+wtree->elements_in_tree)))) + { + docstat.list=wlist; + docstat.uniq=wtree->elements_in_tree; + docstat.sum=0; + tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right); + } + delete_tree(wtree, 0); + if (!wlist) + DBUG_RETURN(NULL); + + docstat.list->pos=NULL; + + for (p=wlist;p->pos;p++) + { + p->weight=PRENORM_IN_USE; + } + + for (p=wlist;p->pos;p++) + { + p->weight/=NORM_IN_USE; + } + + DBUG_RETURN(wlist); +} + +/* + RETURN VALUE + 0 - eof + 1 - word found + 2 - left bracket + 3 - right bracket + 4 - stopword found +*/ +uchar maria_ft_get_word(CHARSET_INFO *cs, const uchar **start, + const uchar *end, + FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param) +{ + const uchar *doc= *start; + int ctype; + uint mwc, length; + int mbl; + + param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); + param->weight_adjust= param->wasign= 0; + param->type= FT_TOKEN_EOF; + + while (doc<end) + { + for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? 
-mbl : 1))) + { + mbl= my_ci_ctype(cs, &ctype, doc, end); + if (true_word_char(ctype, *doc)) + break; + if (*doc == FTB_RQUOT && param->quot) + { + param->quot= (char *) doc; + *start=doc+1; + param->type= FT_TOKEN_RIGHT_PAREN; + goto ret; + } + if (!param->quot) + { + if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT) + { + /* param->prev=' '; */ + *start=doc+1; + if (*doc == FTB_LQUOT) + param->quot= (char *) *start; + param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN); + goto ret; + } + if (param->prev == ' ') + { + if (*doc == FTB_YES ) { param->yesno=+1; continue; } else + if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else + if (*doc == FTB_NO ) { param->yesno=-1; continue; } else + if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else + if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else + if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; } + } + } + param->prev=*doc; + param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0); + param->weight_adjust= param->wasign= 0; + } + + mwc=length=0; + for (word->pos= doc; doc < end; length++, + doc+= (mbl > 0 ? mbl : (mbl < 0 ? 
-mbl : 1))) + { + mbl= my_ci_ctype(cs, &ctype, doc, end); + if (true_word_char(ctype, *doc)) + mwc=0; + else if (!misc_word_char(*doc) || mwc) + break; + else + mwc++; + } + param->prev='A'; /* be sure *prev is true_word_char */ + word->len= (uint)(doc-word->pos) - mwc; + if ((param->trunc=(doc<end && *doc == FTB_TRUNC))) + doc++; + + if (((length >= ft_min_word_len && !is_stopword((char *) word->pos, + word->len)) + || param->trunc) && length < ft_max_word_len) + { + *start=doc; + param->type= FT_TOKEN_WORD; + goto ret; + } + else if (length) /* make sure length > 0 (if start contains spaces only) */ + { + *start= doc; + param->type= FT_TOKEN_STOPWORD; + goto ret; + } + } + if (param->quot) + { + param->quot= (char *)(*start= doc); + param->type= 3; /* FT_RBR */ + goto ret; + } +ret: + return param->type; +} + +uchar maria_ft_simple_get_word(CHARSET_INFO *cs, uchar **start, + const uchar *end, FT_WORD *word, + my_bool skip_stopwords) +{ + uchar *doc= *start; + uint mwc, length; + int ctype, mbl; + DBUG_ENTER("maria_ft_simple_get_word"); + + do + { + for (;; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) + { + if (doc >= end) + DBUG_RETURN(0); + mbl= my_ci_ctype(cs, &ctype, doc, end); + if (true_word_char(ctype, *doc)) + break; + } + + mwc= length= 0; + for (word->pos= doc; doc < end; length++, + doc+= (mbl > 0 ? mbl : (mbl < 0 ? 
-mbl : 1))) + { + mbl= my_ci_ctype(cs, &ctype, doc, end); + if (true_word_char(ctype, *doc)) + mwc= 0; + else if (!misc_word_char(*doc) || mwc) + break; + else + mwc++; + } + + word->len= (uint)(doc-word->pos) - mwc; + + if (skip_stopwords == FALSE || + (length >= ft_min_word_len && length < ft_max_word_len && + !is_stopword((char *) word->pos, word->len))) + { + *start= doc; + DBUG_RETURN(1); + } + } while (doc < end); + DBUG_RETURN(0); +} + +void maria_ft_parse_init(TREE *wtree, CHARSET_INFO *cs) +{ + DBUG_ENTER("maria_ft_parse_init"); + if (!is_tree_inited(wtree)) + init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp, NULL, + (void*) cs, MYF(0)); + DBUG_VOID_RETURN; +} + + +static int maria_ft_add_word(MYSQL_FTPARSER_PARAM *param, + const char *word, int word_len, + MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info + __attribute__((unused))) +{ + TREE *wtree; + FT_WORD w; + MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; + DBUG_ENTER("maria_ft_add_word"); + wtree= ft_param->wtree; + if (param->flags & MYSQL_FTFLAGS_NEED_COPY) + { + uchar *ptr; + DBUG_ASSERT(wtree->with_delete == 0); + ptr= (uchar *)alloc_root(ft_param->mem_root, word_len); + memcpy(ptr, word, word_len); + w.pos= ptr; + } + else + w.pos= (uchar*) word; + w.len= word_len; + if (!tree_insert(wtree, &w, 0, wtree->custom_arg)) + { + delete_tree(wtree, 0); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +static int maria_ft_parse_internal(MYSQL_FTPARSER_PARAM *param, + const char *doc_arg, + int doc_len) +{ + uchar *doc= (uchar*) doc_arg; + uchar *end= doc + doc_len; + MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam; + TREE *wtree= ft_param->wtree; + FT_WORD w; + DBUG_ENTER("maria_ft_parse_internal"); + + while (maria_ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE)) + if (param->mysql_add_word(param, (char*)w.pos, w.len, 0)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +int maria_ft_parse(TREE *wtree, uchar *doc, int doclen, + struct st_mysql_ftparser *parser, + MYSQL_FTPARSER_PARAM 
*param, MEM_ROOT *mem_root) +{ + MY_FT_PARSER_PARAM my_param; + DBUG_ENTER("maria_ft_parse"); + DBUG_ASSERT(parser); + my_param.wtree= wtree; + my_param.mem_root= mem_root; + + param->mysql_parse= maria_ft_parse_internal; + param->mysql_add_word= maria_ft_add_word; + param->mysql_ftparam= &my_param; + param->cs= wtree->custom_arg; + param->doc= (char*)doc; + param->length= doclen; + param->mode= MYSQL_FTPARSER_SIMPLE_MODE; + DBUG_RETURN(parser->parse(param)); +} + + +#define MAX_PARAM_NR 2 + +MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info) +{ + if (!info->ftparser_param) + { + /* +. info->ftparser_param can not be zero after the initialization, + because it always includes built-in fulltext parser. And built-in + parser can be called even if the table has no fulltext indexes and + no varchar/text fields. + + ftb_find_relevance... parser (ftb_find_relevance_parse, + ftb_find_relevance_add_word) calls ftb_check_phrase... parser + (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2. + */ + info->ftparser_param= (MYSQL_FTPARSER_PARAM *) + my_malloc(PSI_INSTRUMENT_ME, MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) * + info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL)); + init_alloc_root(PSI_INSTRUMENT_ME, &info->ft_memroot, + FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(0)); + } + return info->ftparser_param; +} + + +MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info, + uint keynr, uint paramnr) +{ + uint32 ftparser_nr; + struct st_mysql_ftparser *parser; + + if (!maria_ftparser_alloc_param(info)) + return 0; + + if (keynr == NO_SUCH_KEY) + { + ftparser_nr= 0; + parser= &ft_default_parser; + } + else + { + ftparser_nr= info->s->keyinfo[keynr].ftkey_nr; + parser= info->s->keyinfo[keynr].parser; + } + DBUG_ASSERT(paramnr < MAX_PARAM_NR); + ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr; + if (! 
info->ftparser_param[ftparser_nr].mysql_add_word) + { + /* Note, that mysql_add_word is used here as a flag: + mysql_add_word == 0 - parser is not initialized + mysql_add_word != 0 - parser is initialized, or no + initialization needed. */ + info->ftparser_param[ftparser_nr].mysql_add_word= + (int (*)(struct st_mysql_ftparser_param *, const char *, + int, MYSQL_FTPARSER_BOOLEAN_INFO *)) 1; + if (parser->init && parser->init(&info->ftparser_param[ftparser_nr])) + return 0; + } + return &info->ftparser_param[ftparser_nr]; +} + + +void maria_ftparser_call_deinitializer(MARIA_HA *info) +{ + uint i, j, keys= info->s->state.header.keys; + free_root(&info->ft_memroot, MYF(0)); + if (! info->ftparser_param) + return; + for (i= 0; i < keys; i++) + { + MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i]; + for (j=0; j < MAX_PARAM_NR; j++) + { + MYSQL_FTPARSER_PARAM *ftparser_param= + &info->ftparser_param[keyinfo->ftkey_nr*MAX_PARAM_NR + j]; + if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word) + { + if (keyinfo->parser->deinit) + keyinfo->parser->deinit(ftparser_param); + ftparser_param->mysql_add_word= 0; + } + else + break; + } + } +} diff --git a/storage/maria/ma_ft_stem.c b/storage/maria/ma_ft_stem.c new file mode 100644 index 00000000..9f3d2858 --- /dev/null +++ b/storage/maria/ma_ft_stem.c @@ -0,0 +1,18 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* multilingual stem */ diff --git a/storage/maria/ma_ft_test1.c b/storage/maria/ma_ft_test1.c new file mode 100644 index 00000000..f1b1b53e --- /dev/null +++ b/storage/maria/ma_ft_test1.c @@ -0,0 +1,317 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code + added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */ + +#include "ma_ftdefs.h" +#include "maria_ft_test1.h" +#include <my_getopt.h> + +static int key_field=FIELD_VARCHAR,extra_field=FIELD_SKIP_ENDSPACE; +static uint key_length=200,extra_length=50; +static int key_type=HA_KEYTYPE_TEXT; +static int verbose=0,silent=0,skip_update=0, + no_keys=0,no_stopwords=0,no_search=0,no_fulltext=0; +static int create_flag=0,error=0; + +#define MAX_REC_LENGTH 300 +static char record[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH]; + +static int run_test(const char *filename); +static void get_options(int argc, char *argv[]); +static void create_record(char *, int); +static void usage(); + +static struct my_option my_long_options[] = +{ + {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 's', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'N', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'K', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'F', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", 'U', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"", '#', "", 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + +int main(int argc, char *argv[]) +{ + MY_INIT(argv[0]); + + get_options(argc,argv); + maria_init(); + + exit(run_test("FT1")); +} + +static MARIA_COLUMNDEF recinfo[3]; +static MARIA_KEYDEF keyinfo[2]; +static HA_KEYSEG keyseg[10]; + +static int run_test(const char *filename) +{ + MARIA_HA *file; + int i,j; + my_off_t pos; + + bzero((char*) recinfo,sizeof(recinfo)); + 
+ /* First define 2 columns */ + recinfo[0].type=extra_field; + recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : + extra_length); + if (extra_field == FIELD_VARCHAR) + recinfo[0].length+= HA_VARCHAR_PACKLENGTH(extra_length); + recinfo[1].type=key_field; + recinfo[1].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr : + key_length); + if (key_field == FIELD_VARCHAR) + recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length); + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].seg[0].type= key_type; + keyinfo[0].seg[0].flag= (key_field == FIELD_BLOB) ? HA_BLOB_PART: + (key_field == FIELD_VARCHAR) ? HA_VAR_LENGTH_PART:0; + keyinfo[0].seg[0].start=recinfo[0].length; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit= 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language= default_charset_info->number; + keyinfo[0].flag = (no_fulltext?HA_PACK_KEY:HA_FULLTEXT); + + if (!silent) + printf("- Creating isam-file\n"); + if (maria_create(filename,(no_keys?0:1),keyinfo,2,recinfo,0,NULL, + (MARIA_CREATE_INFO*) 0, create_flag)) + goto err; + if (!(file=maria_open(filename,2,0))) + goto err; + + if (!silent) + printf("- %s stopwords\n",no_stopwords?"Skipping":"Initializing"); + maria_ft_init_stopwords(no_stopwords?NULL:maria_ft_precompiled_stopwords); + + if (!silent) + printf("- Writing key:s\n"); + + my_errno=0; + for (i=NUPD ; i<NDATAS; i++ ) + { + create_record(record,i); + error=maria_write(file,record); + if (verbose || error) + printf("I= %2d maria_write: %d errno: %d, record: %s\n", + i,error,my_errno,data[i].f0); + } + + if (!skip_update) + { + if (!silent) + printf("- Updating rows\n"); + + /* Read through all rows and update them */ + pos=(ha_rows) 0; + i=0; + while ((error=maria_rrnd(file,read_record,pos)) == 0) + { + create_record(record,NUPD-i-1); + if (maria_update(file,read_record,record)) + 
{ + printf("Can't update row: %.*s, error: %d\n", + keyinfo[0].seg[0].length,record,my_errno); + } + if(++i == NUPD) break; + pos=HA_OFFSET_ERROR; + } + if (i != NUPD) + printf("Found %d of %d rows\n", i,NUPD); + } + + if (maria_close(file)) goto err; + if(no_search) return 0; + if (!silent) + printf("- Reopening file\n"); + if (!(file=maria_open(filename,2,0))) goto err; + if (!silent) + printf("- Reading rows with key\n"); + for (i=0 ; i < NQUERIES ; i++) + { + FT_DOCLIST *result; + result=maria_ft_nlq_init_search(file,0,(char*) query[i],strlen(query[i]),1); + if(!result) + { + printf("Query %d: `%s' failed with errno %3d\n",i,query[i],my_errno); + continue; + } + printf("Query %d: `%s'. Found: %d. Top five documents:\n", + i,query[i],result->ndocs); + for (j=0;j<5;j++) + { + double w; int err; + err= maria_ft_nlq_read_next(result, read_record); + if (err==HA_ERR_END_OF_FILE) + { + printf("No more matches!\n"); + break; + } + else if (err) + { + printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno); + break; + } + w=maria_ft_nlq_get_relevance(result); + if (key_field == FIELD_VARCHAR) + { + uint l; + char *p; + p=recinfo[0].length+read_record; + l=uint2korr(p); + printf("%10.7f: %.*s\n",w,(int) l,p+2); + } + else + printf("%10.7f: %.*s\n",w,recinfo[1].length, + recinfo[0].length+read_record); + } + maria_ft_nlq_close_search(result); + } + + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return (0); +err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ +} + +static char blob_key[MAX_REC_LENGTH]; +/* static char blob_record[MAX_REC_LENGTH+20*20]; */ + +void create_record(char *pos, int n) +{ + bzero((char*) pos,MAX_REC_LENGTH); + if (recinfo[0].type == FIELD_BLOB) + { + uint tmp; + char *ptr; + strnmov(blob_key,data[n].f0,keyinfo[0].seg[0].length); + tmp=strlen(blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + 
pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + uint tmp; + /* -1 is here because pack_length is stored in seg->length */ + uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1); + strnmov(pos+pack_length,data[n].f0,keyinfo[0].seg[0].length); + tmp=strlen(pos+pack_length); + if (pack_length == 1) + *pos= (char) tmp; + else + int2store(pos,tmp); + pos+=recinfo[0].length; + } + else + { + strnmov(pos,data[n].f0,keyinfo[0].seg[0].length); + pos+=recinfo[0].length; + } + if (recinfo[1].type == FIELD_BLOB) + { + uint tmp; + char *ptr; + strnmov(blob_key,data[n].f2,keyinfo[0].seg[0].length); + tmp=strlen(blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy_fixed(pos+4,&ptr,sizeof(char*)); + pos+=recinfo[1].length; + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + uint tmp; + /* -1 is here because pack_length is stored in seg->length */ + uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1); + strnmov(pos+pack_length,data[n].f2,keyinfo[0].seg[0].length); + tmp=strlen(pos+1); + if (pack_length == 1) + *pos= (char) tmp; + else + int2store(pos,tmp); + pos+=recinfo[1].length; + } + else + { + strnmov(pos,data[n].f2,keyinfo[0].seg[0].length); + pos+=recinfo[1].length; + } +} + + +static my_bool +get_one_option(int optid, const struct my_option *opt __attribute__((unused)), + char *argument) +{ + switch(optid) { + case 'v': verbose=1; break; + case 's': silent=1; break; + case 'F': no_fulltext=1; no_search=1; + case 'U': skip_update=1; break; + case 'K': no_keys=no_search=1; break; + case 'N': no_search=1; break; + case 'S': no_stopwords=1; break; + case '#': + DBUG_PUSH (argument); + break; + case 'V': + case '?': + case 'h': + usage(); + exit(1); + } + return 0; +} + +/* Read options */ + +static void get_options(int argc,char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + return; +} /* get options */ + + +static void usage() 
+{ + printf("%s [options]\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} diff --git a/storage/maria/ma_ft_test1.h b/storage/maria/ma_ft_test1.h new file mode 100644 index 00000000..df86eece --- /dev/null +++ b/storage/maria/ma_ft_test1.h @@ -0,0 +1,420 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +#define NUPD 20 +#define NDATAS 389 +struct { const char *f0, *f2; } data[NDATAS] = { + {"1", "General Information about MySQL"}, + {"1.1", "What is MySQL?"}, + {"1.2", "About this manual"}, + {"1.3", "History of MySQL"}, + {"1.4", "The main features of MySQL"}, + {"1.5", "General SQL information and tutorials"}, + {"1.6", "Useful MySQL-related links"}, + {"1.7", "What are stored procedures and triggers and so on?"}, + {"2", "MySQL mailing lists and how to ask questions/give error (bug) reports"}, + {"2.1", "Subscribing to/un-subscribing from the MySQL mailing list"}, + {"2.2", "Asking questions or reporting bugs"}, + {"2.3", "I think I have found a bug. 
What information do you need to help me?"}, + {"2.3.1", "MySQL keeps crashing"}, + {"2.4", "Guidelines for answering questions on the mailing list"}, + {"3", "Licensing or When do I have/want to pay for MySQL?"}, + {"3.1", "How much does MySQL cost?"}, + {"3.2", "How do I get commercial support?"}, + {"3.2.1", "Types of commercial support"}, + {"3.2.1.1", "Basic email support"}, + {"3.2.1.2", "Extended email support"}, +/*------------------------------- NUPD=20 -------------------------------*/ + {"3.2.1.3", "Asking: Login support"}, + {"3.2.1.4", "Extended login support"}, + {"3.3", "How do I pay for licenses/support?"}, + {"3.4", "Who do I contact when I want more information about licensing/support?"}, + {"3.5", "What Copyright does MySQL use?"}, + {"3.6", "When may I distribute MySQL commercially without a fee?"}, + {"3.7", "I want to sell a product that can be configured to use MySQL"}, + {"3.8", "I am running a commercial web server using MySQL"}, + {"3.9", "Do I need a license to sell commercial Perl/tcl/PHP/Web+ etc applications?"}, + {"3.10", "Possible future changes in the licensing"}, + {"4", "Compiling and installing MySQL"}, + {"4.1", "How do I get MySQL?"}, + {"4.2", "Which MySQL version should I use?"}, + {"4.3", "How/when will you release updates?"}, + {"4.4", "What operating systems does MySQL support?"}, + {"4.5", "Compiling MySQL from source code"}, + {"4.5.1", "Quick installation overview"}, + {"4.5.2", "Usual configure switches"}, + {"4.5.3", "Applying a patch"}, + {"4.6", "Problems compiling?"}, + {"4.7", "General compilation notes"}, + {"4.8", "MIT-pthreads notes (FreeBSD)"}, + {"4.9", "Perl installation comments"}, + {"4.10", "Special things to consider for some machine/OS combinations"}, + {"4.10.1", "Solaris notes"}, + {"4.10.2", "SunOS 4 notes"}, + {"4.10.3", "Linux notes for all versions"}, + {"4.10.3.1", "Linux-x86 notes"}, + {"4.10.3.2", "RedHat 5.0"}, + {"4.10.3.3", "RedHat 5.1"}, + {"4.10.3.4", "Linux-Sparc notes"}, + {"4.10.3.5", 
"Linux-Alpha notes"}, + {"4.10.3.6", "MkLinux notes"}, + {"4.10.4", "Alpha-DEC-Unix notes"}, + {"4.10.5", "Alpha-DEC-OSF1 notes"}, + {"4.10.6", "SGI-IRIX notes"}, + {"4.10.7", "FreeBSD notes"}, + {"4.10.7.1", "FreeBSD-3.0 notes"}, + {"4.10.8", "BSD/OS 2.# notes"}, + {"4.10.8.1", "BSD/OS 3.# notes"}, + {"4.10.9", "SCO notes"}, + {"4.10.10", "SCO Unixware 7.0 notes"}, + {"4.10.11", "IBM-AIX notes"}, + {"4.10.12", "HP-UX notes"}, + {"4.11", "TcX binaries"}, + {"4.12", "Win32 notes"}, + {"4.13", "Installation instructions for MySQL binary releases"}, + {"4.13.1", "How to get MySQL Perl support working"}, + {"4.13.2", "Linux notes"}, + {"4.13.3", "HP-UX notes"}, + {"4.13.4", "Linking client libraries"}, + {"4.14", "Problems running mysql_install_db"}, + {"4.15", "Problems starting MySQL"}, + {"4.16", "Automatic start/stop of MySQL"}, + {"4.17", "Option files"}, + {"5", "How standards-compatible is MySQL?"}, + {"5.1", "What extensions has MySQL to ANSI SQL92?"}, + {"5.2", "What functionality is missing in MySQL?"}, + {"5.2.1", "Sub-selects"}, + {"5.2.2", "SELECT INTO TABLE"}, + {"5.2.3", "Transactions"}, + {"5.2.4", "Triggers"}, + {"5.2.5", "Foreign Keys"}, + {"5.2.5.1", "Some reasons NOT to use FOREIGN KEYS"}, + {"5.2.6", "Views"}, + {"5.2.7", "-- as start of a comment"}, + {"5.3", "What standards does MySQL follow?"}, + {"5.4", "What functions exist only for compatibility?"}, + {"5.5", "Limitations of BLOB and TEXT types"}, + {"5.6", "How to cope without COMMIT-ROLLBACK"}, + {"6", "The MySQL access privilege system"}, + {"6.1", "What the privilege system does"}, + {"6.2", "Connecting to the MySQL server"}, + {"6.2.1", "Keeping your password secure"}, + {"6.3", "Privileges provided by MySQL"}, + {"6.4", "How the privilege system works"}, + {"6.5", "The privilege tables"}, + {"6.6", "Setting up the initial MySQL privileges"}, + {"6.7", "Adding new user privileges to MySQL"}, + {"6.8", "An example permission setup"}, + {"6.9", "Causes of Access denied errors"}, + {"6.10", 
"How to make MySQL secure against crackers"}, + {"7", "MySQL language reference"}, + {"7.1", "Literals: how to write strings and numbers"}, + {"7.1.1", "Strings"}, + {"7.1.2", "Numbers"}, + {"7.1.3", "NULL values"}, + {"7.1.4", "Database, table, index, column and alias names"}, + {"7.1.4.1", "Case sensitivity in names"}, + {"7.2", "Column types"}, + {"7.2.1", "Column type storage requirements"}, + {"7.2.5", "Numeric types"}, + {"7.2.6", "Date and time types"}, + {"7.2.6.1", "The DATE type"}, + {"7.2.6.2", "The TIME type"}, + {"7.2.6.3", "The DATETIME type"}, + {"7.2.6.4", "The TIMESTAMP type"}, + {"7.2.6.5", "The YEAR type"}, + {"7.2.6.6", "Miscellaneous date and time properties"}, + {"7.2.7", "String types"}, + {"7.2.7.1", "The CHAR and VARCHAR types"}, + {"7.2.7.2", "The BLOB and TEXT types"}, + {"7.2.7.3", "The ENUM type"}, + {"7.2.7.4", "The SET type"}, + {"7.2.8", "Choosing the right type for a column"}, + {"7.2.9", "Column indexes"}, + {"7.2.10", "Multiple-column indexes"}, + {"7.2.11", "Using column types from other database engines"}, + {"7.3", "Functions for use in SELECT and WHERE clauses"}, + {"7.3.1", "Grouping functions"}, + {"7.3.2", "Normal arithmetic operations"}, + {"7.3.3", "Bit functions"}, + {"7.3.4", "Logical operations"}, + {"7.3.5", "Comparison operators"}, + {"7.3.6", "String comparison functions"}, + {"7.3.7", "Control flow functions"}, + {"7.3.8", "Mathematical functions"}, + {"7.3.9", "String functions"}, + {"7.3.10", "Date and time functions"}, + {"7.3.11", "Miscellaneous functions"}, + {"7.3.12", "Functions for use with GROUP BY clauses"}, + {"7.4", "CREATE DATABASE syntax"}, + {"7.5", "DROP DATABASE syntax"}, + {"7.6", "CREATE TABLE syntax"}, + {"7.7", "ALTER TABLE syntax"}, + {"7.8", "OPTIMIZE TABLE syntax"}, + {"7.9", "DROP TABLE syntax"}, + {"7.10", "DELETE syntax"}, + {"7.11", "SELECT syntax"}, + {"7.12", "JOIN syntax"}, + {"7.13", "INSERT syntax"}, + {"7.14", "REPLACE syntax"}, + {"7.15", "LOAD DATA INFILE syntax"}, + {"7.16", 
"UPDATE syntax"}, + {"7.17", "USE syntax"}, + {"7.18", "SHOW syntax (Get information about tables, columns...)"}, + {"7.19", "EXPLAIN syntax (Get information about a SELECT)"}, + {"7.20", "DESCRIBE syntax (Get information about columns)"}, + {"7.21", "LOCK TABLES/UNLOCK TABLES syntax"}, + {"7.22", "SET OPTION syntax"}, + {"7.23", "GRANT syntax (Compatibility function)"}, + {"7.24", "CREATE INDEX syntax (Compatibility function)"}, + {"7.25", "DROP INDEX syntax (Compatibility function)"}, + {"7.26", "Comment syntax"}, + {"7.27", "CREATE FUNCTION/DROP FUNCTION syntax"}, + {"7.28", "Is MySQL picky about reserved words?"}, + {"8", "Example SQL queries"}, + {"8.1", "Queries from twin project"}, + {"8.1.1", "Find all non-distributed twins"}, + {"8.1.2", "Show a table on twin pair status"}, + {"9", "How safe/stable is MySQL?"}, + {"9.1", "How stable is MySQL?"}, + {"9.2", "Why are there is so many releases of MySQL?"}, + {"9.3", "Checking a table for errors"}, + {"9.4", "How to repair tables"}, + {"9.5", "Is there anything special to do when upgrading/downgrading MySQL?"}, + {"9.5.1", "Upgrading from a 3.21 version to 3.22"}, + {"9.5.2", "Upgrading from a 3.20 version to 3.21"}, + {"9.5.3", "Upgrading to another architecture"}, + {"9.6", "Year 2000 compliance"}, + {"10", "MySQL Server functions"}, + {"10.1", "What languages are supported by MySQL?"}, + {"10.1.1", "Character set used for data & sorting"}, + {"10.2", "The update log"}, + {"10.3", "How big can MySQL tables be?"}, + {"11", "Getting maximum performance from MySQL"}, + {"11.1", "How does one change the size of MySQL buffers?"}, + {"11.2", "How compiling and linking affects the speed of MySQL"}, + {"11.3", "How does MySQL use memory?"}, + {"11.4", "How does MySQL use indexes?"}, + {"11.5", "What optimizations are done on WHERE clauses?"}, + {"11.6", "How does MySQL open & close tables?"}, + {"11.6.0.1", "What are the drawbacks of creating possibly thousands of tables in a database?"}, + {"11.7", "How does MySQL 
lock tables?"}, + {"11.8", "How should I arrange my table to be as fast/small as possible?"}, + {"11.9", "What affects the speed of INSERT statements?"}, + {"11.10", "What affects the speed DELETE statements?"}, + {"11.11", "How do I get MySQL to run at full speed?"}, + {"11.12", "What are the different row formats? Or, when should VARCHAR/CHAR be used?"}, + {"11.13", "Why so many open tables?"}, + {"12", "MySQL benchmark suite"}, + {"13", "MySQL Utilites"}, + {"13.1", "Overview of the different MySQL programs"}, + {"13.2", "The MySQL table check, optimize and repair program"}, + {"13.2.1", "isamchk memory use"}, + {"13.2.2", "Getting low-level table information"}, + {"13.3", "The MySQL compressed read-only table generator"}, + {"14", "Adding new functions to MySQL"}, + {"15", "MySQL ODBC Support"}, + {"15.1", "Operating systems supported by MyODBC"}, + {"15.2", "How to report problems with MyODBC"}, + {"15.3", "Programs known to work with MyODBC"}, + {"15.4", "How to fill in the various fields in the ODBC administrator program"}, + {"15.5", "How to get the value of an AUTO_INCREMENT column in ODBC"}, + {"16", "Problems and common errors"}, + {"16.1", "Some common errors when using MySQL"}, + {"16.1.1", "MySQL server has gone away error"}, + {"16.1.2", "Can't connect to local MySQL server error"}, + {"16.1.3", "Out of memory error"}, + {"16.1.4", "Packet too large error"}, + {"16.1.5", "The table is full error"}, + {"16.1.6", "Commands out of sync error in client"}, + {"16.1.7", "Removing user error"}, + {"16.2", "How MySQL handles a full disk"}, + {"16.3", "How to run SQL commands from a text file"}, + {"16.4", "Where MySQL stores temporary files"}, + {"16.5", "Access denied error"}, + {"16.6", "How to run MySQL as a normal user"}, + {"16.7", "Problems with file permissions"}, + {"16.8", "File not found"}, + {"16.9", "Problems using DATE columns"}, + {"16.10", "Case sensitivity in searches"}, + {"16.11", "Problems with NULL values"}, + {"17", "Solving some common 
problems with MySQL"}, + {"17.1", "Database replication"}, + {"17.2", "Database backups"}, + {"18", "MySQL client tools and API's"}, + {"18.1", "MySQL C API"}, + {"18.2", "C API datatypes"}, + {"18.3", "C API function overview"}, + {"18.4", "C API function descriptions"}, + {"18.4.1", "mysql_affected_rows()"}, + {"18.4.2", "mysql_close()"}, + {"18.4.3", "mysql_connect()"}, + {"18.4.4", "mysql_create_db()"}, + {"18.4.5", "mysql_data_seek()"}, + {"18.4.6", "mysql_debug()"}, + {"18.4.7", "mysql_drop_db()"}, + {"18.4.8", "mysql_dump_debug_info()"}, + {"18.4.9", "mysql_eof()"}, + {"18.4.10", "mysql_errno()"}, + {"18.4.11", "mysql_error()"}, + {"18.4.12", "mysql_escape_string()"}, + {"18.4.13", "mysql_fetch_field()"}, + {"18.4.14", "mysql_fetch_fields()"}, + {"18.4.15", "mysql_fetch_field_direct()"}, + {"18.4.16", "mysql_fetch_lengths()"}, + {"18.4.17", "mysql_fetch_row()"}, + {"18.4.18", "mysql_field_seek()"}, + {"18.4.19", "mysql_field_tell()"}, + {"18.4.20", "mysql_free_result()"}, + {"18.4.21", "mysql_get_client_info()"}, + {"18.4.22", "mysql_get_host_info()"}, + {"18.4.23", "mysql_get_proto_info()"}, + {"18.4.24", "mysql_get_server_info()"}, + {"18.4.25", "mysql_info()"}, + {"18.4.26", "mysql_init()"}, + {"18.4.27", "mysql_insert_id()"}, + {"18.4.28", "mysql_kill()"}, + {"18.4.29", "mysql_list_dbs()"}, + {"18.4.30", "mysql_list_fields()"}, + {"18.4.31", "mysql_list_processes()"}, + {"18.4.32", "mysql_list_tables()"}, + {"18.4.33", "mysql_num_fields()"}, + {"18.4.34", "mysql_num_rows()"}, + {"18.4.35", "mysql_query()"}, + {"18.4.36", "mysql_real_connect()"}, + {"18.4.37", "mysql_real_query()"}, + {"18.4.38", "mysql_reload()"}, + {"18.4.39", "mysql_row_tell()"}, + {"18.4.40", "mysql_select_db()"}, + {"18.4.41", "mysql_shutdown()"}, + {"18.4.42", "mysql_stat()"}, + {"18.4.43", "mysql_store_result()"}, + {"18.4.44", "mysql_thread_id()"}, + {"18.4.45", "mysql_use_result()"}, + {"18.4.46", "Why is it that after mysql_query() returns success, mysql_store_result() sometimes 
returns NULL?"}, + {"18.4.47", "What results can I get from a query?"}, + {"18.4.48", "How can I get the unique ID for the last inserted row?"}, + {"18.4.49", "Problems linking with the C API"}, + {"18.4.50", "How to make a thread-safe client"}, + {"18.5", "MySQL Perl API's"}, + {"18.5.1", "DBI with DBD::MariaDB"}, + {"18.5.1.1", "The DBI interface"}, + {"18.5.1.2", "More DBI/DBD information"}, + {"18.6", "MySQL Java connectivity (JDBC)"}, + {"18.7", "MySQL PHP API's"}, + {"18.8", "MySQL C++ API's"}, + {"18.9", "MySQL Python API's"}, + {"18.10", "MySQL TCL API's"}, + {"19", "How MySQL compares to other databases"}, + {"19.1", "How MySQL compares to mSQL"}, + {"19.1.1", "How to convert mSQL tools for MySQL"}, + {"19.1.2", "How mSQL and MySQL client/server communications protocols differ"}, + {"19.1.3", "How mSQL 2.0 SQL syntax differs from MySQL"}, + {"19.2", "How MySQL compares to PostgreSQL"}, + {"A", "Some users of MySQL"}, + {"B", "Contributed programs"}, + {"C", "Contributors to MySQL"}, + {"D", "MySQL change history"}, + {"19.3", "Changes in release 3.22.x (Alpha version)"}, + {"19.3.1", "Changes in release 3.22.7"}, + {"19.3.2", "Changes in release 3.22.6"}, + {"19.3.3", "Changes in release 3.22.5"}, + {"19.3.4", "Changes in release 3.22.4"}, + {"19.3.5", "Changes in release 3.22.3"}, + {"19.3.6", "Changes in release 3.22.2"}, + {"19.3.7", "Changes in release 3.22.1"}, + {"19.3.8", "Changes in release 3.22.0"}, + {"19.4", "Changes in release 3.21.x"}, + {"19.4.1", "Changes in release 3.21.33"}, + {"19.4.2", "Changes in release 3.21.32"}, + {"19.4.3", "Changes in release 3.21.31"}, + {"19.4.4", "Changes in release 3.21.30"}, + {"19.4.5", "Changes in release 3.21.29"}, + {"19.4.6", "Changes in release 3.21.28"}, + {"19.4.7", "Changes in release 3.21.27"}, + {"19.4.8", "Changes in release 3.21.26"}, + {"19.4.9", "Changes in release 3.21.25"}, + {"19.4.10", "Changes in release 3.21.24"}, + {"19.4.11", "Changes in release 3.21.23"}, + {"19.4.12", "Changes in 
release 3.21.22"}, + {"19.4.13", "Changes in release 3.21.21a"}, + {"19.4.14", "Changes in release 3.21.21"}, + {"19.4.15", "Changes in release 3.21.20"}, + {"19.4.16", "Changes in release 3.21.19"}, + {"19.4.17", "Changes in release 3.21.18"}, + {"19.4.18", "Changes in release 3.21.17"}, + {"19.4.19", "Changes in release 3.21.16"}, + {"19.4.20", "Changes in release 3.21.15"}, + {"19.4.21", "Changes in release 3.21.14b"}, + {"19.4.22", "Changes in release 3.21.14a"}, + {"19.4.23", "Changes in release 3.21.13"}, + {"19.4.24", "Changes in release 3.21.12"}, + {"19.4.25", "Changes in release 3.21.11"}, + {"19.4.26", "Changes in release 3.21.10"}, + {"19.4.27", "Changes in release 3.21.9"}, + {"19.4.28", "Changes in release 3.21.8"}, + {"19.4.29", "Changes in release 3.21.7"}, + {"19.4.30", "Changes in release 3.21.6"}, + {"19.4.31", "Changes in release 3.21.5"}, + {"19.4.32", "Changes in release 3.21.4"}, + {"19.4.33", "Changes in release 3.21.3"}, + {"19.4.34", "Changes in release 3.21.2"}, + {"19.4.35", "Changes in release 3.21.0"}, + {"19.5", "Changes in release 3.20.x"}, + {"19.5.1", "Changes in release 3.20.18"}, + {"19.5.2", "Changes in release 3.20.17"}, + {"19.5.3", "Changes in release 3.20.16"}, + {"19.5.4", "Changes in release 3.20.15"}, + {"19.5.5", "Changes in release 3.20.14"}, + {"19.5.6", "Changes in release 3.20.13"}, + {"19.5.7", "Changes in release 3.20.11"}, + {"19.5.8", "Changes in release 3.20.10"}, + {"19.5.9", "Changes in release 3.20.9"}, + {"19.5.10", "Changes in release 3.20.8"}, + {"19.5.11", "Changes in release 3.20.7"}, + {"19.5.12", "Changes in release 3.20.6"}, + {"19.5.13", "Changes in release 3.20.3"}, + {"19.5.14", "Changes in release 3.20.0"}, + {"19.6", "Changes in release 3.19.x"}, + {"19.6.1", "Changes in release 3.19.5"}, + {"19.6.2", "Changes in release 3.19.4"}, + {"19.6.3", "Changes in release 3.19.3"}, + {"E", "Known errors and design deficiencies in MySQL"}, + {"F", "List of things we want to add to MySQL in the future (The 
TODO)"}, + {"19.7", "Things that must done in the real near future"}, + {"19.8", "Things that have to be done sometime"}, + {"19.9", "Some things we don't have any plans to do"}, + {"G", "Comments on porting to other systems"}, + {"19.10", "Debugging MySQL"}, + {"19.11", "Comments about RTS threads"}, + {"19.12", "What is the difference between different thread packages?"}, + {"H", "Description of MySQL regular expression syntax"}, + {"I", "What is Unireg?"}, + {"J", "The MySQL server license"}, + {"K", "The MySQL license for Microsoft operating systems"}, + {"*", "SQL command, type and function index"}, + {"*", "Concept Index"} +}; + +#define NQUERIES 5 +const char *query[NQUERIES]={ + "mysql information and manual", + "upgrading from previous version", + "column indexes", + "against about after more right the with/without", /* stopwords test */ + "mysql license and copyright" +}; diff --git a/storage/maria/ma_ft_update.c b/storage/maria/ma_ft_update.c new file mode 100644 index 00000000..868e15f9 --- /dev/null +++ b/storage/maria/ma_ft_update.c @@ -0,0 +1,372 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +/* functions to work with full-text indices */ + +#include "ma_ftdefs.h" +#include <math.h> + /* Prepare ftsi to walk the key segments of key keynr, reading field data from record */ +void _ma_ft_segiterator_init(MARIA_HA *info, uint keynr, const uchar *record, + FT_SEG_ITERATOR *ftsi) +{ + DBUG_ENTER("_ma_ft_segiterator_init"); + + ftsi->num=info->s->keyinfo[keynr].keysegs; + ftsi->seg=info->s->keyinfo[keynr].seg; + ftsi->rec=record; + ftsi->pos= 0; /* Avoid warnings from gcc */ + ftsi->len= 0; /* Avoid warnings from gcc */ + DBUG_VOID_RETURN; +} + /* Prepare ftsi to yield exactly one item, the caller-supplied buffer record of length len; seg is 0 so no key segment is consulted */ +void _ma_ft_segiterator_dummy_init(const uchar *record, uint len, + FT_SEG_ITERATOR *ftsi) +{ + DBUG_ENTER("_ma_ft_segiterator_dummy_init"); + + ftsi->num=1; + ftsi->seg=0; + ftsi->pos=record; + ftsi->len=len; + DBUG_VOID_RETURN; +} + +/* + Advances ftsi to the next item and sets ftsi->pos and ftsi->len for it. + + This function breaks the convention "return 0 on success" + but it's easier to use like this + + while(_ma_ft_segiterator()) + + so "1" means "OK", "0" means "EOF". + A field whose null bit is set is reported as "OK" with ftsi->pos == 0. +*/ + +uint _ma_ft_segiterator(register FT_SEG_ITERATOR *ftsi) +{ + DBUG_ENTER("_ma_ft_segiterator"); + + if (!ftsi->num) + DBUG_RETURN(0); + + ftsi->num--; + if (!ftsi->seg) + DBUG_RETURN(1); /* no segment array (see _ma_ft_segiterator_dummy_init): pos and len stay as preset */ + /* the segment pointer is decremented before it is read, so segments are visited in descending address order */ + ftsi->seg--; + + if (ftsi->seg->null_bit && + (ftsi->rec[ftsi->seg->null_pos] & ftsi->seg->null_bit)) + { + ftsi->pos=0; /* null bit set in the record: report the field as NULL */ + DBUG_RETURN(1); + } + ftsi->pos= ftsi->rec+ftsi->seg->start; + if (ftsi->seg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= (ftsi->seg->bit_start); + ftsi->len= (pack_length == 1 ? (uint) * ftsi->pos : + uint2korr(ftsi->pos)); + ftsi->pos+= pack_length; /* Skip VARCHAR length */ + DBUG_RETURN(1); + } + if (ftsi->seg->flag & HA_BLOB_PART) + { + ftsi->len= _ma_calc_blob_length(ftsi->seg->bit_start,ftsi->pos); + memcpy((char**) &ftsi->pos, ftsi->pos+ftsi->seg->bit_start, sizeof(char*)); /* replace pos with the data pointer stored after the bit_start length bytes */ + DBUG_RETURN(1); + } + ftsi->len=ftsi->seg->length; /* neither VARCHAR nor BLOB: use the segment length */ + DBUG_RETURN(1); +} + + +/* parses a document i.e. 
calls maria_ft_parse for every keyseg */ + /* Returns 1 as soon as maria_ft_parse() fails for a segment, otherwise 0. */ +uint _ma_ft_parse(TREE *parsed, MARIA_HA *info, uint keynr, const uchar *record, + MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root) +{ + FT_SEG_ITERATOR ftsi; + struct st_mysql_ftparser *parser; + DBUG_ENTER("_ma_ft_parse"); + + _ma_ft_segiterator_init(info, keynr, record, &ftsi); + + maria_ft_parse_init(parsed, info->s->keyinfo[keynr].seg->charset); + parser= info->s->keyinfo[keynr].parser; + while (_ma_ft_segiterator(&ftsi)) + { + /** @todo this casts ftsi.pos (const) to non-const */ + if (ftsi.pos) /* pos == 0 means the field is NULL: nothing to parse */ + if (maria_ft_parse(parsed, (uchar *)ftsi.pos, ftsi.len, parser, param, + mem_root)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + /* Parses all key segments of record for key keynr. Returns NULL if the parser cannot be initialized or parsing fails, otherwise the result of maria_ft_linearize(). */ +FT_WORD * _ma_ft_parserecord(MARIA_HA *info, uint keynr, const uchar *record, + MEM_ROOT *mem_root) +{ + TREE ptree; + MYSQL_FTPARSER_PARAM *param; + DBUG_ENTER("_ma_ft_parserecord"); + if (! (param= maria_ftparser_call_initializer(info, keynr, 0))) + DBUG_RETURN(NULL); + bzero((char*) &ptree, sizeof(ptree)); + param->flags= 0; + if (_ma_ft_parse(&ptree, info, keynr, record, param, mem_root)) + DBUG_RETURN(NULL); + + DBUG_RETURN(maria_ft_linearize(&ptree, mem_root)); +} + /* Writes one index key per word of wlist (the list ends at the entry with pos == 0). Stops at the first failed write and returns 1; returns 0 if all writes succeed. */ +static int _ma_ft_store(MARIA_HA *info, uint keynr, uchar *keybuf, + FT_WORD *wlist, my_off_t filepos) +{ + DBUG_ENTER("_ma_ft_store"); + + for (; wlist->pos; wlist++) + { + MARIA_KEY key; + _ma_ft_make_key(info, &key, keynr, keybuf, wlist, filepos); + if (_ma_ck_write(info, &key)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + /* Deletes the index key of every word of wlist. Keeps going after a failed delete and returns 1 if any delete failed, otherwise 0. */ +static int _ma_ft_erase(MARIA_HA *info, uint keynr, uchar *keybuf, + FT_WORD *wlist, my_off_t filepos) +{ + uint err=0; + DBUG_ENTER("_ma_ft_erase"); + + for (; wlist->pos; wlist++) + { + MARIA_KEY key; + _ma_ft_make_key(info, &key, keynr, keybuf, wlist, filepos); + if (_ma_ck_delete(info, &key)) + err=1; + } + DBUG_RETURN(err); +} + +/* + Compares the appropriate parts of two WORD_KEY keys directly out of the records; + returns 1 if they are different +*/ + +#define THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT 1 
+#define GEE_THEY_ARE_ABSOLUTELY_IDENTICAL 0 + /* Walks the key segments of rec1 and rec2 in step and reports a difference at the first pair of segments that does not match */ +int _ma_ft_cmp(MARIA_HA *info, uint keynr, const uchar *rec1, const uchar *rec2) +{ + FT_SEG_ITERATOR ftsi1, ftsi2; + CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset; + DBUG_ENTER("_ma_ft_cmp"); + + _ma_ft_segiterator_init(info, keynr, rec1, &ftsi1); + _ma_ft_segiterator_init(info, keynr, rec2, &ftsi2); + + while (_ma_ft_segiterator(&ftsi1) && _ma_ft_segiterator(&ftsi2)) + { /* equal pointers need no comparison; otherwise the pair differs if either side is NULL or ha_compare_word() returns nonzero */ + if ((ftsi1.pos != ftsi2.pos) && + (!ftsi1.pos || !ftsi2.pos || + ha_compare_word(cs, ftsi1.pos, ftsi1.len, ftsi2.pos, ftsi2.len))) + DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT); + } + DBUG_RETURN(GEE_THEY_ARE_ABSOLUTELY_IDENTICAL); +} + + +/* update a document entry + + Parses oldrec and newrec into word lists and walks both lists in step: + a key is deleted for an old word that compares lower, a key is written for a + new word that compares lower, and both are done when the same word has a + weight that differs by more than 1.e-5. + Returns 0 on success and a nonzero value on failure. + NOTE(review): the walk presumably relies on both lists being ordered by + ha_compare_word() - confirm against maria_ft_linearize(). +*/ + +int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf, + const uchar *oldrec, const uchar *newrec, my_off_t pos) +{ + int error= -1; + FT_WORD *oldlist,*newlist, *old_word, *new_word; + CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset; + int cmp, cmp2; + DBUG_ENTER("_ma_ft_update"); + + if (!(old_word=oldlist=_ma_ft_parserecord(info, keynr, oldrec, + &info->ft_memroot)) || + !(new_word=newlist=_ma_ft_parserecord(info, keynr, newrec, + &info->ft_memroot))) + goto err; + + error=0; + while(old_word->pos && new_word->pos) + { + cmp= ha_compare_word(cs, (uchar*) old_word->pos, old_word->len, + (uchar*) new_word->pos, new_word->len); + cmp2= cmp ? 
0 : (fabs(old_word->weight - new_word->weight) > 1.e-5); /* cmp2: same word but the weight changed */ + /* old word compares lower, or the weight changed: delete the old key */ + if (cmp < 0 || cmp2) + { + MARIA_KEY key; + _ma_ft_make_key(info, &key, keynr, keybuf, old_word, pos); + if (_ma_ck_delete(info, &key)) + { + error= -1; + goto err; + } + } /* new word compares lower, or the weight changed: write the new key */ + if (cmp > 0 || cmp2) + { + MARIA_KEY key; + _ma_ft_make_key(info, &key, keynr, keybuf, new_word,pos); + if ((error= _ma_ck_write(info, &key))) + goto err; + } + if (cmp<=0) old_word++; + if (cmp>=0) new_word++; /* advance the side that was handled; both sides when the words are equal */ + } /* one list is exhausted: erase the remaining old words or store the remaining new words */ + if (old_word->pos) + error= _ma_ft_erase(info,keynr,keybuf,old_word,pos); + else if (new_word->pos) + error= _ma_ft_store(info,keynr,keybuf,new_word,pos); + /* reached on success and on failure, so free_root() runs on every path */ +err: + free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE)); + DBUG_RETURN(error); +} + + +/* adds a document to the collection: parses record and stores one key per word; returns -1 if parsing yields no list, otherwise the _ma_ft_store() result */ + +int _ma_ft_add(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record, + my_off_t pos) +{ + int error= -1; + FT_WORD *wlist; + DBUG_ENTER("_ma_ft_add"); + DBUG_PRINT("enter",("keynr: %d",keynr)); + + if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot))) + error= _ma_ft_store(info,keynr,keybuf,wlist,pos); + free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE)); + DBUG_PRINT("exit",("Return: %d",error)); + DBUG_RETURN(error); +} + + +/* removes a document from the collection: parses record and deletes the key of every word; returns -1 if parsing yields no list, otherwise the _ma_ft_erase() result */ + +int _ma_ft_del(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record, + my_off_t pos) +{ + int error= -1; + FT_WORD *wlist; + DBUG_ENTER("_ma_ft_del"); + DBUG_PRINT("enter",("keynr: %d",keynr)); + + if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot))) + error= _ma_ft_erase(info,keynr,keybuf,wlist,pos); + free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE)); + DBUG_PRINT("exit",("Return: %d",error)); + DBUG_RETURN(error); +} + + /* Builds in key the index key for word wptr of the row at filepos; the weight is stored as 0 when filepos is HA_OFFSET_ERROR */ +MARIA_KEY *_ma_ft_make_key(MARIA_HA *info, MARIA_KEY *key, uint keynr, + uchar *keybuf, + FT_WORD *wptr, my_off_t filepos) +{ + uchar buf[HA_FT_MAXBYTELEN+16]; + float weight=(float) ((filepos==HA_OFFSET_ERROR) ? 
0 : wptr->weight); + DBUG_ENTER("_ma_ft_make_key"); + /* buf layout: weight at offset 0, word length at HA_FT_WLEN, word bytes at HA_FT_WLEN+2 */ + mi_float4store(buf,weight); + int2store(buf+HA_FT_WLEN,wptr->len); + memcpy(buf+HA_FT_WLEN+2,wptr->pos,wptr->len); + /* Can't be spatial so it's ok to call _ma_make_key directly here */ + DBUG_RETURN(_ma_make_key(info, key, keynr, keybuf, buf, filepos, 0)); +} + + +/* + convert key value to ft2 + + Moves the keys collected in info->ft1_to_ft2 into a tree of their own: + one page is filled directly, the remaining keys are inserted one by one, + and the word key is then rewritten to hold the negated key count and the + root of that tree. + Returns 1 if an intermediate step fails, otherwise the result of the + final _ma_ck_real_write_btree() call. +*/ + +my_bool _ma_ft_convert_to_ft2(MARIA_HA *info, MARIA_KEY *key) +{ + MARIA_SHARE *share= info->s; + my_off_t root; + DYNAMIC_ARRAY *da=info->ft1_to_ft2; + MARIA_KEYDEF *keyinfo=&share->ft2_keyinfo; + uchar *key_ptr= (uchar*) dynamic_array_ptr(da, 0), *end; + uint length, key_length; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_KEY tmp_key; + MARIA_PAGE page; + DBUG_ENTER("_ma_ft_convert_to_ft2"); + + /* we'll generate one pageful at once, and insert the rest one-by-one */ + /* calculating the length of this page ...*/ + length=(keyinfo->block_length-2) / keyinfo->keylength; + set_if_smaller(length, (uint)da->elements); + length=length * keyinfo->keylength; /* key count converted to bytes */ + + get_key_full_length_rdonly(key_length, key->data); + while (_ma_ck_delete(info, key) == 0) + { + /* + nothing to do here. + _ma_ck_delete() will populate info->ft1_to_ft2 with deleted keys + */ + } + + /* creating pageful of keys */ + bzero(info->buff, share->keypage_header); + _ma_store_keynr(share, info->buff, keyinfo->key_nr); + _ma_store_page_used(share, info->buff, length + share->keypage_header); + memcpy(info->buff + share->keypage_header, key_ptr, length); + info->keyread_buff_used= info->page_changed=1; /* info->buff is used */ + /** + @todo RECOVERY BUG this is not logged yet. Ok as this code is never + called, but soon it will be. 
+ */ + if ((root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR) + DBUG_RETURN(1); + + _ma_page_setup(&page, info, keyinfo, root, info->buff); + if (_ma_write_keypage(&page, page_link->write_lock, DFLT_INIT_HITS)) + DBUG_RETURN(1); + + /* inserting the rest of key values */ + end= (uchar*) dynamic_array_ptr(da, da->elements); + tmp_key.keyinfo= keyinfo; + tmp_key.data_length= keyinfo->keylength; + tmp_key.ref_length= 0; + tmp_key.flag= 0; + for (key_ptr+=length; key_ptr < end; key_ptr+=keyinfo->keylength) + { + tmp_key.data= key_ptr; + if (_ma_ck_real_write_btree(info, &tmp_key, &root, SEARCH_SAME)) + DBUG_RETURN(1); + } + + /* now, writing the word key entry */ + ft_intXstore(key->data + key_length, - (int) da->elements); + _ma_dpointer(share, key->data + key_length + HA_FT_WLEN, root); + + DBUG_RETURN(_ma_ck_real_write_btree(info, key, + &share->state.key_root[key->keyinfo-> + key_nr], + SEARCH_SAME)); +} diff --git a/storage/maria/ma_ftdefs.h b/storage/maria/ma_ftdefs.h new file mode 100644 index 00000000..90ca6feb --- /dev/null +++ b/storage/maria/ma_ftdefs.h @@ -0,0 +1,156 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. 
Golubchik, who has a shared copyright to this code */ + +/* some definitions for full-text indices */ + +#include "ma_fulltext.h" +#include <m_ctype.h> +#include <my_tree.h> +#include <queues.h> +#include <mysql/plugin.h> + +#define true_word_char(ctype, character) \ + ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \ + (character) == '_') +#define misc_word_char(X) 0 /* always evaluates to 0 */ + +#define FT_MAX_WORD_LEN_FOR_SORT 31 + +#define FTPARSER_MEMROOT_ALLOC_SIZE 65536 + +#define COMPILE_STOPWORDS_IN + +/* Interested readers may consult SMART + (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z) + for an excellent implementation of the vector space model we use. + It also demonstrates the usage of different weighting techniques. + This code, though, is completely original and is not based on the + SMART code but was in some cases inspired by it. + + NORM_PIVOT was taken from the article + A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization", + ACM SIGIR'96, 21-29, 1996 + + The *_IN_USE macros below each select one of the alternative formulas + (LWS_*, PRENORM_*, NORM_*, GWS_*) that are defined after them. + */ + +#define LWS_FOR_QUERY LWS_TF +#define LWS_IN_USE LWS_LOG +#define PRENORM_IN_USE PRENORM_AVG +#define NORM_IN_USE NORM_PIVOT +#define GWS_IN_USE GWS_PROB +/*==============================================================*/ +#define LWS_TF (count) +#define LWS_BINARY (count>0) +#define LWS_SQUARE (count*count) +#define LWS_LOG (count?(log( (double) count)+1):0) +/*--------------------------------------------------------------*/ +#define PRENORM_NONE (p->weight) +#define PRENORM_MAX (p->weight/docstat.max) +#define PRENORM_AUG (0.4+0.6*p->weight/docstat.max) +#define PRENORM_AVG (p->weight/docstat.sum*docstat.uniq) +#define PRENORM_AVGLOG ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq))) +/*--------------------------------------------------------------*/ +#define NORM_NONE (1) +#define NORM_SUM (docstat.nsum) +#define NORM_COS 
(sqrt(docstat.nsum2)) + +#define PIVOT_VAL (0.0115) +#define NORM_PIVOT (1+PIVOT_VAL*docstat.uniq) +/*---------------------------------------------------------------*/ +#define 
GWS_NORM (1/sqrt(sum2)) +#define GWS_GFIDF (sum/doc_cnt) +/* Mysterious, but without the (double) cast used in GWS_IDF1, GWS_IDF performs better :-o */ +#define GWS_IDF log(aio->info->state->records/doc_cnt) +#define GWS_IDF1 log((double)aio->info->state->records/doc_cnt) +#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 ) +#define GWS_FREQ (1.0/doc_cnt) +#define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2) +#define GWS_CUBIC pow(log((double)aio->info->state->records/doc_cnt),3) +#define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records)) +/*=================================================================*/ + +/* Boolean search operators: each one is a character of ft_boolean_syntax[]; + index 9 has no macro here */ +#define FTB_YES (ft_boolean_syntax[0]) +#define FTB_EGAL (ft_boolean_syntax[1]) +#define FTB_NO (ft_boolean_syntax[2]) +#define FTB_INC (ft_boolean_syntax[3]) +#define FTB_DEC (ft_boolean_syntax[4]) +#define FTB_LBR (ft_boolean_syntax[5]) +#define FTB_RBR (ft_boolean_syntax[6]) +#define FTB_NEG (ft_boolean_syntax[7]) +#define FTB_TRUNC (ft_boolean_syntax[8]) +#define FTB_LQUOT (ft_boolean_syntax[10]) +#define FTB_RQUOT (ft_boolean_syntax[11]) + +typedef struct st_maria_ft_word { + const uchar * pos; + uint len; + double weight; +} FT_WORD; + +int is_stopword(const char *word, size_t len); + +MARIA_KEY *_ma_ft_make_key(MARIA_HA *, MARIA_KEY *, uint , uchar *, FT_WORD *, + my_off_t); + +uchar maria_ft_get_word(CHARSET_INFO *, const uchar **, const uchar *, + FT_WORD *, MYSQL_FTPARSER_BOOLEAN_INFO *); +uchar maria_ft_simple_get_word(CHARSET_INFO *, uchar **, const uchar *, + FT_WORD *, my_bool); + +typedef struct _st_maria_ft_seg_iterator { + uint num, len; + HA_KEYSEG *seg; + const uchar *rec, *pos; +} FT_SEG_ITERATOR; + +void _ma_ft_segiterator_init(MARIA_HA *, uint, const uchar *, FT_SEG_ITERATOR *); +void _ma_ft_segiterator_dummy_init(const uchar 
*, uint, FT_SEG_ITERATOR *); +uint _ma_ft_segiterator(FT_SEG_ITERATOR *); + +void 
maria_ft_parse_init(TREE *, CHARSET_INFO *); +int maria_ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser, + MYSQL_FTPARSER_PARAM *, MEM_ROOT *); +FT_WORD * maria_ft_linearize(TREE *, MEM_ROOT *); +FT_WORD * _ma_ft_parserecord(MARIA_HA *, uint, const uchar *, MEM_ROOT *); +uint _ma_ft_parse(TREE *, MARIA_HA *, uint, const uchar *, + MYSQL_FTPARSER_PARAM *, MEM_ROOT *); + +FT_INFO *maria_ft_init_nlq_search(MARIA_HA *, uint, uchar *, uint, uint, + uchar *); +FT_INFO *maria_ft_init_boolean_search(MARIA_HA *, uint, uchar *, uint, + CHARSET_INFO *); + +extern const struct _ft_vft _ma_ft_vft_nlq; +int maria_ft_nlq_read_next(FT_INFO *, char *); +float maria_ft_nlq_find_relevance(FT_INFO *, uchar *, uint); +void maria_ft_nlq_close_search(FT_INFO *); +float maria_ft_nlq_get_relevance(FT_INFO *); +my_off_t maria_ft_nlq_get_docid(FT_INFO *); +void maria_ft_nlq_reinit_search(FT_INFO *); + +extern const struct _ft_vft _ma_ft_vft_boolean; +int maria_ft_boolean_read_next(FT_INFO *, char *); +float maria_ft_boolean_find_relevance(FT_INFO *, uchar *, uint); +void maria_ft_boolean_close_search(FT_INFO *); +float maria_ft_boolean_get_relevance(FT_INFO *); +my_off_t maria_ft_boolean_get_docid(FT_INFO *); +void maria_ft_boolean_reinit_search(FT_INFO *); +MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info); +extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info, + uint keynr, + uint paramnr); +extern void maria_ftparser_call_deinitializer(MARIA_HA *info); diff --git a/storage/maria/ma_fulltext.h b/storage/maria/ma_fulltext.h new file mode 100644 index 00000000..55deb942 --- /dev/null +++ b/storage/maria/ma_fulltext.h @@ -0,0 +1,32 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Sergei A. Golubchik, who has a shared copyright to this code */ + +/* some definitions for full-text indices */ + +#include "maria_def.h" +#include "ft_global.h" + +/* If HA_FT_MAXLEN is changed to 127 or over, it must be tested properly as + it may cause a different representation on disk for full text indexes +*/ +#define HA_FT_MAXLEN 126 + +int _ma_ft_cmp(MARIA_HA *, uint, const uchar *, const uchar *); +int _ma_ft_add(MARIA_HA *, uint, uchar *, const uchar *, my_off_t); +int _ma_ft_del(MARIA_HA *, uint, uchar *, const uchar *, my_off_t); + +my_bool _ma_ft_convert_to_ft2(MARIA_HA *, MARIA_KEY *); diff --git a/storage/maria/ma_info.c b/storage/maria/ma_info.c new file mode 100644 index 00000000..ddf92654 --- /dev/null +++ b/storage/maria/ma_info.c @@ -0,0 +1,228 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Return useful base information for an open table */ + +#include "maria_def.h" +#ifdef _WIN32 +#include <sys/stat.h> +#endif + + /* Get position to last record (info->cur_row.lastpos) */ + +MARIA_RECORD_POS maria_position(MARIA_HA *info) +{ + return info->cur_row.lastpos; +} + + +uint maria_max_key_length() +{ + /* _ma_max_key_length() minus 8 and HA_MAX_KEY_SEG*3, capped at MARIA_MAX_KEY_LENGTH */ uint tmp= (_ma_max_key_length() - 8 - HA_MAX_KEY_SEG*3); + return MY_MIN(MARIA_MAX_KEY_LENGTH, tmp); +} + +/* Get information about the table + + x->recpos is always set; if flag is exactly HA_STATUS_POS nothing else is + done. Otherwise the bits in flag decide which fields of *x are filled in: + HA_STATUS_VARIABLE (row and file counters), HA_STATUS_ERRKEY (errkey, + dup_key_pos), HA_STATUS_CONST (table constants and file names), + HA_STATUS_TIME (update_time from my_fstat() on the data file; update_time + is set to 0 if the bit is not set or my_fstat() fails) and HA_STATUS_AUTO + (next auto_increment value). Unless HA_STATUS_NO_LOCK is set, + _ma_readinfo() and fast_ma_writeinfo() are first called under + share->intern_lock. Always returns 0. 
+*/ +/* if flag == 2 one gets current info (no sync from database) */ + +int maria_status(MARIA_HA *info, register MARIA_INFO *x, uint flag) +{ + MY_STAT state; + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_status"); + DBUG_PRINT("info", ("records: %lld", info->state->records)); + + x->recpos= info->cur_row.lastpos; + if (flag == HA_STATUS_POS) + DBUG_RETURN(0); /* Compatible with ISAM */ + if (!(flag & HA_STATUS_NO_LOCK)) + { + mysql_mutex_lock(&share->intern_lock); + _ma_readinfo(info,F_RDLCK,0); + fast_ma_writeinfo(info); + mysql_mutex_unlock(&share->intern_lock); + } + if (flag & HA_STATUS_VARIABLE) + { + /* If table is locked, give versioned number otherwise last committed */ + if (info->lock_type == F_UNLCK) + x->records = share->state.state.records; + else + x->records = info->state->records; + x->deleted = share->state.state.del; + x->delete_length = share->state.state.empty; + x->data_file_length = share->state.state.data_file_length; + x->index_file_length= share->state.state.key_file_length; + + x->keys = share->state.header.keys; + x->check_time = share->state.check_time; + x->mean_reclength = x->records ? 
+ (ulong) ((x->data_file_length - x->delete_length) /x->records) : + (ulong) share->min_pack_length; + } + if (flag & HA_STATUS_ERRKEY) + { + x->errkey= info->errkey; + x->dup_key_pos= info->dup_key_pos; + } + if (flag & HA_STATUS_CONST) + { + x->reclength = share->base.reclength; + x->max_data_file_length=share->base.max_data_file_length; + x->max_index_file_length=info->s->base.max_key_file_length; + x->filenr = info->dfile.file; + x->options = share->options; + x->create_time=share->state.create_time; + x->reflength= maria_get_pointer_length(share->base.max_data_file_length, + maria_data_pointer_size); + x->record_offset= (info->s->data_file_type == STATIC_RECORD ? + share->base.pack_reclength: 0); + x->sortkey= -1; /* No clustering */ + x->rec_per_key = share->state.rec_per_key_part; + x->key_map = share->state.key_map; + x->data_file_name = share->data_file_name.str; + x->index_file_name = share->index_file_name.str; + x->data_file_type = share->data_file_type; + } + if ((flag & HA_STATUS_TIME) && !my_fstat(info->dfile.file, &state, MYF(0))) + x->update_time=state.st_mtime; + else + x->update_time=0; + if (flag & HA_STATUS_AUTO) + { + x->auto_increment= share->state.auto_increment+1; + if (!x->auto_increment) /* This shouldn't happen: the +1 wrapped to 0, report the max value */ + x->auto_increment= ~(ulonglong) 0; + } + DBUG_RETURN(0); +} + + +/* + Write a message to the user or the error log. + + SYNOPSIS + _ma_report_error() + file_name Name of table file (e.g. index_file_name). + errcode Error number. + flags Flags to my_error + + DESCRIPTION + This function supplies my_error() with a table name. Most error + messages need one. Since string arguments in error messages are limited + to 64 characters by convention, we ensure that in case of truncation, + that the end of the index file path is in the message. This contains + the most valuable information (the table name and the database name). 
+ + RETURN + void +*/ + +void _ma_report_error(int errcode, const LEX_STRING *name, myf flags) +{ + size_t length; + const char *file_name= name->str; + DBUG_ENTER("_ma_report_error"); + DBUG_PRINT("enter",("error: %d table: '%s'", errcode, file_name)); + + if ((length= name->length) > 64) + { + /* too long for the message: we first remove the directory */ + size_t dir_length= dirname_length(file_name); + file_name+= dir_length; + if ((length-= dir_length) > 64) + { + /* still too long, chop start of table name so that the last 64 characters remain */ + file_name+= length - 64; + } + } + my_printf_error(errcode, "Got error '%M' for '%s'", + flags, (int) errcode, file_name); + DBUG_VOID_RETURN; +} + + +/** + If standalone report all errors to the user + If run through the Aria handler, only report first error to the user + to not spam him + + The table is named by share->index_file_name, or by + share->unique_file_name if the former is empty. + + @param info Aria Handler + @param error Error code + @param write_to_log If set to 1, print the error to the log. This is only set + when a table was found to be crashed the first time +*/ + +void _ma_print_error(MARIA_HA *info, int error, my_bool write_to_log) +{ + DBUG_ENTER("_ma_print_error"); + DBUG_PRINT("error", ("error: %d log: %d", error, write_to_log)); + if (!info->error_count++ || !maria_in_ha_maria || write_to_log) + { + MARIA_SHARE *share= info->s; + _ma_report_error(error, + (share->index_file_name.length ? + &share->index_file_name : + &share->unique_file_name), + MYF(write_to_log ? ME_ERROR_LOG : 0)); + } + DBUG_VOID_RETURN; +} + + +/* + Handle a fatal error + + - Mark the table as crashed + - Print an error message, if we had not issued an error message before + that the table had been crashed. + - set my_errno to error + - If 'maria_assert_if_crashed_table is set, then assert. 
+*/ + +void _ma_set_fatal_error(MARIA_HA *info, int error) +{ + MARIA_SHARE *share= info->s; + _ma_print_error(info, error, + (share->state.changed & STATE_CRASHED_PRINTED) == 0); /* write to the log only if not yet printed for this share */ + maria_mark_crashed_share(share); + share->state.changed|= STATE_CRASHED_PRINTED; + my_errno= error; + DBUG_ASSERT(!maria_assert_if_crashed_table); +} + + +/* + Similar to _ma_set_fatal_error(), but only used from maria_open() where + we don't have an active handler object. The message is reported through + _ma_report_error() with ME_WARNING | ME_ERROR_LOG, and only if + STATE_CRASHED_PRINTED is not yet set for the share. + Here we don't set a fatal error (my_errno is left untouched) as we may + still want to do an automatic repair on the table +*/ + +void _ma_set_fatal_error_with_share(MARIA_SHARE *share, int error) +{ + DBUG_PRINT("error", ("error: %d", error)); + + if (!(share->state.changed & STATE_CRASHED_PRINTED)) + { + _ma_report_error(error, + (share->index_file_name.length ? + &share->index_file_name : + &share->unique_file_name), + MYF(ME_WARNING | ME_ERROR_LOG)); + } + maria_mark_crashed_share(share); + share->state.changed|= STATE_CRASHED_PRINTED; + DBUG_ASSERT(!maria_assert_if_crashed_table); +} diff --git a/storage/maria/ma_init.c b/storage/maria/ma_init.c new file mode 100644 index 00000000..14c4c996 --- /dev/null +++ b/storage/maria/ma_init.c @@ -0,0 +1,186 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Initialize a maria-database */ + +#include "maria_def.h" +#include <ft_global.h> +#include "ma_blockrec.h" +#include "trnman_public.h" +#include "ma_checkpoint.h" +#include <hash.h> + +void history_state_free(MARIA_STATE_HISTORY_CLOSED *closed_history) +{ + MARIA_STATE_HISTORY *history, *next; + + /* + Free all active history, then the closed_history object itself. + In case of maria_open() this list should be empty as the history is moved + to handler->share. + */ + for (history= closed_history->state_history; history ; history= next) + { + next= history->next; + my_free(history); + } + my_free(closed_history); +} + + +static int dummy_maria_create_trn_hook(MARIA_HA *info __attribute__((unused))) +{ + return 0; /* default hook installed by maria_init(): does nothing */ +} + +/* + Initialize maria + + SYNOPSIS + maria_init() + + NOTES + The mutex, the block record data and the two hooks are set up only when + maria_inited is not yet set. maria_stored_state is initialized on every + call. + NOTE(review): confirm that repeated calls without maria_end() in between + are not expected, as the hash would then be initialized again. 
+ + TODO + Open log files and do recovery if needed + + RETURN + 0 ok (the code below always returns 0) + # error number +*/ + +int maria_init(void) +{ + DBUG_ASSERT(maria_block_size && + maria_block_size % MARIA_MIN_KEY_BLOCK_LENGTH == 0); + if (!maria_inited) + { + maria_inited= TRUE; + mysql_mutex_init(key_THR_LOCK_maria, &THR_LOCK_maria, MY_MUTEX_INIT_SLOW); + _ma_init_block_record_data(); + trnman_end_trans_hook= _ma_trnman_end_trans_hook; + maria_create_trn_hook= dummy_maria_create_trn_hook; + } + my_hash_init(PSI_INSTRUMENT_ME, &maria_stored_state, &my_charset_bin, 32, 0, + sizeof(LSN), 0, (my_hash_free_key) history_state_free, 0); + DBUG_PRINT("info",("dummy_transaction_object: %p", &dummy_transaction_object)); + return 0; +} + + +void maria_end(void) +{ + /* Undo maria_init(); does nothing unless maria_inited is set */ DBUG_ENTER("maria_end"); + if (maria_inited) + { + TrID trid; + maria_inited= maria_multi_threaded= FALSE; + ft_free_stopwords(); + ma_checkpoint_end(); + if (translog_status == TRANSLOG_OK && !aria_readonly) + { + translog_soft_sync_end(); + translog_sync(); + } + if ((trid= trnman_get_max_trid()) > 
max_trid_in_control_file && + !aria_readonly) + { + /* + Store max transaction id into control file, in case logs are removed + by user, or maria_chk wants to check tables (it cannot access max trid + from the log, as it cannot process REDOs). + The result of the write is ignored. + */ + (void)ma_control_file_write_and_force(last_checkpoint_lsn, last_logno, + trid, recovery_failures); + } + trnman_destroy(); + if (translog_status == TRANSLOG_OK || translog_status == TRANSLOG_READONLY) + translog_destroy(); + end_pagecache(maria_log_pagecache, TRUE); + end_pagecache(maria_pagecache, TRUE); + ma_control_file_end(); + mysql_mutex_destroy(&THR_LOCK_maria); + my_hash_free(&maria_stored_state); + } + DBUG_VOID_RETURN; +} + +/** + Upgrade from older Aria versions: + + - In MariaDB 5.1, the name of the control file and log files had the + 'maria' prefix, now they have the 'aria' prefix. + + Nothing is done unless the file "maria_log_control" exists in + maria_data_root. Log files are recognized by the name "maria_log." + followed by exactly eight digits. + + @return: 0 ok + 1 error + +*/ + +my_bool maria_upgrade() +{ + char name[FN_REFLEN], new_name[FN_REFLEN]; + DBUG_ENTER("maria_upgrade"); + + fn_format(name, "maria_log_control", maria_data_root, "", MYF(MY_WME)); + + if (!my_access(name,F_OK)) + { + /* + Old style control file found; Rename the control file and the log files. + We start by renaming all log files, so that if we get a crash + we will continue from where we left. 
+ */ + size_t i; + MY_DIR *dir= my_dir(maria_data_root, MYF(MY_WME)); + if (!dir) + DBUG_RETURN(1); + + my_message(HA_ERR_INITIALIZATION, + "Found old style Maria log files; " + "Converting them to Aria names", + MYF(ME_NOTE)); + + for (i= 0; i < dir->number_of_files; i++) + { + const char *file= dir->dir_entry[i].name; + if (strncmp(file, "maria_log.", 10) == 0 && + file[10] >= '0' && file[10] <= '9' && + file[11] >= '0' && file[11] <= '9' && + file[12] >= '0' && file[12] <= '9' && + file[13] >= '0' && file[13] <= '9' && + file[14] >= '0' && file[14] <= '9' && + file[15] >= '0' && file[15] <= '9' && + file[16] >= '0' && file[16] <= '9' && + file[17] >= '0' && file[17] <= '9' && + file[18] == '\0') + { + /* Remove the 'm' in 'maria': file+1 is the name without its first character */ + char old_logname[FN_REFLEN], new_logname[FN_REFLEN]; + fn_format(old_logname, file, maria_data_root, "", MYF(0)); + fn_format(new_logname, file+1, maria_data_root, "", MYF(0)); + if (mysql_file_rename(key_file_translog, old_logname, + new_logname, MYF(MY_WME))) + { + my_dirend(dir); + DBUG_RETURN(1); + } + } + } + my_dirend(dir); + + fn_format(new_name, CONTROL_FILE_BASE_NAME, maria_data_root, "", MYF(0)); + if (mysql_file_rename(key_file_control, name, new_name, MYF(MY_WME))) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} diff --git a/storage/maria/ma_key.c b/storage/maria/ma_key.c new file mode 100644 index 00000000..d47e8cf7 --- /dev/null +++ b/storage/maria/ma_key.c @@ -0,0 +1,788 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2020, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Functions to handle keys */ + +#include "maria_def.h" +#include "m_ctype.h" +#include "ma_sp_defs.h" +#include "ma_blockrec.h" /* For ROW_FLAG_TRANSID */ +#include "trnman.h" +#ifdef HAVE_IEEEFP_H +#include <ieeefp.h> +#endif + +#define CHECK_KEYS /* Enable safety checks */ + +static int _ma_put_key_in_record(MARIA_HA *info, uint keynr, + my_bool unpack_blobs, uchar *record); + +#define FIX_LENGTH(cs, pos, length, char_length) \ + do { \ + if (length > char_length) \ + char_length= (uint) my_ci_charpos(cs, (const char *) pos, \ + (const char *) pos+length, \ + char_length); \ + set_if_smaller(char_length,length); \ + } while(0) + + +/** + Store trid in a packed format as part of a key + + @fn transid_store_packed + @param info Maria handler + @param to End of key to which we should store a packed transid + @param trid Trid to be stored + + @return Number of bytes stored at 'to': 1, or the length of the trid + in bytes + 1 for the length prefix + + @notes + + Keys that have a transid have the lowest bit set in the last byte of the + key. This function sets this bit for the key. + + Trid is max 6 bytes long + + First the trid is converted to a smaller number by using + trid= trid - create_trid. + The trid is then shifted up one bit so that we can use the + lowest bit as a marker if it's followed by another trid. + + Trid is then stored as follows: + + if trid < 256-12 + one byte + else + one byte prefix length_of_trid_in_bytes + 249 followed by data + in high-byte-first order + + Prefix bytes 244 to 249 are reserved for negative transid, that can be used + when we pack transid relative to each other on a key block. 
+ + We have to store transid in high-byte-first order so that we can compare + them unpacked byte per byte and as soon as we find a difference we know + which is smaller. 
+ + For example, assuming we have the following data: + + key_data: 1 (4 byte integer) + pointer_to_row: 2 << 8 + 3 = 515 (page 2, row 3) + table_create_transid 1000 Defined at create table time and + stored in table definition + transid 1010 Transaction that created row + delete_transid 2011 Transaction that deleted row + + In addition we assume the table is created with a data pointer length + of 4 bytes (this is automatically calculated based on the medium + length of rows and the given max number of rows) + + The binary data for the key would then look like this in hex: + + 00 00 00 01 Key data (1 stored high byte first) + 00 00 04 07 (515 << 1) + 1 ; The last 1 is marker that key cont. + 15 ((1010-1000) << 1) + 1 ; The last 1 is marker that key cont. + FB 07 E6 Length byte (FB = 249 + 2 means 2 bytes) and + ((2011 - 1000) << 1) = 07 E6 +*/ + +uint transid_store_packed(MARIA_HA *info, uchar *to, ulonglong trid) +{ + uchar *start; + uint length; + uchar buff[8]; + DBUG_ASSERT(trid < (1LL << (MARIA_MAX_PACK_TRANSID_SIZE*8))); + DBUG_ASSERT(trid >= info->s->state.create_trid); + + trid= (trid - info->s->state.create_trid) << 1; + + /* Mark that key contains transid (lowest bit of the byte before 'to') */ + to[-1]|= 1; + + if (trid < MARIA_MIN_TRANSID_PACK_OFFSET) + { + to[0]= (uchar) trid; + return 1; + } + start= to; + + /* store things in low-byte-first-order in buff */ + to= buff; + do + { + *to++= (uchar) trid; + trid= trid>>8; + } while (trid); + + length= (uint) (to - buff); + /* Store length prefix */ + start[0]= (uchar) (length + MARIA_TRANSID_PACK_OFFSET); + start++; + /* Copy things in high-byte-first order to output buffer */ + do + { + *start++= *--to; + } while (to != buff); + return length+1; +} + + +/** + Read packed transid + + @fn transid_get_packed + @param share Maria share; share->state.create_trid is added back to + the stored value + @param from Transid is stored here + + @return The unpacked transid + + See transid_store_packed() for how 
transid is packed + +*/ + +ulonglong transid_get_packed(MARIA_SHARE *share, const uchar *from) +{ + ulonglong value; + uint length; + + if 
(from[0] < MARIA_MIN_TRANSID_PACK_OFFSET) + value= (ulonglong) from[0]; + else + { + value= 0; + for (length= (uint) (from[0] - MARIA_TRANSID_PACK_OFFSET), + value= (ulonglong) from[1], from+=2; + --length ; + from++) + value= (value << 8) + ((ulonglong) *from); + } + return (value >> 1) + share->state.create_trid; +} + + +/* + Make a normal (not spatial or fulltext) intern key from a record + + SYNOPSIS + _ma_make_key() + info MARIA handler + int_key Store created key here + keynr key number + key Buffer used to store key data + record Record + filepos Position to record in the data file + trid Transaction id. If it is not 0 and _ma_have_versioning(info) + is true, it is stored packed after the row pointer and + SEARCH_USER_KEY_HAS_TRANSID is set in int_key->flag + + NOTES + This is used to generate keys from the record on insert, update and delete + + RETURN + int_key +*/ + +MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY *int_key, uint keynr, + uchar *key, const uchar *record, + MARIA_RECORD_POS filepos, ulonglong trid) +{ + const uchar *pos; + reg1 HA_KEYSEG *keyseg; + my_bool is_ft; + DBUG_ENTER("_ma_make_key"); + + int_key->data= key; + int_key->flag= 0; /* Always return full key */ + int_key->keyinfo= info->s->keyinfo + keynr; + + is_ft= int_key->keyinfo->flag & HA_FULLTEXT; + for (keyseg= int_key->keyinfo->seg ; keyseg->type ;keyseg++) + { + enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type; + uint length=keyseg->length; + uint char_length; + CHARSET_INFO *cs=keyseg->charset; + + if (keyseg->null_bit) + { + if (record[keyseg->null_pos] & keyseg->null_bit) + { + *key++= 0; /* NULL in key */ + continue; + } + *key++=1; /* Not NULL */ + } + + char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? 
length/cs->mbmaxlen : + length); + + pos= record+keyseg->start; + if (type == HA_KEYTYPE_BIT) + { + if (keyseg->bit_length) + { + uchar bits= get_rec_bits(record + keyseg->bit_pos, + keyseg->bit_start, keyseg->bit_length); + *key++= (char) bits; + length--; + } + memcpy(key, pos, length); + key+= length; + continue; + } + if (keyseg->flag & HA_SPACE_PACK) + { + if (type != HA_KEYTYPE_NUM) + { + length= (uint) my_ci_lengthsp(cs, (const char*)pos, length); + } + else + { + /* numeric: skip leading spaces */ const uchar *end= pos + length; + while (pos < end && pos[0] == ' ') + pos++; + length= (uint) (end-pos); + } + FIX_LENGTH(cs, pos, length, char_length); + store_key_length_inc(key,char_length); + memcpy(key, pos, (size_t) char_length); + key+=char_length; + continue; + } + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= (keyseg->bit_start == 1 ? 1 : 2); + uint tmp_length= (pack_length == 1 ? (uint) *pos : + uint2korr(pos)); + pos+= pack_length; /* Skip VARCHAR length */ + set_if_smaller(length,tmp_length); + FIX_LENGTH(cs, pos, length, char_length); + store_key_length_inc(key,char_length); + memcpy(key,pos,(size_t) char_length); + key+= char_length; + continue; + } + else if (keyseg->flag & HA_BLOB_PART) + { + uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos); + uchar *blob_pos; + memcpy(&blob_pos, pos+keyseg->bit_start,sizeof(char*)); + set_if_smaller(length,tmp_length); + FIX_LENGTH(cs, blob_pos, length, char_length); + store_key_length_inc(key,char_length); + memcpy(key, blob_pos, (size_t) char_length); + key+= char_length; + continue; + } + else if (keyseg->flag & HA_SWAP_KEY) + { /* Numerical column: bytes are copied in reverse order */ + if (type == HA_KEYTYPE_FLOAT) + { + float nr; + float4get(nr,pos); + if (isnan(nr)) + { + /* Replace NAN with zero */ + bzero(key,length); + key+=length; + continue; + } + } + else if (type == HA_KEYTYPE_DOUBLE) + { + double nr; + float8get(nr,pos); + if (isnan(nr)) + { + /* Replace NAN with zero */ 
bzero(key,length); + key+=length; + continue; + } + } + pos+=length; + while (length--) + { + 
*key++ = *--pos; + } + continue; + } + FIX_LENGTH(cs, pos, length, char_length); + memcpy(key, pos, char_length); + if (length > char_length) + my_ci_fill(cs, (char*) key+char_length, length-char_length, ' '); + key+= length; + } + _ma_dpointer(info->s, key, filepos); /* filepos goes after the key data */ + int_key->data_length= (uint)(key - int_key->data); + int_key->ref_length= info->s->rec_reflength; + int_key->flag= 0; + if (_ma_have_versioning(info) && trid) + { + int_key->ref_length+= transid_store_packed(info, + key + int_key->ref_length, + (TrID) trid); + int_key->flag|= SEARCH_USER_KEY_HAS_TRANSID; + } + + DBUG_PRINT("exit",("keynr: %d",keynr)); + DBUG_DUMP_KEY("key", int_key); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, int_key);); + DBUG_RETURN(int_key); +} /* _ma_make_key */ + + +/* + Pack a key to intern format from given format (c_rkey) + + SYNOPSIS + _ma_pack_key() + info MARIA handler + int_key Store key here + keynr key number + key Buffer for key data + old Original not packed key + keypart_map bitmap of used keyparts + last_used_keyseg out parameter. 
May be NULL + + RETURN + int_key + + last_use_keyseg Store pointer to the keyseg after the last used one +*/ + +MARIA_KEY *_ma_pack_key(register MARIA_HA *info, MARIA_KEY *int_key, + uint keynr, uchar *key, + const uchar *old, key_part_map keypart_map, + HA_KEYSEG **last_used_keyseg) +{ + HA_KEYSEG *keyseg; + my_bool is_ft; + DBUG_ENTER("_ma_pack_key"); + + int_key->data= key; + int_key->keyinfo= info->s->keyinfo + keynr; + + /* "one part" rtree key is 2*SPDIMS part key in Maria */ + if (int_key->keyinfo->key_alg == HA_KEY_ALG_RTREE) + keypart_map= (((key_part_map)1) << (2*SPDIMS)) - 1; + + /* only key prefixes are supported: keypart_map must have the form 2^n-1 */ + DBUG_ASSERT(((keypart_map+1) & keypart_map) == 0); + + is_ft= int_key->keyinfo->flag & HA_FULLTEXT; + for (keyseg=int_key->keyinfo->seg ; keyseg->type && keypart_map; + old+= keyseg->length, keyseg++) + { + enum ha_base_keytype type= (enum ha_base_keytype) keyseg->type; + uint length= keyseg->length; + uint char_length; + const uchar *pos; + CHARSET_INFO *cs=keyseg->charset; + + keypart_map>>= 1; + if (keyseg->null_bit) + { + if (!(*key++= (char) 1-*old++)) /* Copy null marker (inverted: key byte is 1 - *old) */ + { + if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)) + old+= 2; + continue; /* Found NULL */ + } + } + char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? 
length/cs->mbmaxlen : + length); + pos= old; + if (keyseg->flag & HA_SPACE_PACK) + { + const uchar *end= pos + length; + if (type == HA_KEYTYPE_NUM) + { + /* numeric: skip leading spaces */ while (pos < end && pos[0] == ' ') + pos++; + } + else if (type != HA_KEYTYPE_BINARY) + { + /* not binary: strip trailing spaces */ while (end > pos && end[-1] == ' ') + end--; + } + length=(uint) (end-pos); + FIX_LENGTH(cs, pos, length, char_length); + store_key_length_inc(key,char_length); + memcpy(key,pos,(size_t) char_length); + key+= char_length; + continue; + } + else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)) + { + /* Length of key-part used with maria_rkey() is always 2 bytes */ + uint tmp_length=uint2korr(pos); + pos+=2; + set_if_smaller(length,tmp_length); /* Safety */ + FIX_LENGTH(cs, pos, length, char_length); + store_key_length_inc(key,char_length); + old+=2; /* Skip length */ + memcpy(key, pos,(size_t) char_length); + key+= char_length; + continue; + } + else if (keyseg->flag & HA_SWAP_KEY) + { /* Numerical column: bytes are copied in reverse order */ + pos+=length; + while (length--) + *key++ = *--pos; + continue; + } + FIX_LENGTH(cs, pos, length, char_length); + memcpy(key, pos, char_length); + if (length > char_length) + my_ci_fill(cs, (char*) key+char_length, length-char_length, ' '); + key+= length; + } + if (last_used_keyseg) + *last_used_keyseg= keyseg; + + /* set flag to SEARCH_PART_KEY if we are not using all key parts */ + int_key->flag= keyseg->type ? 
SEARCH_PART_KEY : 0; + int_key->ref_length= 0; + int_key->data_length= (uint)(key - int_key->data); + + DBUG_PRINT("exit", ("length: %u", int_key->data_length)); + DBUG_RETURN(int_key); +} /* _ma_pack_key */ + + +/** + Copy a key + + Copies from->data_length + from->ref_length bytes into to->data, which + must already point to a buffer of at least that size, and then copies + keyinfo, data_length, ref_length and flag. +*/ + +void _ma_copy_key(MARIA_KEY *to, const MARIA_KEY *from) +{ + memcpy(to->data, from->data, from->data_length + from->ref_length); + to->keyinfo= from->keyinfo; + to->data_length= from->data_length; + to->ref_length= from->ref_length; + to->flag= from->flag; +} + + +/* + Store found key in record + + SYNOPSIS + _ma_put_key_in_record() + info MARIA handler + keynr Key number that was used + unpack_blobs TRUE <=> Unpack blob columns + FALSE <=> Skip them. This is used by index condition + pushdown check function + record Store key here + + Last read key is in info->last_key + + NOTES + Used when only-keyread is wanted + + RETURN + 0 ok + 1 error (a key part does not fit the key definition or runs past the + end of the key; only checked when CHECK_KEYS is defined) +*/ + +static int _ma_put_key_in_record(register MARIA_HA *info, uint keynr, + my_bool unpack_blobs, uchar *record) +{ + reg2 uchar *key; + uchar *pos,*key_end; + reg1 HA_KEYSEG *keyseg; + uchar *blob_ptr; + DBUG_ENTER("_ma_put_key_in_record"); + + blob_ptr= info->lastkey_buff2; /* Place to put blob parts */ + key= info->last_key.data; /* Key that was read */ + key_end= key + info->last_key.data_length; + for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type ;keyseg++) + { + if (keyseg->null_bit) + { + if (!*key++) + { + record[keyseg->null_pos]|= keyseg->null_bit; + continue; + } + record[keyseg->null_pos]&= ~keyseg->null_bit; + } + if (keyseg->type == HA_KEYTYPE_BIT) + { + uint length= keyseg->length; + + if (keyseg->bit_length) + { + uchar bits= *key++; + set_rec_bits(bits, record + 
keyseg->bit_pos, keyseg->bit_start, + keyseg->bit_length); + length--; + } + else + { + clr_rec_bits(record + keyseg->bit_pos, keyseg->bit_start, + keyseg->bit_length); + } + memcpy(record + keyseg->start, key, length); + key+= length; + continue; + } + if (keyseg->flag & HA_SPACE_PACK) + { + uint length; + 
get_key_length(length,key); +#ifdef CHECK_KEYS + if (length > keyseg->length || key+length > key_end) + goto err; +#endif + pos= record+keyseg->start; + if (keyseg->type != (int) HA_KEYTYPE_NUM) + { + memcpy(pos,key,(size_t) length); + my_ci_fill(keyseg->charset, (char*) pos + length, + keyseg->length - length, + ' '); + } + else + { + /* numeric: pad with spaces in front of the value */ bfill(pos,keyseg->length-length,' '); + memcpy(pos+keyseg->length-length,key,(size_t) length); + } + key+=length; + continue; + } + + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + uint length; + get_key_length(length,key); +#ifdef CHECK_KEYS + if (length > keyseg->length || key+length > key_end) + goto err; +#endif + /* Store key length (1 byte if keyseg->bit_start == 1, else 2 bytes) */ + if (keyseg->bit_start == 1) + *(uchar*) (record+keyseg->start)= (uchar) length; + else + int2store(record+keyseg->start, length); + /* And key data */ + memcpy(record+keyseg->start + keyseg->bit_start, key, length); + key+= length; + } + else if (keyseg->flag & HA_BLOB_PART) + { + uint length; + get_key_length(length,key); +#ifdef CHECK_KEYS + if (length > keyseg->length || key+length > key_end) + goto err; +#endif + if (unpack_blobs) + { + memcpy(record+keyseg->start+keyseg->bit_start, + &blob_ptr, sizeof(char*)); + memcpy(blob_ptr,key,length); + blob_ptr+=length; + + /* The above changed info->lastkey_buff2. Inform maria_rnext_same(). 
*/ + info->update&= ~HA_STATE_RNEXT_SAME; + + _ma_store_blob_length(record+keyseg->start, + (uint) keyseg->bit_start,length); + } + key+=length; + } + else if (keyseg->flag & HA_SWAP_KEY) + { + /* copy the bytes into the record in reverse order */ uchar *to= record+keyseg->start+keyseg->length; + uchar *end= key+keyseg->length; +#ifdef CHECK_KEYS + if (end > key_end) + goto err; +#endif + do + { + *--to= *key++; + } while (key != end); + continue; + } + else + { +#ifdef CHECK_KEYS + if (key+keyseg->length > key_end) + goto err; +#endif + memcpy(record+keyseg->start, key, (size_t) keyseg->length); + key+= keyseg->length; + } + } + DBUG_RETURN(0); + +err: + DBUG_PRINT("info",("error")); + DBUG_RETURN(1); /* Crashed row */ +} /* _ma_put_key_in_record */ + + + /* Here when key reads are used: fill buf from the last read key. + Returns 0 on success. Returns -1 if filepos is HA_OFFSET_ERROR, if no + index is active (my_errno is set to HA_ERR_WRONG_INDEX) or if the key + could not be stored in the record (HA_ERR_CRASHED is set as fatal + error) */ + +int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos) +{ + fast_ma_writeinfo(info); + if (filepos != HA_OFFSET_ERROR) + { + if (info->lastinx >= 0) + { /* Read only key */ + if (_ma_put_key_in_record(info, (uint)info->lastinx, TRUE, buf)) + { + _ma_set_fatal_error(info, HA_ERR_CRASHED); + return -1; + } + info->update|= HA_STATE_AKTIV; /* We should find a record */ + return 0; + } + my_errno=HA_ERR_WRONG_INDEX; + } + return(-1); /* Wrong data to read */ +} + + + +/* + Save current key tuple to record and call index condition check function + + SYNOPSIS + ma_check_index_cond() + info MyISAM handler + keynr Index we're running a scan on + record Record buffer to use (it is assumed that index check function + will look for column values there) + + RETURN + CHECK_ERROR Error ; my_errno set to HA_ERR_CRASHED + CHECK_NEG Index condition is not satisfied, continue scanning + CHECK_POS Index condition is 
satisfied + CHECK_OUT_OF_RANGE Index condition is not satisfied, end the scan. + my_errno set to HA_ERR_END_OF_FILE + + info->cur_row.lastpos is set to HA_OFFSET_ERROR in case of CHECK_ERROR or + CHECK_OUT_OF_RANGE to indicate that we don't have any active row. 
+*/ + +check_result_t ma_check_index_cond(register MARIA_HA *info, uint keynr, + uchar *record) +{ + check_result_t res= CHECK_POS; + if (info->index_cond_func) + { + if (_ma_put_key_in_record(info, keynr, FALSE, record)) + { + /* Impossible case; Can only happen if bug in code */ + _ma_print_error(info, HA_ERR_CRASHED, 0); + info->cur_row.lastpos= HA_OFFSET_ERROR; /* No active record */ + my_errno= HA_ERR_CRASHED; + res= CHECK_ERROR; + } + else if ((res= info->index_cond_func(info->index_cond_func_arg)) == + CHECK_OUT_OF_RANGE) + { + /* We got beyond the end of scanned range */ + info->cur_row.lastpos= HA_OFFSET_ERROR; /* No active record */ + my_errno= HA_ERR_END_OF_FILE; + } + } + return res; +} + + +/* + Retrieve auto_increment info + + SYNOPSIS + retrieve_auto_increment() + key Auto-increment key + key_type Key's type + + NOTE + 'key' should in "record" format, that is, how it is packed in a record + (this matters with HA_SWAP_KEY). + + IMPLEMENTATION + For signed columns we don't retrieve the auto increment value if it's + less than zero. 
+*/ + +ulonglong ma_retrieve_auto_increment(const uchar *key, uint8 key_type) +{ + ulonglong value= 0; /* Store unsigned values here */ + longlong s_value= 0; /* Store signed values here */ + + switch (key_type) { + case HA_KEYTYPE_INT8: + s_value= (longlong) *(const signed char*) key; + break; + case HA_KEYTYPE_BINARY: + value=(ulonglong) *key; + break; + case HA_KEYTYPE_SHORT_INT: + s_value= (longlong) sint2korr(key); + break; + case HA_KEYTYPE_USHORT_INT: + value=(ulonglong) uint2korr(key); + break; + case HA_KEYTYPE_LONG_INT: + s_value= (longlong) sint4korr(key); + break; + case HA_KEYTYPE_ULONG_INT: + value=(ulonglong) uint4korr(key); + break; + case HA_KEYTYPE_INT24: + s_value= (longlong) sint3korr(key); + break; + case HA_KEYTYPE_UINT24: + value=(ulonglong) uint3korr(key); + break; + case HA_KEYTYPE_FLOAT: /* This shouldn't be used */ + { + float f_1; + float4get(f_1,key); + /* Ignore negative values */ + value = (f_1 < (float) 0.0) ? 0 : (ulonglong) f_1; + break; + } + case HA_KEYTYPE_DOUBLE: /* This shouldn't be used */ + { + double f_1; + float8get(f_1,key); + /* Ignore negative values */ + value = (f_1 < 0.0) ? 0 : (ulonglong) f_1; + break; + } + case HA_KEYTYPE_LONGLONG: + s_value= sint8korr(key); + break; + case HA_KEYTYPE_ULONGLONG: + value= uint8korr(key); + break; + default: + DBUG_ASSERT(0); + value=0; /* Error */ + break; + } + + /* + The following code works becasue if s_value < 0 then value is 0 + and if s_value == 0 then value will contain either s_value or the + correct value. + */ + return (s_value > 0) ? 
(ulonglong) s_value : value; +} diff --git a/storage/maria/ma_key_recover.c b/storage/maria/ma_key_recover.c new file mode 100644 index 00000000..acec592b --- /dev/null +++ b/storage/maria/ma_key_recover.c @@ -0,0 +1,1441 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Redo of index */ + +#include "maria_def.h" +#include "ma_blockrec.h" +#include "trnman.h" +#include "ma_key_recover.h" +#include "ma_rt_index.h" + +/**************************************************************************** + Some helper functions used both by key page loggin and block page loggin +****************************************************************************/ + +/** + @brief Unpin all pinned pages + + @fn _ma_unpin_all_pages() + @param info Maria handler + @param undo_lsn LSN for undo pages. LSN_IMPOSSIBLE if we shouldn't write + undo (like on duplicate key errors) + + info->pinned_pages is the list of pages to unpin. Each member of the list + must have its 'changed' saying if the page was changed or not. + + @note + We unpin pages in the reverse order as they where pinned; This is not + necessary now, but may simplify things in the future. 
+ + @return + @retval 0 ok + @retval 1 error (fatal disk error) +*/ + +void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn) +{ + MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*) + dynamic_array_ptr(&info->pinned_pages, 0)); + MARIA_PINNED_PAGE *pinned_page= page_link + info->pinned_pages.elements; + DBUG_ENTER("_ma_unpin_all_pages"); + DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn)); + + if (!info->s->now_transactional) + DBUG_ASSERT(undo_lsn == LSN_IMPOSSIBLE || maria_in_recovery); + + while (pinned_page-- != page_link) + { + /* + Note this assert fails if we got a disk error or the record file + is corrupted, which means we should have this enabled only in debug + builds. + */ +#ifdef EXTRA_DEBUG + DBUG_ASSERT((!pinned_page->changed || + undo_lsn != LSN_IMPOSSIBLE || !info->s->now_transactional) || + (info->s->state.changed & STATE_CRASHED_FLAGS)); +#endif + pagecache_unlock_by_link(info->s->pagecache, pinned_page->link, + pinned_page->unlock, PAGECACHE_UNPIN, + info->trn->rec_lsn, undo_lsn, + pinned_page->changed, FALSE); + } + + info->pinned_pages.elements= 0; + DBUG_VOID_RETURN; +} + + +my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn, + enum translog_record_type undo_type, + my_bool store_checksum, ha_checksum checksum, + LSN *res_lsn, void *extra_msg) +{ + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + CLR_TYPE_STORE_SIZE + + HA_CHECKSUM_STORE_SIZE+ KEY_NR_STORE_SIZE + PAGE_STORE_SIZE]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + struct st_msg_to_write_hook_for_clr_end msg; + my_bool res; + DBUG_ENTER("_ma_write_clr"); + + /* undo_lsn must be first for compression to work */ + lsn_store(log_data, undo_lsn); + clr_type_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, undo_type); + log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + CLR_TYPE_STORE_SIZE; + + /* Extra_msg is handled in write_hook_for_clr_end() */ + msg.undone_record_type= undo_type; + msg.previous_undo_lsn= undo_lsn; + msg.extra_msg= 
extra_msg; + msg.checksum_delta= 0; + + if (store_checksum) + { + msg.checksum_delta= checksum; + ha_checksum_store(log_pos, checksum); + log_pos+= HA_CHECKSUM_STORE_SIZE; + } + else if (undo_type == LOGREC_UNDO_KEY_INSERT_WITH_ROOT || + undo_type == LOGREC_UNDO_KEY_DELETE_WITH_ROOT) + { + /* Key root changed. Store new key root */ + struct st_msg_to_write_hook_for_undo_key *undo_msg= extra_msg; + pgcache_page_no_t page; + key_nr_store(log_pos, undo_msg->keynr); + page= (undo_msg->value == HA_OFFSET_ERROR ? IMPOSSIBLE_PAGE_NO : + undo_msg->value / info->s->block_size); + page_store(log_pos + KEY_NR_STORE_SIZE, page); + log_pos+= KEY_NR_STORE_SIZE + PAGE_STORE_SIZE; + } + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data); + + + /* + We need intern_lock mutex for calling _ma_state_info_write in the trigger. + We do it here to have the same sequence of mutexes locking everywhere + (first intern_lock then transactional log buffer lock) + */ + if (undo_type == LOGREC_UNDO_BULK_INSERT) + mysql_mutex_lock(&info->s->intern_lock); + + res= translog_write_record(res_lsn, LOGREC_CLR_END, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + 0].length, + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data + LSN_STORE_SIZE, &msg); + if (undo_type == LOGREC_UNDO_BULK_INSERT) + mysql_mutex_unlock(&info->s->intern_lock); + DBUG_RETURN(res); +} + + +/** + @brief Sets transaction's undo_lsn, first_undo_lsn if needed + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_clr_end(enum translog_record_type type + __attribute__ ((unused)), + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn __attribute__ ((unused)), + void *hook_arg) +{ + MARIA_SHARE *share= tbl_info->s; + struct st_msg_to_write_hook_for_clr_end *msg= + (struct st_msg_to_write_hook_for_clr_end *)hook_arg; + my_bool error= FALSE; + DBUG_ENTER("write_hook_for_clr_end"); + DBUG_ASSERT(trn->trid != 0); + 
trn->undo_lsn= msg->previous_undo_lsn; + + switch (msg->undone_record_type) { + case LOGREC_UNDO_ROW_DELETE: + share->state.state.records++; + share->state.state.checksum+= msg->checksum_delta; + break; + case LOGREC_UNDO_ROW_INSERT: + share->state.state.records--; + share->state.state.checksum+= msg->checksum_delta; + break; + case LOGREC_UNDO_ROW_UPDATE: + share->state.state.checksum+= msg->checksum_delta; + break; + case LOGREC_UNDO_KEY_INSERT_WITH_ROOT: + case LOGREC_UNDO_KEY_DELETE_WITH_ROOT: + { + /* Update key root */ + struct st_msg_to_write_hook_for_undo_key *extra_msg= + (struct st_msg_to_write_hook_for_undo_key *) msg->extra_msg; + *extra_msg->root= extra_msg->value; + break; + } + case LOGREC_UNDO_KEY_INSERT: + case LOGREC_UNDO_KEY_DELETE: + break; + case LOGREC_UNDO_BULK_INSERT: + mysql_mutex_assert_owner(&share->intern_lock); + error= (maria_enable_indexes(tbl_info) || + /* we enabled indices, need '2' below */ + _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_FULL_INFO)); + /* no need for _ma_reset_status(): REDO_DELETE_ALL is just before us */ + break; + default: + DBUG_ASSERT(0); + } + if (trn->undo_lsn == LSN_IMPOSSIBLE) /* has fully rolled back */ + trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn); + DBUG_RETURN(error); +} + + +/** + @brief write hook for undo key +*/ + +my_bool write_hook_for_undo_key(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + struct st_msg_to_write_hook_for_undo_key *msg= + (struct st_msg_to_write_hook_for_undo_key *) hook_arg; + + *msg->root= msg->value; + _ma_fast_unlock_key_del(tbl_info); + return write_hook_for_undo(type, trn, tbl_info, lsn, 0); +} + + +/** + Updates "auto_increment" and calls the generic UNDO_KEY hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_key_insert(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + 
struct st_msg_to_write_hook_for_undo_key *msg= + (struct st_msg_to_write_hook_for_undo_key *) hook_arg; + MARIA_SHARE *share= tbl_info->s; + if (msg->auto_increment > 0) + { + /* + Only reason to set it here is to have a mutex protect from checkpoint + reading at the same time (would see a corrupted value). + + The purpose of the following code is to set auto_increment if the row + has a with auto_increment value higher than the current one. We also + want to be able to restore the old value, in case of rollback, + if no one else has tried to set the value. + + The logic used is that we only restore the auto_increment value if + tbl_info->last_auto_increment == share->last_auto_increment + when it's time to do the rollback. + */ + DBUG_PRINT("info",("auto_inc: %lu new auto_inc: %lu", + (ulong)share->state.auto_increment, + (ulong)msg->auto_increment)); + if (share->state.auto_increment < msg->auto_increment) + { + /* Remember the original value, in case of rollback */ + tbl_info->last_auto_increment= share->last_auto_increment= + share->state.auto_increment; + share->state.auto_increment= msg->auto_increment; + } + else + { + /* + If the current value would have affected the original auto_increment + value, set it to an impossible value so that it's not restored on + rollback + */ + if (msg->auto_increment > share->last_auto_increment) + share->last_auto_increment= ~(ulonglong) 0; + } + } + return write_hook_for_undo_key(type, trn, tbl_info, lsn, hook_arg); +} + + +/** + @brief Updates "share->auto_increment" in case of abort and calls + generic UNDO_KEY hook + + @return Operation status, always 0 (success) +*/ + +my_bool write_hook_for_undo_key_delete(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + struct st_msg_to_write_hook_for_undo_key *msg= + (struct st_msg_to_write_hook_for_undo_key *) hook_arg; + MARIA_SHARE *share= tbl_info->s; + if (msg->auto_increment > 0) /* If auto increment key */ + { + /* Restore auto 
increment if no one has changed it in between */ + if (share->last_auto_increment == tbl_info->last_auto_increment && + tbl_info->last_auto_increment != ~(ulonglong) 0) + share->state.auto_increment= tbl_info->last_auto_increment; + } + return write_hook_for_undo_key(type, trn, tbl_info, lsn, hook_arg); +} + + +/***************************************************************************** + Functions for logging of key page changes +*****************************************************************************/ + +/** + @brief + Write log entry for page that has got data added or deleted at start of page +*/ + +my_bool _ma_log_prefix(MARIA_PAGE *ma_page, uint changed_length, + int move_length, + enum en_key_debug debug_marker __attribute__((unused))) +{ + uint translog_parts; + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 7 + 7 + 2 + 2]; + uchar *log_pos; + uchar *buff= ma_page->buff; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + MARIA_HA *info= ma_page->info; + pgcache_page_no_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_prefix"); + DBUG_PRINT("enter", ("page: %lu changed_length: %u move_length: %d", + (ulong) page, changed_length, move_length)); + + DBUG_ASSERT(ma_page->size == ma_page->org_size + move_length); + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + (*log_pos++)= KEY_OP_DEBUG; + (*log_pos++)= debug_marker; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= _ma_get_keypage_flag(info->s, buff); + + if (move_length < 0) + { + /* Delete prefix */ + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, -move_length); + log_pos+= 3; + if (changed_length) + { + /* + We don't need a KEY_OP_OFFSET as KEY_OP_DEL_PREFIX has an implicit + offset + */ + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, changed_length); + log_pos+= 3; + } + } + else + { + /* Add prefix */ + 
DBUG_ASSERT(changed_length >0 && (int) changed_length >= move_length); + log_pos[0]= KEY_OP_ADD_PREFIX; + int2store(log_pos+1, move_length); + int2store(log_pos+3, changed_length); + log_pos+= 5; + } + + translog_parts= 1; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + if (changed_length) + { + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (buff + + info->s->keypage_header); + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length; + translog_parts= 2; + } + + _ma_log_key_changes(ma_page, log_array + TRANSLOG_INTERNAL_PARTS + + translog_parts, log_pos, &changed_length, + &translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + changed_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief + Write log entry for page that has got data added or deleted at end of page +*/ + +my_bool _ma_log_suffix(MARIA_PAGE *ma_page, uint org_length, uint new_length) +{ + LSN lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 10 + 7 + 2], *log_pos; + uchar *buff= ma_page->buff; + int diff; + uint translog_parts, extra_length; + MARIA_HA *info= ma_page->info; + pgcache_page_no_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_suffix"); + DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u", + (ulong) page, org_length, new_length)); + DBUG_ASSERT(ma_page->size == new_length); + DBUG_ASSERT(ma_page->org_size == org_length); + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= _ma_get_keypage_flag(info->s, buff); + + 
if ((diff= (int) (new_length - org_length)) < 0) + { + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, -diff); + log_pos+= 3; + translog_parts= 1; + extra_length= 0; + } + else + { + log_pos[0]= KEY_OP_ADD_SUFFIX; + int2store(log_pos+1, diff); + log_pos+= 3; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= buff + org_length; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= (uint) diff; + translog_parts= 2; + extra_length= (uint) diff; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief Log that a key was added to the page + + @param ma_page Changed page + @param org_page_length Length of data in page before key was added + Final length in ma_page->size + + @note + If handle_overflow is set, then we have to protect against + logging changes that is outside of the page. + This may happen during underflow() handling where the buffer + in memory temporary contains more data than block_size + + ma_page may be a page that was previously logged and cuted down + becasue it's too big. 
(org_page_length > ma_page->org_size) +*/ + +my_bool _ma_log_add(MARIA_PAGE *ma_page, + uint org_page_length __attribute__ ((unused)), + uchar *key_pos, uint changed_length, int move_length, + my_bool handle_overflow __attribute__ ((unused)), + enum en_key_debug debug_marker __attribute__((unused))) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 3 + 3 + 3 + 3 + 7 + + 3 + 2]; + uchar *log_pos; + uchar *buff= ma_page->buff; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6]; + MARIA_HA *info= ma_page->info; + uint offset= (uint) (key_pos - buff); + uint max_page_size= info->s->max_index_block_size; + uint translog_parts, current_size; + pgcache_page_no_t page_pos= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_add"); + DBUG_PRINT("enter", ("page: %lu org_page_length: %u changed_length: %u " + "move_length: %d", + (ulong) page_pos, org_page_length, changed_length, + move_length)); + DBUG_ASSERT(info->s->now_transactional); + DBUG_ASSERT(move_length <= (int) changed_length); + DBUG_ASSERT(ma_page->org_size == MY_MIN(org_page_length, max_page_size)); + DBUG_ASSERT(ma_page->size == org_page_length + move_length); + DBUG_ASSERT(offset <= ma_page->org_size); + + /* + Write REDO entry that contains the logical operations we need + to do the page + */ + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page_pos); + current_size= ma_page->org_size; + log_pos+= PAGE_STORE_SIZE; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + *log_pos++= KEY_OP_DEBUG; + *log_pos++= debug_marker; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= _ma_get_keypage_flag(info->s, buff); + + /* + Don't overwrite page boundary + It's ok to cut this as we will append the data at end of page + in the next log entry + */ + if (offset + changed_length > max_page_size) + { + DBUG_ASSERT(handle_overflow); + changed_length= max_page_size - offset; /* Update to end of page */ + move_length= 0; /* Nothing to move */ + /* Extend the page 
to max length on recovery */ + *log_pos++= KEY_OP_MAX_PAGELENGTH; + current_size= max_page_size; + } + + /* Check if adding the key made the page overflow */ + if (current_size + move_length > max_page_size) + { + /* + Adding the key caused an overflow. Cut away the part of the + page that doesn't fit. + */ + uint diff; + DBUG_ASSERT(handle_overflow); + diff= current_size + move_length - max_page_size; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, diff); + log_pos+= 3; + current_size= max_page_size - move_length; + } + + if (offset == current_size) + { + log_pos[0]= KEY_OP_ADD_SUFFIX; + current_size+= changed_length; + } + else + { + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + if (move_length) + { + if (move_length < 0) + { + DBUG_ASSERT(offset - move_length <= org_page_length); + if (offset - move_length > current_size) + { + /* + Truncate to end of page. We will add data to it from + the page buffer below + */ + move_length= (int) offset - (int) current_size; + } + } + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + current_size+= move_length; + } + /* + Handle case where page was shortend but 'changed_length' goes over + 'current_size'. 
This can only happen when there was a page overflow + and we will below add back the overflow part + */ + if (offset + changed_length > current_size) + { + DBUG_ASSERT(offset + changed_length <= ma_page->size); + changed_length= current_size - offset; + } + log_pos[0]= KEY_OP_CHANGE; + } + int2store(log_pos+1, changed_length); + log_pos+= 3; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length; + translog_parts= TRANSLOG_INTERNAL_PARTS + 2; + + /* + If page was originally > block_size before operation and now all data + fits, append the end data that was not part of the previous logged + page to it. + */ + DBUG_ASSERT(current_size <= max_page_size && current_size <= ma_page->size); + if (current_size != ma_page->size && current_size != max_page_size) + { + uint length= MY_MIN(ma_page->size, max_page_size) - current_size; + uchar *data= ma_page->buff + current_size; + + log_pos[0]= KEY_OP_ADD_SUFFIX; + int2store(log_pos+1, length); + log_array[translog_parts].str= log_pos; + log_array[translog_parts].length= 3; + log_array[translog_parts+1].str= data; + log_array[translog_parts+1].length= length; + log_pos+= 3; + translog_parts+= 2; + current_size+= length; + changed_length+= length + 3; + } + + _ma_log_key_changes(ma_page, log_array + translog_parts, + log_pos, &changed_length, &translog_parts); + /* + Remember new page length for future log entries for same page + Note that this can be different from ma_page->size in case of page + overflow! 
#ifdef EXTRA_DEBUG_KEY_CHANGES

/**
  @brief Log checksum and optionally key page to log

  @param ma_page         Page to checksum
  @param log_array       First free log part; one part is used for the
                         KEY_OP_CHECK entry and, if
                         EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES is defined, a
                         second one for the page itself
  @param log_pos         Where to store the 7 byte KEY_OP_CHECK entry
  @param changed_length  Incremented with the number of bytes added
  @param translog_parts  Incremented with the number of parts added

  The checksum is calculated over the page, skipping the first
  LSN_STORE_SIZE bytes, with the "page used" field temporarily set to
  MIN(ma_page->size, max_index_block_size). The original value of the field
  is put back afterwards.
*/

void _ma_log_key_changes(MARIA_PAGE *ma_page, LEX_CUSTRING *log_array,
                         uchar *log_pos, uint *changed_length,
                         uint *translog_parts)
{
  const uint check_entry_length= 7;             /* op + length[2] + crc[4] */
  MARIA_SHARE *share= ma_page->info->s;
  uchar *page_buff= ma_page->buff;
  int page_length= MY_MIN(ma_page->size, share->max_index_block_size);
  uint saved_used;
  ha_checksum crc;

  DBUG_ASSERT(ma_page->flag == (uint) _ma_get_keypage_flag(share, ma_page->buff));

  /* We have to change length as the page may have been shortened */
  saved_used= _ma_get_page_used(share, page_buff);
  _ma_store_page_used(share, page_buff, page_length);
  crc= my_checksum(0, page_buff + LSN_STORE_SIZE,
                   page_length - LSN_STORE_SIZE);
  _ma_store_page_used(share, page_buff, saved_used);

  log_pos[0]= KEY_OP_CHECK;
  int2store(log_pos+1, page_length);
  int4store(log_pos+3, crc);

  log_array[0].str=    log_pos;
  log_array[0].length= check_entry_length;
  *changed_length+= check_entry_length;
  *translog_parts+= 1;
#ifdef EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES
  log_array[1].str=    page_buff;
  log_array[1].length= page_length;
  *changed_length+= page_length;
  *translog_parts+= 1;
#endif /* EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES */
}

#endif /* EXTRA_DEBUG_KEY_CHANGES */
_ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn, + const uchar *header, uint length) +{ + pgcache_page_no_t root_page= page_korr(header); + pgcache_page_no_t free_page= page_korr(header + PAGE_STORE_SIZE); + uint key_nr= key_nr_korr(header + PAGE_STORE_SIZE * 2); + my_bool page_type_flag= header[PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE]; + enum pagecache_page_lock unlock_method; + enum pagecache_page_pin unpin_method; + MARIA_PINNED_PAGE page_link; + my_off_t file_size; + uchar *buff; + uint result; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_apply_redo_index_new_page"); + DBUG_PRINT("enter", ("root_page: %lu free_page: %lu", + (ulong) root_page, (ulong) free_page)); + + /* Set header to point at key data */ + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + header+= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1; + length-= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1; + + file_size= (my_off_t) (root_page + 1) * share->block_size; + if (cmp_translog_addr(lsn, share->state.is_of_horizon) >= 0) + { + /* free_page is 0 if we shouldn't set key_del */ + if (free_page) + { + if (free_page != IMPOSSIBLE_PAGE_NO) + share->state.key_del= (my_off_t) free_page * share->block_size; + else + share->state.key_del= HA_OFFSET_ERROR; + } + if (page_type_flag) /* root page */ + share->state.key_root[key_nr]= file_size - share->block_size; + } + + if (file_size > share->state.state.key_file_length) + { + share->state.state.key_file_length= file_size; + buff= info->keyread_buff; + info->keyread_buff_used= 1; + unlock_method= PAGECACHE_LOCK_WRITE; + unpin_method= PAGECACHE_PIN; + } + else + { + if (!(buff= pagecache_read(share->pagecache, &share->kfile, + root_page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + if (my_errno != HA_ERR_FILE_TOO_SHORT && + my_errno != HA_ERR_WRONG_CRC && + my_errno != HA_ERR_DECRYPTION_FAILED) + { + result= 1; + 
_ma_set_fatal_error(info, my_errno); + goto err; + } + buff= pagecache_block_link_to_buffer(page_link.link); + } + else if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + check_skipped_lsn(info, lsn_korr(buff), 0, root_page); + result= 0; + goto err; + } + unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED; + unpin_method= PAGECACHE_PIN_LEFT_PINNED; + } + + /* Write modified page */ + bzero(buff, LSN_STORE_SIZE); + memcpy(buff + LSN_STORE_SIZE, header, length); + bzero(buff + LSN_STORE_SIZE + length, + share->max_index_block_size - LSN_STORE_SIZE - length); + bfill(buff + share->block_size - KEYPAGE_CHECKSUM_SIZE, + KEYPAGE_CHECKSUM_SIZE, (uchar) 255); + + result= 0; + if (unlock_method == PAGECACHE_LOCK_WRITE && + pagecache_write(share->pagecache, + &share->kfile, root_page, 0, + buff, PAGECACHE_PLAIN_PAGE, + unlock_method, unpin_method, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE)) + result= 1; + + /* Mark page to be unlocked and written at _ma_unpin_all_pages() */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + DBUG_RETURN(result); + +err: + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + DBUG_RETURN(result); +} + + +/** + @brief Apply LOGREC_REDO_INDEX_FREE_PAGE + + @param info Maria handler + @param header Header (without FILEID) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_apply_redo_index_free_page(MARIA_HA *info, + LSN lsn, + const uchar *header) +{ + pgcache_page_no_t page= page_korr(header); + pgcache_page_no_t free_page= page_korr(header + PAGE_STORE_SIZE); + my_off_t old_link; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + uchar *buff; + int result; + DBUG_ENTER("_ma_apply_redo_index_free_page"); + DBUG_PRINT("enter", ("page: %lu free_page: %lu", + (ulong) page, (ulong) free_page)); + + 
share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + + if (cmp_translog_addr(lsn, share->state.is_of_horizon) >= 0) + share->state.key_del= (my_off_t) page * share->block_size; + + old_link= ((free_page != IMPOSSIBLE_PAGE_NO) ? + (my_off_t) free_page * share->block_size : + HA_OFFSET_ERROR); + if (!(buff= pagecache_read(share->pagecache, &share->kfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= (uint) my_errno; + _ma_set_fatal_error(info, my_errno); + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + check_skipped_lsn(info, lsn_korr(buff), 0, page); + result= 0; + goto err; + } + /* Free page */ + bzero(buff + LSN_STORE_SIZE, share->keypage_header - LSN_STORE_SIZE); + _ma_store_keynr(share, buff, (uchar) MARIA_DELETE_KEY_NR); + _ma_store_page_used(share, buff, share->keypage_header + 8); + mi_sizestore(buff + share->keypage_header, old_link); + +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY + { + bzero(buff + share->keypage_header + 8, + share->block_size - share->keypage_header - 8 - + KEYPAGE_CHECKSUM_SIZE); + } +#endif + + /* Mark page to be unlocked and written at _ma_unpin_all_pages() */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + DBUG_RETURN(0); + +err: + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + DBUG_RETURN(result); +} + + +/** + @brief Apply LOGREC_REDO_INDEX + + @fn ma_apply_redo_index() + @param info Maria handler + @param header Header (without FILEID) + + @notes + Data for this part is a set of logical instructions of how to + construct the key page. 
+ + Information of the layout of the components for REDO_INDEX: + + Name Parameters (in byte) Information + KEY_OP_OFFSET 2 Set position for next operations + KEY_OP_SHIFT 2 (signed int) How much to shift down or up + KEY_OP_CHANGE 2 length, data Data to replace at 'pos' + KEY_OP_ADD_PREFIX 2 move-length How much data should be moved up + 2 change-length Data to be replaced at page start + KEY_OP_DEL_PREFIX 2 length Bytes to be deleted at page start + KEY_OP_ADD_SUFFIX 2 length, data Add data to end of page + KEY_OP_DEL_SUFFIX 2 length Reduce page length with this + Sets position to start of page + KEY_OP_CHECK 6 page_length[2],CRC Used only when debugging + This may be followed by page_length + of data (until end of log record) + KEY_OP_COMPACT_PAGE 6 transid + KEY_OP_SET_PAGEFLAG 1 flag for page + KEY_OP_MAX_PAGELENGTH 0 Set page to max length + KEY_OP_DEBUG 1 Info where logging was done + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_apply_redo_index(MARIA_HA *info, + LSN lsn, const uchar *header, uint head_length) +{ + MARIA_SHARE *share= info->s; + pgcache_page_no_t page_pos= page_korr(header); + MARIA_PINNED_PAGE page_link; + uchar *buff; + const uchar *header_end= header + head_length; + uint page_offset= 0, org_page_length; + uint page_length, keypage_header, keynr; + uint max_page_size= share->max_index_block_size; +#ifdef DBUG_ASSERT_EXISTS + uint new_page_length= 0; +#endif + int result, mark_crashed; + MARIA_PAGE page; + DBUG_ENTER("_ma_apply_redo_index"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page_pos)); + + /* Set header to point at key data */ + header+= PAGE_STORE_SIZE; + + if (!(buff= pagecache_read(share->pagecache, &share->kfile, + page_pos, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= 1; mark_crashed= 0; + _ma_set_fatal_error(info, my_errno); + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + check_skipped_lsn(info, lsn_korr(buff), 0, page_pos); 
+ result= mark_crashed= 0; + goto err; + } + + keynr= _ma_get_keynr(share, buff); + _ma_page_setup(&page, info, share->keyinfo + keynr, page_pos, buff); + org_page_length= page_length= page.size; + + keypage_header= share->keypage_header; + DBUG_PRINT("redo", ("page_length: %u", page_length)); + + /* Apply modifications to page */ + do + { + switch ((enum en_key_op) (*header++)) { + case KEY_OP_OFFSET: /* 1 */ + page_offset= uint2korr(header); + header+= 2; + DBUG_PRINT("redo", ("key_op_offset: %u", page_offset)); + DBUG_ASSERT(page_offset >= keypage_header && page_offset <= page_length); + break; + case KEY_OP_SHIFT: /* 2 */ + { + int length= sint2korr(header); + header+= 2; + DBUG_PRINT("redo", ("key_op_shift: %d", length)); + DBUG_ASSERT(page_offset != 0 && page_offset <= page_length && + page_length + length <= max_page_size); + + if (length < 0) + { + DBUG_ASSERT(page_offset - length <= page_length); + bmove(buff + page_offset, buff + page_offset - length, + page_length - page_offset + length); + } + else if (page_length != page_offset) + bmove_upp(buff + page_length + length, buff + page_length, + page_length - page_offset); + page_length+= length; + break; + } + case KEY_OP_CHANGE: /* 3 */ + { + uint length= uint2korr(header); + DBUG_PRINT("redo", ("key_op_change: %u", length)); + DBUG_ASSERT(page_offset != 0 && page_offset + length <= page_length); + + memcpy(buff + page_offset, header + 2 , length); + page_offset+= length; /* Put offset after changed length */ + header+= 2 + length; + break; + } + case KEY_OP_ADD_PREFIX: /* 4 */ + { + uint insert_length= uint2korr(header); + uint changed_length= uint2korr(header+2); + DBUG_PRINT("redo", ("key_op_add_prefix: %u %u", + insert_length, changed_length)); + + DBUG_ASSERT(insert_length <= changed_length && + page_length + insert_length <= max_page_size); + + bmove_upp(buff + page_length + insert_length, buff + page_length, + page_length - keypage_header); + memcpy(buff + keypage_header, header + 4 , 
changed_length); + header+= 4 + changed_length; + page_length+= insert_length; + break; + } + case KEY_OP_DEL_PREFIX: /* 5 */ + { + uint length= uint2korr(header); + header+= 2; + DBUG_PRINT("redo", ("key_op_del_prefix: %u", length)); + DBUG_ASSERT(length <= page_length - keypage_header); + + bmove(buff + keypage_header, buff + keypage_header + + length, page_length - keypage_header - length); + page_length-= length; + + page_offset= keypage_header; /* Prepare for change */ + break; + } + case KEY_OP_ADD_SUFFIX: /* 6 */ + { + uint insert_length= uint2korr(header); + DBUG_PRINT("redo", ("key_op_add_suffix: %u", insert_length)); + DBUG_ASSERT(page_length + insert_length <= max_page_size); + memcpy(buff + page_length, header+2, insert_length); + + page_length+= insert_length; + header+= 2 + insert_length; + break; + } + case KEY_OP_DEL_SUFFIX: /* 7 */ + { + uint del_length= uint2korr(header); + header+= 2; + DBUG_PRINT("redo", ("key_op_del_suffix: %u", del_length)); + DBUG_ASSERT(page_length - del_length >= keypage_header); + page_length-= del_length; + break; + } + case KEY_OP_CHECK: /* 8 */ + { +#ifdef EXTRA_DEBUG_KEY_CHANGES + uint check_page_length; + ha_checksum crc; + check_page_length= uint2korr(header); + crc= uint4korr(header+2); + _ma_store_page_used(share, buff, page_length); + if (check_page_length != page_length || + crc != (uint32) my_checksum(0, buff + LSN_STORE_SIZE, + page_length - LSN_STORE_SIZE)) + { + DBUG_DUMP("KEY_OP_CHECK bad page", buff, page_length); + if (header + 6 + check_page_length <= header_end) + { + DBUG_DUMP("KEY_OP_CHECK org page", header + 6, check_page_length); + } + DBUG_ASSERT("crc failure in REDO_INDEX" == 0); + } +#endif + DBUG_PRINT("redo", ("key_op_check")); + /* + This is the last entry in the block and it can contain page_length + data or not + */ + DBUG_ASSERT(header + 6 == header_end || + header + 6 + page_length == header_end); + header= header_end; + break; + } + case KEY_OP_DEBUG: + DBUG_PRINT("redo", ("Debug: %u", 
(uint) header[0])); + header++; + break; + case KEY_OP_DEBUG_2: + DBUG_PRINT("redo", ("org_page_length: %u new_page_length: %u", + uint2korr(header), uint2korr(header+2))); + DBUG_ASSERT(uint2korr(header) == page_length); +#ifdef DBUG_ASSERT_EXISTS + new_page_length= MY_MIN(uint2korr(header+2), max_page_size); +#endif + header+= 4; + break; + case KEY_OP_MAX_PAGELENGTH: + DBUG_PRINT("redo", ("key_op_max_page_length")); + page_length= max_page_size; + break; + case KEY_OP_MULTI_COPY: /* 9 */ + { + /* + List of fixed-len memcpy() operations with their source located inside + the page. The log record's piece looks like: + first the length 'full_length' to be used by memcpy() + then the number of bytes used by the list of (to,from) pairs + then the (to,from) pairs, so we do: + for (t,f) in [list of (to,from) pairs]: + memcpy(t, f, full_length). + */ + uint full_length, log_memcpy_length; + const uchar *log_memcpy_end; + + DBUG_PRINT("redo", ("key_op_multi_copy")); + full_length= uint2korr(header); + header+= 2; + log_memcpy_length= uint2korr(header); + header+= 2; + log_memcpy_end= header + log_memcpy_length; + DBUG_ASSERT(full_length <= max_page_size); + while (header < log_memcpy_end) + { + uint to, from; + to= uint2korr(header); + header+= 2; + from= uint2korr(header); + header+= 2; + /* "from" is a place in the existing page */ + DBUG_ASSERT(MY_MAX(from, to) < max_page_size); + memcpy(buff + to, buff + from, full_length); + } + break; + } + case KEY_OP_SET_PAGEFLAG: + DBUG_PRINT("redo", ("key_op_set_pageflag")); + _ma_store_keypage_flag(share, buff, *header++); + break; + case KEY_OP_COMPACT_PAGE: + { + TrID transid= transid_korr(header); + + DBUG_PRINT("redo", ("key_op_compact_page")); + header+= TRANSID_SIZE; + if (_ma_compact_keypage(&page, transid)) + { + result= mark_crashed= 1; + goto err; + } + page_length= page.size; + break; + } + case KEY_OP_NONE: + default: + DBUG_ASSERT(0); + result= mark_crashed= 1; + goto err; + } + } while (header < header_end); + 
DBUG_ASSERT(header == header_end); + DBUG_ASSERT(new_page_length == 0 || new_page_length == page_length); + + /* Write modified page */ + page.size= page_length; + _ma_store_page_used(share, buff, page_length); + + /* + Clean old stuff up. Gives us better compression of we archive things + and makes things easer to debug + */ + if (page_length < org_page_length) + bzero(buff + page_length, org_page_length-page_length); + + /* Mark page to be unlocked and written at _ma_unpin_all_pages() */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + DBUG_RETURN(0); + +err: + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0, FALSE); + if (mark_crashed) + _ma_mark_file_crashed(share); + DBUG_RETURN(result); +} + + +/**************************************************************************** + Undo of key block changes +****************************************************************************/ + +/** + @brief Undo of insert of key (ie, delete the inserted key) +*/ + +my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length) +{ + LSN lsn; + my_bool res; + uint keynr; + uchar key_buff[MARIA_MAX_KEY_BUFF]; + MARIA_SHARE *share= info->s; + MARIA_KEY key; + my_off_t new_root; + struct st_msg_to_write_hook_for_undo_key msg; + DBUG_ENTER("_ma_apply_undo_key_insert"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + keynr= key_nr_korr(header); + length-= KEY_NR_STORE_SIZE; + + /* We have to copy key as _ma_ck_real_delete() may change it */ + memcpy(key_buff, header + KEY_NR_STORE_SIZE, length); + DBUG_DUMP("key_buff", key_buff, length); + + new_root= share->state.key_root[keynr]; + /* + Change the key to an internal structure. 
+ It's safe to have SEARCH_USER_KEY_HAS_TRANSID even if there isn't + a transaction id, as ha_key_cmp() will stop comparison when key length + is reached. + For index with transid flag, the ref_length of the key is not correct. + This should however be safe as long as this key is only used for + comparison against other keys (not for packing or for read-next etc as + in this case we use data_length + ref_length, which is correct). + */ + key.keyinfo= share->keyinfo + keynr; + key.data= key_buff; + key.data_length= length - share->rec_reflength; + key.ref_length= share->rec_reflength; + key.flag= SEARCH_USER_KEY_HAS_TRANSID; + + res= ((share->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE) ? + maria_rtree_real_delete(info, &key, &new_root) : + _ma_ck_real_delete(info, &key, &new_root)); + if (res) + _ma_mark_file_crashed(share); + msg.root= &share->state.key_root[keynr]; + msg.value= new_root; + msg.keynr= keynr; + + if (_ma_write_clr(info, undo_lsn, *msg.root == msg.value ? + LOGREC_UNDO_KEY_INSERT : LOGREC_UNDO_KEY_INSERT_WITH_ROOT, + 0, 0, &lsn, (void*) &msg)) + res= 1; + + _ma_fast_unlock_key_del(info); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); +} + + +/** + @brief Undo of delete of key (ie, insert the deleted key) + + @param with_root If the UNDO is UNDO_KEY_DELETE_WITH_ROOT +*/ + +my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length, + my_bool with_root) +{ + LSN lsn; + my_bool res; + uint keynr, skip_bytes; + uchar key_buff[MARIA_MAX_KEY_BUFF]; + MARIA_SHARE *share= info->s; + my_off_t new_root; + struct st_msg_to_write_hook_for_undo_key msg; + MARIA_KEY key; + DBUG_ENTER("_ma_apply_undo_key_delete"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + keynr= key_nr_korr(header); + skip_bytes= KEY_NR_STORE_SIZE + (with_root ? 
PAGE_STORE_SIZE : 0); + header+= skip_bytes; + length-= skip_bytes; + + /* We have to copy key as _ma_ck_real_write_btree() may change it */ + memcpy(key_buff, header, length); + DBUG_DUMP("key", key_buff, length); + + key.keyinfo= share->keyinfo + keynr; + key.data= key_buff; + key.data_length= length - share->rec_reflength; + key.ref_length= share->rec_reflength; + key.flag= SEARCH_USER_KEY_HAS_TRANSID; + + new_root= share->state.key_root[keynr]; + res= (share->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE) ? + maria_rtree_insert_level(info, &key, -1, &new_root) : + _ma_ck_real_write_btree(info, &key, &new_root, + share->keyinfo[keynr].write_comp_flag | + key.flag); + if (res) + _ma_mark_file_crashed(share); + + msg.root= &share->state.key_root[keynr]; + msg.value= new_root; + msg.keynr= keynr; + if (_ma_write_clr(info, undo_lsn, + *msg.root == msg.value ? + LOGREC_UNDO_KEY_DELETE : LOGREC_UNDO_KEY_DELETE_WITH_ROOT, + 0, 0, &lsn, + (void*) &msg)) + res= 1; + + _ma_fast_unlock_key_del(info); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); +} + + +/**************************************************************************** + Handle some local variables +****************************************************************************/ + +/** + @brief lock key_del for other threads usage + + @fn _ma_lock_key_del() + @param info Maria handler + @param insert_at_end Set to 1 if we are doing an insert + + @note + To allow higher concurrency in the common case where we do inserts + and we don't have any linked blocks we do the following: + - Mark in info->key_del_used that we are not using key_del + - Return at once (without marking key_del as used) + + This is safe as we in this case don't write key_del_current into + the redo log and during recover we are not updating key_del. 
+ + @retval 1 Use page at end of file + @retval 0 Use page at share->key_del_current +*/ + +my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end) +{ + MARIA_SHARE *share= info->s; + + /* + info->key_del_used is 0 initially. + If the caller needs a block (_ma_new()), we look at the free list: + - looks empty? then caller will create a new block at end of file and + remember (through info->key_del_used==2) that it will not change + state.key_del and does not need to wake up waiters as nobody will wait for + it. + - non-empty? then we wait for other users of the state.key_del list to + have finished, then we lock this list (through share->key_del_used==1) + because we need to prevent some other thread to also read state.key_del + and use the same page as ours. We remember through info->key_del_used==1 + that we will have to set state.key_del at unlock time and wake up + waiters. + If the caller wants to free a block (_ma_dispose()), "empty" and + "non-empty" are treated as "non-empty" is treated above. + When we are ready to unlock, we copy share->key_del_current into + state.key_del. Unlocking happens when writing the UNDO log record, that + can make a long lock time. + Why we wrote "*looks* empty": because we are looking at state.key_del + which may be slightly old (share->key_del_current may be more recent and + exact): when we want a new page, we tolerate to treat "there was no free + page 1 millisecond ago" as "there is no free page". It's ok to non-pop + (_ma_new(), page will be found later anyway) but it's not ok to non-push + (_ma_dispose(), page would be lost). + When we leave this function, info->key_del_used is always 1 or 2. 
+ */ + if (info->key_del_used != 1) + { + mysql_mutex_lock(&share->key_del_lock); + if (share->state.key_del == HA_OFFSET_ERROR && insert_at_end) + { + mysql_mutex_unlock(&share->key_del_lock); + info->key_del_used= 2; /* insert-with-append */ + return 1; + } + while (share->key_del_used) + mysql_cond_wait(&share->key_del_cond, &share->key_del_lock); + info->key_del_used= 1; + share->key_del_used= 1; + share->key_del_current= share->state.key_del; + mysql_mutex_unlock(&share->key_del_lock); + } + return share->key_del_current == HA_OFFSET_ERROR; +} + + +/** + @brief copy changes to key_del and unlock it + + @notes + In case of many threads using the maria table, we always have a lock + on the translog when comming here. +*/ + +void _ma_unlock_key_del(MARIA_HA *info) +{ + DBUG_ASSERT(info->key_del_used); + if (info->key_del_used == 1) /* Ignore insert-with-append */ + { + MARIA_SHARE *share= info->s; + mysql_mutex_lock(&share->key_del_lock); + share->key_del_used= 0; + share->state.key_del= share->key_del_current; + mysql_mutex_unlock(&share->key_del_lock); + mysql_cond_signal(&share->key_del_cond); + } + info->key_del_used= 0; +} diff --git a/storage/maria/ma_key_recover.h b/storage/maria/ma_key_recover.h new file mode 100644 index 00000000..b5b50279 --- /dev/null +++ b/storage/maria/ma_key_recover.h @@ -0,0 +1,122 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + When we have finished the write/update/delete of a row, we have cleanups to + do. For now it is signalling to Checkpoint that all dirtied pages have + their rec_lsn set and page LSN set (_ma_unpin_all_pages() has been called), + and that bitmap pages are correct (_ma_bitmap_release_unused() has been + called). +*/ + +/* Struct for clr_end */ + +struct st_msg_to_write_hook_for_clr_end +{ + LSN previous_undo_lsn; + enum translog_record_type undone_record_type; + ha_checksum checksum_delta; + void *extra_msg; +}; + +struct st_msg_to_write_hook_for_undo_key +{ + my_off_t *root; + my_off_t value; + uint keynr; + ulonglong auto_increment; +}; + + +/* Function definitions for some redo functions */ + +my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn, + enum translog_record_type undo_type, + my_bool store_checksum, ha_checksum checksum, + LSN *res_lsn, void *extra_msg); +int _ma_write_undo_key_insert(MARIA_HA *info, const MARIA_KEY *key, + my_off_t *root, my_off_t new_root, + LSN *res_lsn); +my_bool _ma_write_undo_key_delete(MARIA_HA *info, const MARIA_KEY *key, + my_off_t new_root, LSN *res_lsn); +my_bool write_hook_for_clr_end(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, LSN *lsn, + void *hook_arg); +extern my_bool write_hook_for_undo_key(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +extern my_bool write_hook_for_undo_key_insert(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); +extern my_bool write_hook_for_undo_key_delete(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); + +my_bool _ma_log_prefix(MARIA_PAGE *page, uint changed_length, int move_length, + enum en_key_debug debug_marker); +my_bool 
_ma_log_suffix(MARIA_PAGE *page, uint org_length, + uint new_length); +my_bool _ma_log_add(MARIA_PAGE *page, uint buff_length, uchar *key_pos, + uint changed_length, int move_length, + my_bool handle_overflow, + enum en_key_debug debug_marker); +my_bool _ma_log_delete(MARIA_PAGE *page, const uchar *key_pos, + uint changed_length, uint move_length, + uint append_length, enum en_key_debug debug_marker); +my_bool _ma_log_change(MARIA_PAGE *page, const uchar *key_pos, uint length, + enum en_key_debug debug_marker); +my_bool _ma_log_new(MARIA_PAGE *page, my_bool root_page); +#ifdef EXTRA_DEBUG_KEY_CHANGES +void _ma_log_key_changes(MARIA_PAGE *ma_page, LEX_CUSTRING *log_array, + uchar *log_pos, uint *changed_length, + uint *translog_parts); +#else +#define _ma_log_key_changes(A,B,C,D,E) +#endif + +uint _ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn, + const uchar *header, uint length); +uint _ma_apply_redo_index_free_page(MARIA_HA *info, LSN lsn, + const uchar *header); +uint _ma_apply_redo_index(MARIA_HA *info, + LSN lsn, const uchar *header, uint length); + +my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length); +my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length, + my_bool with_root); + +static inline void _ma_finalize_row(MARIA_HA *info) +{ + info->trn->rec_lsn= LSN_IMPOSSIBLE; +} + +/* unpinning is often the last operation before finalizing */ + +static inline void _ma_unpin_all_pages_and_finalize_row(MARIA_HA *info, + LSN undo_lsn) +{ + _ma_unpin_all_pages(info, undo_lsn); + _ma_finalize_row(info); +} + +extern my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end); +extern void _ma_unlock_key_del(MARIA_HA *info); +static inline void _ma_fast_unlock_key_del(MARIA_HA *info) +{ + if (info->key_del_used) + _ma_unlock_key_del(info); +} diff --git a/storage/maria/ma_keycache.c b/storage/maria/ma_keycache.c new file mode 100644 index 00000000..2ff8d019 --- 
/dev/null +++ b/storage/maria/ma_keycache.c @@ -0,0 +1,164 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Key cache assignments +*/ + +#include "maria_def.h" + +/* + Assign pages of the index file for a table to a key cache + + SYNOPSIS + maria_assign_to_pagecache() + info open table + key_map map of indexes to assign to the key cache + pagecache_ptr pointer to the key cache handle + assign_lock Mutex to lock during assignment + + PREREQUISITES + One must have a READ lock or a WRITE lock on the table when calling + the function to ensure that there are no other writers to it. + + The caller must also ensure that one doesn't call this function from + two different threads with the same table. + + NOTES + At present pages for all indexes must be assigned to the same key cache. + In future only pages for indexes specified in the key_map parameter + of the table will be assigned to the specified key cache. + + RETURN VALUE + 0 If a success + # Error code +*/ + +int maria_assign_to_pagecache(MARIA_HA *info, + ulonglong key_map __attribute__((unused)), + PAGECACHE *pagecache) +{ + int error= 0; + MARIA_SHARE* share= info->s; + DBUG_ENTER("maria_assign_to_pagecache"); + DBUG_PRINT("enter", + ("old_pagecache_handle:%p new_pagecache_handle:%p", + share->pagecache, pagecache)); + + /* + Skip operation if we didn't change key cache. 
This can happen if we + call this for all open instances of the same table + */ + if (share->pagecache == pagecache) + DBUG_RETURN(0); + + /* + First flush all blocks for the table in the old key cache. + This is to ensure that the disk is consistent with the data pages + in memory (which may not be the case if the table uses delayed_key_write) + + Note that some other read thread may still fill in the key cache with + new blocks during this call and after, but this doesn't matter as + all threads will start using the new key cache for their next call to + the maria library and we know that there will not be any changed blocks + in the old key cache. + */ + + if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE)) + { + error= my_errno; + /* Mark that table must be checked */ + _ma_set_fatal_error(info, error); + } + + /* + Flush the new key cache for this file. This is needed to ensure + that there are no old blocks (with outdated data) left in the new key + cache from an earlier assign_to_keycache operation + + (This can never fail as there is never any unwritten data in the + new key cache) + */ + (void) flush_pagecache_blocks(pagecache, &share->kfile, FLUSH_RELEASE); + + /* + Ensure that setting the key cache and changing the multi_pagecache + is done atomically + */ + mysql_mutex_lock(&share->intern_lock); + /* + Tell all threads to use the new key cache + This should be seen at the latest by the next call to a maria function. 
+ */ + share->pagecache= pagecache; + + /* store the key cache in the global hash structure for future opens */ + if (multi_pagecache_set((uchar*) share->unique_file_name.str, + (uint)share->unique_file_name.length, + share->pagecache)) + error= my_errno; + mysql_mutex_unlock(&share->intern_lock); + DBUG_RETURN(error); +} + + +/* + Change all MARIA entries that use one key cache to another key cache + + SYNOPSIS + maria_change_pagecache() + old_pagecache Old key cache + new_pagecache New key cache + + NOTES + This is used when we delete one key cache. + + To handle the case where some other thread tries to open a MARIA + table associated with the to-be-deleted key cache while this operation + is running, we have to call 'multi_pagecache_change()' from this + function while we have a lock on the MARIA table list structure. + + This is safe as long as it's only MARIA that is using this specific + key cache. +*/ + + +void maria_change_pagecache(PAGECACHE *old_pagecache, + PAGECACHE *new_pagecache) +{ + LIST *pos; + DBUG_ENTER("maria_change_pagecache"); + + /* + Lock list to ensure that no one can close the table while we manipulate it + */ + mysql_mutex_lock(&THR_LOCK_maria); + for (pos=maria_open_list ; pos ; pos=pos->next) + { + MARIA_HA *info= (MARIA_HA*) pos->data; + MARIA_SHARE *share= info->s; + if (share->pagecache == old_pagecache) + maria_assign_to_pagecache(info, (ulonglong) ~0, new_pagecache); + } + + /* + We have to do the following call while we have the lock on the + MARIA list structure to ensure that another thread is not trying to + open a new table that will be associated with the old key cache + */ + multi_pagecache_change(old_pagecache, new_pagecache); + mysql_mutex_unlock(&THR_LOCK_maria); + DBUG_VOID_RETURN; +} diff --git a/storage/maria/ma_locking.c b/storage/maria/ma_locking.c new file mode 100644 index 00000000..9084be1d --- /dev/null +++ b/storage/maria/ma_locking.c @@ -0,0 +1,607 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX 
DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Locking of Maria-tables. + Must be first request before doing any further calls to any Maria function. + Is used to allow many processes to use the same non transactional Maria table +*/ + +#include "ma_ftdefs.h" + + /* lock table by F_UNLCK, F_RDLCK or F_WRLCK */ + +int maria_lock_database(MARIA_HA *info, int lock_type) +{ + int error; + uint count; + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_lock_database"); + DBUG_PRINT("enter",("lock_type: %d old lock %d r_locks: %u w_locks: %u " + "global_changed: %d open_count: %u name: '%s'", + lock_type, info->lock_type, share->r_locks, + share->w_locks, + share->global_changed, share->state.open_count, + share->index_file_name.str)); + if (share->options & HA_OPTION_READ_ONLY_DATA || + info->lock_type == lock_type) + DBUG_RETURN(0); + if (lock_type == F_EXTRA_LCK) /* Used by TMP tables */ + { + ++share->w_locks; + ++share->tot_locks; + info->lock_type= lock_type; + DBUG_RETURN(0); + } + + error=0; + if (!info->intern_lock_locked) + mysql_mutex_lock(&share->intern_lock); + if (share->kfile.file >= 0) /* May only be false on windows */ + { + switch (lock_type) { + case F_UNLCK: + maria_ftparser_call_deinitializer(info); + if (info->lock_type == F_RDLCK) + { + count= --share->r_locks; + if (share->lock_restore_status) + (*share->lock_restore_status)(info); 
+ } + else + { + count= --share->w_locks; + if (share->lock.update_status) + _ma_update_status_with_lock(info); + } + --share->tot_locks; + if (info->lock_type == F_WRLCK && !share->w_locks) + { + /* pages of transactional tables get flushed at Checkpoint */ + if (!share->base.born_transactional && !share->temporary && + _ma_flush_table_files(info, + share->delay_key_write ? MARIA_FLUSH_DATA : + MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_KEEP, FLUSH_KEEP)) + error= my_errno; + } + if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED)) + { + if (end_io_cache(&info->rec_cache)) + { + error= my_errno; + _ma_set_fatal_error(info, error); + } + } + if (!count) + { + DBUG_PRINT("info",("changed: %u w_locks: %u", + (uint) share->changed, share->w_locks)); + if (share->changed && !share->w_locks) + { +#ifdef HAVE_MMAP + if ((share->mmaped_length != + share->state.state.data_file_length) && + (share->nonmmaped_inserts > MAX_NONMAPPED_INSERTS)) + { + if (share->lock_key_trees) + mysql_rwlock_wrlock(&share->mmap_lock); + _ma_remap_file(info, share->state.state.data_file_length); + share->nonmmaped_inserts= 0; + if (share->lock_key_trees) + mysql_rwlock_unlock(&share->mmap_lock); + } +#endif +#ifdef MARIA_EXTERNAL_LOCKING + share->state.process= share->last_process=share->this_process; + share->state.unique= info->last_unique= info->this_unique; + share->state.update_count= info->last_loop= ++info->this_loop; +#endif + /* transactional tables rather flush their state at Checkpoint */ + if (!share->base.born_transactional) + { + if (_ma_state_info_write_sub(share->kfile.file, &share->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)) + error= my_errno; + else + { + /* A value of 0 means below means "state flushed" */ + share->changed= 0; + } + } + if (maria_flush) + { + if (_ma_sync_table_files(info)) + error= my_errno; + } + else + share->not_flushed=1; + if (error) + _ma_set_fatal_error(info, error); + } + } + info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); + 
info->lock_type= F_UNLCK; + break; + case F_RDLCK: + if (info->lock_type == F_WRLCK) + { + /* + Change RW to READONLY + + mysqld does not turn write locks to read locks, + so we're never here in mysqld. + */ + share->w_locks--; + share->r_locks++; + info->lock_type=lock_type; + break; + } +#ifdef MARIA_EXTERNAL_LOCKING + if (!share->r_locks && !share->w_locks) + { + /* note that a transactional table should not do this */ + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + { + error=my_errno; + break; + } + } +#endif + _ma_test_if_changed(info); + share->r_locks++; + share->tot_locks++; + info->lock_type=lock_type; + break; + case F_WRLCK: + if (info->lock_type == F_RDLCK) + { /* Change READONLY to RW */ + if (share->r_locks == 1) + { + share->r_locks--; + share->w_locks++; + info->lock_type=lock_type; + break; + } + } +#ifdef MARIA_EXTERNAL_LOCKING + if (!(share->options & HA_OPTION_READ_ONLY_DATA)) + { + if (!share->w_locks) + { + if (!share->r_locks) + { + /* + Note that transactional tables should not do this. + If we enabled this code, we should make sure to skip it if + born_transactional is true. We should not test + now_transactional to decide if we can call + _ma_state_info_read_dsk(), because it can temporarily be 0 + (TRUNCATE on a partitioned table) and thus it would make a state + modification below without mutex, confusing a concurrent + checkpoint running. + Even if this code was enabled only for non-transactional tables: + in scenario LOCK TABLE t1 WRITE; INSERT INTO t1; DELETE FROM t1; + state on disk read by DELETE is obsolete as it was not flushed + at the end of INSERT. MyISAM same. It however causes no issue as + maria_delete_all_rows() calls _ma_reset_status() thus is not + influenced by the obsolete read values. 
+ */ + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + { + error=my_errno; + break; + } + } + } + } +#endif /* defined(MARIA_EXTERNAL_LOCKING) */ + _ma_test_if_changed(info); + + info->lock_type=lock_type; + info->invalidator=share->invalidator; + share->w_locks++; + share->tot_locks++; + break; + default: + DBUG_ASSERT(0); + break; /* Impossible */ + } + } +#ifdef _WIN32 + else + { + /* + Check for bad file descriptors if this table is part + of a merge union. Failing to capture this may cause + a crash on windows if the table is renamed and + later on referenced by the merge table. + */ + if( info->owned_by_merge && (info->s)->kfile.file < 0 ) + { + error = HA_ERR_NO_SUCH_TABLE; + } + } +#endif + if (!info->intern_lock_locked) + mysql_mutex_unlock(&share->intern_lock); + DBUG_RETURN(error); +} /* maria_lock_database */ + + +/**************************************************************************** + ** functions to read / write the state +****************************************************************************/ + +int _ma_readinfo(register MARIA_HA *info __attribute__ ((unused)), + int lock_type __attribute__ ((unused)), + int check_keybuffer __attribute__ ((unused))) +{ +#ifdef MARIA_EXTERNAL_LOCKING + DBUG_ENTER("_ma_readinfo"); + + if (info->lock_type == F_UNLCK) + { + MARIA_SHARE *share= info->s; + if (!share->tot_locks) + { + /* should not be done for transactional tables */ + if (_ma_state_info_read_dsk(share->kfile.file, &share->state)) + { + if (!my_errno) + my_errno= HA_ERR_FILE_TOO_SHORT; + DBUG_RETURN(1); + } + } + if (check_keybuffer) + VOID(_ma_test_if_changed(info)); + info->invalidator=share->invalidator; + } + else if (lock_type == F_WRLCK && info->lock_type == F_RDLCK) + { + my_errno=EACCES; /* Not allowed to change */ + DBUG_RETURN(-1); /* when have read_lock() */ + } + DBUG_RETURN(0); +#else + return 0; +#endif /* defined(MARIA_EXTERNAL_LOCKING) */ +} /* _ma_readinfo */ + + +/* + Every isam-function that updates the 
isam-database MUST end with this + request + + NOTES + my_errno is not changed if this succeeds! +*/ + +int _ma_writeinfo(register MARIA_HA *info, uint operation) +{ + int error,olderror; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_writeinfo"); + DBUG_PRINT("info",("operation: %u tot_locks: %u", operation, + share->tot_locks)); + + error=0; + if (share->tot_locks == 0 && !share->base.born_transactional) + { + /* transactional tables flush their state at Checkpoint */ + if (operation) + { /* Two threads can't be here */ + CRASH_IF_S3_TABLE(info->s); /* S3 readonly doesn't come here */ + + olderror= my_errno; /* Remember last error */ + +#ifdef MARIA_EXTERNAL_LOCKING + /* + The following only makes sense if we want to be allow two different + processes access the same table at the same time + */ + share->state.process= share->last_process= share->this_process; + share->state.unique= info->last_unique= info->this_unique; + share->state.update_count= info->last_loop= ++info->this_loop; +#endif + + if ((error= + _ma_state_info_write_sub(share->kfile.file, + &share->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET))) + olderror=my_errno; +#ifdef _WIN32 + if (maria_flush) + { + _commit(share->kfile.file); + _commit(info->dfile.file); + } +#endif + my_errno=olderror; + } + } + else if (operation) + share->changed= 1; /* Mark keyfile changed */ + DBUG_RETURN(error); +} /* _ma_writeinfo */ + + +/* + Test if an external process has changed the database + (Should be called after readinfo) +*/ + +int _ma_test_if_changed(register MARIA_HA *info) +{ +#ifdef MARIA_EXTERNAL_LOCKING + MARIA_SHARE *share= info->s; + if (share->state.process != share->last_process || + share->state.unique != info->last_unique || + share->state.update_count != info->last_loop) + { /* Keyfile has changed */ + DBUG_PRINT("info",("index file changed")); + if (share->state.process != share->this_process) + VOID(flush_pagecache_blocks(share->pagecache, &share->kfile, + FLUSH_RELEASE)); + 
share->last_process=share->state.process; + info->last_unique= share->state.unique; + info->last_loop= share->state.update_count; + info->update|= HA_STATE_WRITTEN; /* Must use file on next */ + info->data_changed= 1; /* For maria_is_changed */ + return 1; + } +#endif + return (!(info->update & HA_STATE_AKTIV) || + (info->update & (HA_STATE_WRITTEN | HA_STATE_DELETED | + HA_STATE_KEY_CHANGED))); +} /* _ma_test_if_changed */ + + +/* + Put a mark in the .MAI file that someone is updating the table + + DOCUMENTATION + state.open_count in the .MAI file is used the following way: + - For the first change of the .MYI file in this process open_count is + incremented by _ma_mark_file_changed(). (We have a write lock on the file + when this happens) + - In maria_close() it's decremented by _ma_decrement_open_count() if it + was incremented in the same process. + + This mean that if we are the only process using the file, the open_count + tells us if the MARIA file wasn't properly closed. (This is true if + my_disable_locking is set). + + open_count is not maintained on disk for temporary tables. +*/ + +#define _MA_ALREADY_MARKED_FILE_CHANGED \ + ((share->state.changed & STATE_CHANGED) && share->global_changed) + +int _ma_mark_file_changed(register MARIA_SHARE *share) +{ + if (!share->base.born_transactional) + { + if (!_MA_ALREADY_MARKED_FILE_CHANGED) + { + int res= _ma_mark_file_changed_now(share); + /* + Ensure that STATE_NOT_ANALYZED is reset on table changes + */ + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS); + return res; + } + } + else + { + /* + For transactional tables, the table is marked changed when the first page + is written. Here we just mark the state to be updated so that caller + can do 'analyze table' and find that is has changed before any pages + are written. + */ + if (! 
test_all_bits(share->state.changed, + (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS))) + { + mysql_mutex_lock(&share->intern_lock); + share->state.changed|=(STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS); + mysql_mutex_unlock(&share->intern_lock); + } + } + return 0; +} + +int _ma_mark_file_changed_now(register MARIA_SHARE *share) +{ + uchar buff[3]; + int error= 1; + DBUG_ENTER("_ma_mark_file_changed_now"); + + if (_MA_ALREADY_MARKED_FILE_CHANGED) + DBUG_RETURN(0); + mysql_mutex_lock(&share->intern_lock); /* recheck under mutex */ + if (! _MA_ALREADY_MARKED_FILE_CHANGED) + { + share->state.changed|=(STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_KEYS); + if (!share->global_changed) + { + share->changed= share->global_changed= 1; + share->state.open_count++; + } + /* + Temp tables don't need an open_count as they are removed on crash. + In theory transactional tables are fixed by log-based recovery, so don't + need an open_count either, but if recovery has failed and logs have been + removed (by maria-force-start-after-recovery-failures), we still need to + detect dubious tables. + If we didn't maintain open_count on disk for a table, after a crash + we wouldn't know if it was closed at crash time (thus does not need a + check) or not. So we would have to check all tables: overkill. 
+ */ + if (!share->temporary) + { + CRASH_IF_S3_TABLE(share); + mi_int2store(buff,share->state.open_count); + buff[2]=1; /* Mark that it's changed */ + if (my_pwrite(share->kfile.file, buff, sizeof(buff), + sizeof(share->state.header) + + MARIA_FILE_OPEN_COUNT_OFFSET, + MYF(MY_NABP))) + goto err; + } + /* Set uuid of file if not yet set (zerofilled file) */ + if (share->base.born_transactional && + !(share->state.org_changed & STATE_NOT_MOVABLE)) + { + CRASH_IF_S3_TABLE(share); + /* Lock table to current installation */ + if (_ma_set_uuid(share, 0) || + (share->state.create_rename_lsn == LSN_NEEDS_NEW_STATE_LSNS && + _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE, + trnman_get_min_trid(), + TRUE, TRUE))) + goto err; + share->state.changed|= STATE_NOT_MOVABLE; + share->state.org_changed|= STATE_NOT_MOVABLE; + } + } + error= 0; +err: + mysql_mutex_unlock(&share->intern_lock); + DBUG_RETURN(error); +#undef _MA_ALREADY_MARKED_FILE_CHANGED +} + +/* + Check that a region is all zero + + SYNOPSIS + check_if_zero() + pos Start of memory to check + length length of memory region + + NOTES + Used mainly to detect rows with wrong extent information +*/ + +my_bool _ma_check_if_zero(uchar *pos, size_t length) +{ + uchar *end; + for (end= pos+ length; pos != end ; pos++) + if (pos[0] != 0) + return 1; + return 0; +} + +/* + This is only called by close or by extra(HA_FLUSH) if the OS has the pwrite() + call. In these context the following code should be safe! + */ + +int _ma_decrement_open_count(MARIA_HA *info, my_bool lock_tables) +{ + uchar buff[2]; + register MARIA_SHARE *share= info->s; + int lock_error=0,write_error=0; + DBUG_ENTER("_ma_decrement_open_count"); + + if (share->global_changed) + { + uint old_lock=info->lock_type; + share->global_changed=0; + lock_error= (my_disable_locking || ! lock_tables ? 0 : + maria_lock_database(info, F_WRLCK)); + /* Its not fatal even if we couldn't get the lock ! 
*/ + if (share->state.open_count > 0) + { + CRASH_IF_S3_TABLE(share); + share->state.open_count--; + share->changed= 1; /* We have to update state */ + /* + For temporary tables that will just be deleted, we don't have + to decrement state. For transactional tables the state will be + updated in maria_close(). + */ + + if (!share->temporary && !share->now_transactional) + { + mi_int2store(buff,share->state.open_count); + write_error= (int) my_pwrite(share->kfile.file, buff, sizeof(buff), + sizeof(share->state.header) + + MARIA_FILE_OPEN_COUNT_OFFSET, + MYF(MY_NABP)); + } + } + if (!lock_error && !my_disable_locking && lock_tables) + lock_error=maria_lock_database(info,old_lock); + } + DBUG_RETURN(MY_TEST(lock_error || write_error)); +} + + +/** @brief mark file as crashed */ + +void _ma_mark_file_crashed(MARIA_SHARE *share) +{ + uchar buff[2]; + DBUG_ENTER("_ma_mark_file_crashed"); + + share->state.changed|= STATE_CRASHED; + if (share->no_status_updates) + DBUG_VOID_RETURN; /* Safety */ + + mi_int2store(buff, share->state.changed); + + /* + We can ignore the errors, as if the mark failed, there isn't anything + else we can do; The user should already have got an error that the + table was crashed. 
+ */ + (void) my_pwrite(share->kfile.file, buff, sizeof(buff), + sizeof(share->state.header) + + MARIA_FILE_CHANGED_OFFSET, + MYF(MY_NABP)); + DBUG_VOID_RETURN; +} + + +/** + @brief Set uuid of for a Maria file + + @fn _ma_set_uuid() + @param share Maria share + @param reset_uuid Instead of setting file to maria_uuid, set it to + 0 to mark it as movable +*/ + +my_bool _ma_set_uuid(MARIA_SHARE *share, my_bool reset_uuid) +{ + uchar buff[MY_UUID_SIZE], *uuid; + + uuid= maria_uuid; + if (reset_uuid) + { + bzero(buff, sizeof(buff)); + uuid= buff; + } + CRASH_IF_S3_TABLE(share); + return (my_bool) my_pwrite(share->kfile.file, uuid, MY_UUID_SIZE, + mi_uint2korr(share->state.header.base_pos), + MYF(MY_NABP)); +} diff --git a/storage/maria/ma_loghandler.c b/storage/maria/ma_loghandler.c new file mode 100644 index 00000000..bfeb5e0c --- /dev/null +++ b/storage/maria/ma_loghandler.c @@ -0,0 +1,9359 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin. 2010 Monty Program Ab. + Copyright (c) 2020, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_blockrec.h" /* for some constants and in-write hooks */ +#include "ma_key_recover.h" /* For some in-write hooks */ +#include "ma_checkpoint.h" +#include "ma_servicethread.h" +#include "ma_recovery.h" +#include "ma_loghandler_lsn.h" +#include "ma_recovery_util.h" + +/* + On Windows, neither my_open() nor mysql_file_sync() work for directories. + Also there is no need to flush filesystem changes ,i.e to sync() + directories. +*/ +#ifdef _WIN32 +#define sync_dir(A,B) 0 +#else +#define sync_dir(A,B) mysql_file_sync(A,B) +#endif + +/** + @file + @brief Module which writes and reads to a transaction log +*/ + +/* 0xFF can never be valid first byte of a chunk */ +#define TRANSLOG_FILLER 0xFF + +/* number of opened log files in the pagecache (should be at least 2) */ +#define OPENED_FILES_NUM 3 +#define CACHED_FILES_NUM 5 +#define CACHED_FILES_NUM_DIRECT_SEARCH_LIMIT 7 +#if CACHED_FILES_NUM > CACHED_FILES_NUM_DIRECT_SEARCH_LIMIT +#include <hash.h> +#include <m_ctype.h> +#endif + +/** @brief protects checkpoint_in_progress */ +static mysql_mutex_t LOCK_soft_sync; +/** @brief for killing the background checkpoint thread */ +static mysql_cond_t COND_soft_sync; +/** @brief control structure for checkpoint background thread */ +static MA_SERVICE_THREAD_CONTROL soft_sync_control= + {0, FALSE, FALSE, &LOCK_soft_sync, &COND_soft_sync}; + +uint log_purge_disabled= 0; + + +/* transaction log file descriptor */ +typedef struct st_translog_file +{ + uint32 number; + PAGECACHE_FILE handler; + my_bool was_recovered; + my_bool is_sync; +} TRANSLOG_FILE; + +/* records buffer size (should be TRANSLOG_PAGE_SIZE * n) */ +#define TRANSLOG_WRITE_BUFFER (1024*1024) +/* + pagecache_read/write/inject() use bmove512() on their 
buffers so those must + be long-aligned, which we guarantee by using the type below: +*/ +typedef union +{ + ulonglong dummy; + uchar buffer[TRANSLOG_PAGE_SIZE]; +} TRANSLOG_PAGE_SIZE_BUFF; + +#define MAX_TRUNSLOG_USED_BUFFERS 3 + +typedef struct +{ + struct st_translog_buffer *buff[MAX_TRUNSLOG_USED_BUFFERS]; + uint8 wrt_ptr; + uint8 unlck_ptr; +} TRUNSLOG_USED_BUFFERS; + +static void +used_buffs_init(TRUNSLOG_USED_BUFFERS *buffs) +{ + buffs->unlck_ptr= buffs->wrt_ptr= 0; +} + +static void +used_buffs_add(TRUNSLOG_USED_BUFFERS *buffs, + struct st_translog_buffer *buff); + +static void +used_buffs_register_unlock(TRUNSLOG_USED_BUFFERS *buffs, + struct st_translog_buffer *buff); + +static void +used_buffs_urgent_unlock(TRUNSLOG_USED_BUFFERS *buffs); + +/* min chunk length */ +#define TRANSLOG_MIN_CHUNK 3 +/* + Number of buffers used by loghandler + + Should be at least 4, because one thread can block up to 2 buffers in + normal circumstances (less then half of one and full other, or just + switched one and other), But if we met end of the file in the middle and + have to switch buffer it will be 3. + 1 buffer for flushing/writing. + We have a bigger number here for higher concurrency and to make division + faster. + + The number should be power of 2 to be fast. +*/ +#define TRANSLOG_BUFFERS_NO 8 +/* number of bytes (+ header) which can be unused on first page in sequence */ +#define TRANSLOG_MINCHUNK_CONTENT 1 +/* version of log file */ +#define TRANSLOG_VERSION_ID 10000 /* 1.00.00 */ + +#define TRANSLOG_PAGE_FLAGS 6 /* transaction log page flags offset */ + +/* Maximum length of compressed LSNs (the worst case of whole LSN storing) */ +#define COMPRESSED_LSN_MAX_STORE_SIZE (2 + LSN_STORE_SIZE) +#define MAX_NUMBER_OF_LSNS_PER_RECORD 2 + + +/* max lsn calculation for buffer */ +#define BUFFER_MAX_LSN(B) \ + ((B)->last_lsn == LSN_IMPOSSIBLE ? 
(B)->prev_last_lsn : (B)->last_lsn) + +/* log write buffer descriptor */ +struct st_translog_buffer +{ + /* + Cache for current log. Comes first to be aligned for bmove512() in + pagecache_inject() + */ + uchar buffer[TRANSLOG_WRITE_BUFFER]; + /* + Maximum LSN of records which ends in this buffer (or IMPOSSIBLE_LSN + if no LSNs ends here) + */ + LSN last_lsn; + /* last_lsn of previous buffer or IMPOSSIBLE_LSN if it is very first one */ + LSN prev_last_lsn; + /* This buffer offset in the file */ + TRANSLOG_ADDRESS offset; + /* + Next buffer offset in the file (it is not always offset + size, + in case of flush by LSN it can be offset + size - TRANSLOG_PAGE_SIZE) + */ + TRANSLOG_ADDRESS next_buffer_offset; + /* Previous buffer offset to detect it flush finish */ + TRANSLOG_ADDRESS prev_buffer_offset; + /* + If the buffer was forced to close it save value of its horizon + otherwise LSN_IMPOSSIBLE + */ + TRANSLOG_ADDRESS pre_force_close_horizon; + /* + How much is written (or will be written when copy_to_buffer_in_progress + become 0) to this buffer + */ + translog_size_t size; + /* + When moving from one log buffer to another, we write the last of the + previous buffer to file and then move to start using the new log + buffer. In the case of a part filed last page, this page is not moved + to the start of the new buffer but instead we set the 'skip_data' + variable to tell us how much data at the beginning of the buffer is not + relevant. + */ + uint skipped_data; + /* File handler for this buffer */ + TRANSLOG_FILE *file; + /* Threads which are waiting for buffer filling/freeing */ + mysql_cond_t waiting_filling_buffer; + /* + Number of records which are in copy progress. + + Controlled via translog_buffer_increase_writers() and + translog_buffer_decrease_writers(). + + 1 Simple case: translog_force_current_buffer_to_finish both called in + the same procedure. 
+ + 2 Simple case: translog_write_variable_record_1group: + translog_advance_pointer() increase writer of the buffer and + translog_buffer_decrease_writers() decrease it. + + Usual case: + 1) translog_advance_pointer (i.e. reserve place for future writing) + increase writers for all buffers where place reserved. + Simpliest case: just all space reserved in one buffer + complex case: end of the first buffer, all second buffer, beginning + of the third buffer. + 2) When we finish with writing translog_chaser_page_next() will be + called and unlock the buffer by decreasing number of writers. + */ + uint copy_to_buffer_in_progress; + /* list of waiting buffer ready threads */ + struct st_my_thread_var *waiting_flush; + /* + If true then previous buffer overlap with this one (due to flush of + loghandler, the last page of that buffer is the same as the first page + of this buffer) and have to be written first (because contain old + content of page which present in both buffers) + */ + my_bool overlay; + uint buffer_no; + /* + Lock for the buffer. + + Current buffer also lock the whole handler (if one want lock the handler + one should lock the current buffer). + + Buffers are locked only in one direction (with overflow and beginning + from the first buffer). If we keep lock on buffer N we can lock only + buffer N+1 (never N-1). + + One thread do not lock more then 2 buffer in a time, so to make dead + lock it should be N thread (where N equal number of buffers) takes one + buffer and try to lock next. But it is impossible because there is only + 2 cases when thread take 2 buffers: 1) one thread finishes current + buffer (where horizon is) and start next (to which horizon moves). 2) + flush start from buffer after current (oldest) and go till the current + crabbing by buffer sequence. And there is only one flush in a moment + (they are serialised). 
+ + Because of above and number of buffers equal 5 we can't get dead lock (it is + impossible to get all 5 buffers locked simultaneously). + */ + mysql_mutex_t mutex; + /* + Some thread is going to close the buffer and it should be + done only by that thread + */ + my_bool is_closing_buffer; + /* + Version of the buffer increases every time buffer the buffer flushed. + With file and offset it allow detect buffer changes + */ + uint8 ver; + + /* + When previous buffer sent to disk it set its address here to allow + to detect when it is done + (we have to keep it in this buffer to lock buffers only in one direction). + */ + TRANSLOG_ADDRESS prev_sent_to_disk; + mysql_cond_t prev_sent_to_disk_cond; +}; + + +struct st_buffer_cursor +{ + TRUNSLOG_USED_BUFFERS buffs; + /* pointer into the buffer */ + uchar *ptr; + /* current buffer */ + struct st_translog_buffer *buffer; + /* How many bytes we wrote on the current page */ + uint16 current_page_fill; + /* + How many times we write the page on the disk during flushing process + (for sector protection). + */ + uint16 write_counter; + /* previous write offset */ + uint16 previous_offset; + /* Number of current buffer */ + uint8 buffer_no; + /* + True if it is just filling buffer after advancing the pointer to + the horizon. 
+ */ + my_bool chaser; + /* + Is current page of the cursor already finished (sector protection + should be applied if it is needed) + */ + my_bool protected; +}; + + +typedef uint8 dirty_buffer_mask_t; + +struct st_translog_descriptor +{ + /* *** Parameters of the log handler *** */ + + /* Page cache for the log reads */ + PAGECACHE *pagecache; + uint flags; + /* File open flags */ + uint open_flags; + /* max size of one log size (for new logs creation) */ + uint32 log_file_max_size; + uint32 server_version; + /* server ID (used for replication) */ + uint32 server_id; + /* Loghandler's buffer capacity in case of chunk 2 filling */ + uint32 buffer_capacity_chunk_2; + /* + Half of the buffer capacity in case of chunk 2 filling, + used to decide will we write a record in one group or many. + It is written to the variable just to avoid devision every + time we need it. + */ + uint32 half_buffer_capacity_chunk_2; + /* Page overhead calculated by flags (whether CRC is enabled, etc) */ + uint16 page_overhead; + /* + Page capacity ("useful load") calculated by flags + (TRANSLOG_PAGE_SIZE - page_overhead-1) + */ + uint16 page_capacity_chunk_2; + /* Path to the directory where we store log store files */ + char directory[FN_REFLEN]; + + /* *** Current state of the log handler *** */ + /* list of opened files */ + DYNAMIC_ARRAY open_files; + /* min/max number of file in the array */ + uint32 max_file, min_file; + /* the opened files list guard */ + mysql_rwlock_t open_files_lock; + + /* + File descriptor of the directory where we store log files for syncing + it. 
+ */ + File directory_fd; + /* buffers for log writing */ + struct st_translog_buffer buffers[TRANSLOG_BUFFERS_NO]; + /* Mask where 1 in position N mean that buffer N is not flushed */ + dirty_buffer_mask_t dirty_buffer_mask; + /* The above variable protection */ + mysql_mutex_t dirty_buffer_mask_lock; + /* + horizon - visible end of the log (here is absolute end of the log: + position where next chunk can start + */ + TRANSLOG_ADDRESS horizon; + /* horizon buffer cursor */ + struct st_buffer_cursor bc; + /* maximum LSN of the current (not finished) file */ + LSN max_lsn; + + /* + Last flushed LSN (protected by log_flush_lock). + Pointers in the log ordered like this: + last_lsn_checked <= flushed <= sent_to_disk <= in_buffers_only <= + max_lsn <= horizon + */ + LSN flushed; + /* Last LSN sent to the disk (but maybe not written yet) */ + LSN sent_to_disk; + /* Horizon from which log started after initialization */ + TRANSLOG_ADDRESS log_start; + TRANSLOG_ADDRESS previous_flush_horizon; + /* All what is after this address is not sent to disk yet */ + TRANSLOG_ADDRESS in_buffers_only; + /* protection of sent_to_disk and in_buffers_only */ + mysql_mutex_t sent_to_disk_lock; + /* + Protect flushed (see above) and for flush serialization (will + be removed in v1.5 + */ + mysql_mutex_t log_flush_lock; + mysql_cond_t log_flush_cond; + mysql_cond_t new_goal_cond; + + /* Protects changing of headers of finished files (max_lsn) */ + mysql_mutex_t file_header_lock; + + /* + Sorted array (with protection) of files where we started writing process + and so we can't give last LSN yet + */ + mysql_mutex_t unfinished_files_lock; + DYNAMIC_ARRAY unfinished_files; + + /* + minimum number of still need file calculeted during last + translog_purge call + */ + uint32 min_need_file; + /* Purger data: minimum file in the log (or 0 if unknown) */ + uint32 min_file_number; + /* Protect purger from many calls and it's data */ + mysql_mutex_t purger_lock; + /* last low water mark checked */ 
+ LSN last_lsn_checked; + /** + Must be set to 0 under loghandler lock every time a new LSN + is generated. + */ + my_bool is_everything_flushed; + /* True when flush pass is in progress */ + my_bool flush_in_progress; + /* The flush number (used to distinguish two flushes goes one by one) */ + volatile int flush_no; + /* Next flush pass variables */ + TRANSLOG_ADDRESS next_pass_max_lsn; + pthread_t max_lsn_requester; +}; + +static struct st_translog_descriptor log_descriptor; + +ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE; +ulong log_file_size= TRANSLOG_FILE_SIZE; +/* sync() of log files directory mode */ +ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE; +ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE; +ulong maria_group_commit_interval= 0; + +/* Marker for end of log */ +static uchar end_of_log= 0; +#define END_OF_LOG &end_of_log +/** + Switch for "soft" sync (no real sync() but periodical sync by service + thread) +*/ +static volatile my_bool soft_sync= FALSE; +/** + Switch for "hard" group commit mode +*/ +static volatile my_bool hard_group_commit= FALSE; +/** + File numbers interval which have to be sync() +*/ +static uint32 soft_sync_min= 0; +static uint32 soft_sync_max= 0; +static uint32 soft_need_sync= 1; +/** + stores interval in microseconds +*/ +static uint32 group_commit_wait= 0; + +enum enum_translog_status translog_status= TRANSLOG_UNINITED; +ulonglong translog_syncs= 0; /* Number of sync()s */ + +/* time of last flush */ +static ulonglong flush_start= 0; + +/* chunk types */ +#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */ +#define TRANSLOG_CHUNK_FIXED (1 << 6) /* 1 (pseudo)fixed record (also LSN) */ +#define TRANSLOG_CHUNK_NOHDR (2 << 6) /* 2 no head chunk (till page end) */ +#define TRANSLOG_CHUNK_LNGTH (3 << 6) /* 3 chunk with chunk length */ +#define TRANSLOG_CHUNK_TYPE (3 << 6) /* Mask to get chunk type */ +#define TRANSLOG_REC_TYPE 0x3F /* Mask to get record type */ +#define TRANSLOG_CHUNK_0_CONT 0x3F /* the type to 
mark chunk 0 continue */ + +/* compressed (relative) LSN constants */ +#define TRANSLOG_CLSN_LEN_BITS 0xC0 /* Mask to get compressed LSN length */ + + +/* an array that maps id of a MARIA_SHARE to this MARIA_SHARE */ +static MARIA_SHARE **id_to_share= NULL; + +static my_bool translog_page_validator(int res, PAGECACHE_IO_HOOK_ARGS *args); + +static my_bool translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner); +static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected); +LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon); + + +/* + Initialize log_record_type_descriptors +*/ + +LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES]; + + +#ifndef DBUG_OFF + +#define translog_buffer_lock_assert_owner(B) \ + mysql_mutex_assert_owner(&(B)->mutex) +#define translog_lock_assert_owner() \ + mysql_mutex_assert_owner(&log_descriptor.bc.buffer->mutex) +void translog_lock_handler_assert_owner() +{ + translog_lock_assert_owner(); +} + +/** + @brief check the description table validity + + @param num how many records should be filled +*/ + +static uint max_allowed_translog_type= 0; + +void check_translog_description_table(int num) +{ + int i; + DBUG_ENTER("check_translog_description_table"); + DBUG_PRINT("enter", ("last record: %d", num)); + DBUG_ASSERT(num > 0); + /* last is reserved for extending the table */ + DBUG_ASSERT(num < LOGREC_NUMBER_OF_TYPES - 1); + DBUG_ASSERT(log_record_type_descriptor[0].rclass == LOGRECTYPE_NOT_ALLOWED); + max_allowed_translog_type= num; + + for (i= 0; i <= num; i++) + { + DBUG_PRINT("info", + ("record type: %d class: %d fixed: %u header: %u LSNs: %u " + "name: %s", + i, log_record_type_descriptor[i].rclass, + (uint)log_record_type_descriptor[i].fixed_length, + (uint)log_record_type_descriptor[i].read_header_len, + (uint)log_record_type_descriptor[i].compressed_LSN, + log_record_type_descriptor[i].name)); + switch (log_record_type_descriptor[i].rclass) { + case LOGRECTYPE_NOT_ALLOWED: + DBUG_ASSERT(i 
== 0); + break; + case LOGRECTYPE_VARIABLE_LENGTH: + DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == 0); + DBUG_ASSERT((log_record_type_descriptor[i].compressed_LSN == 0) || + ((log_record_type_descriptor[i].compressed_LSN == 1) && + (log_record_type_descriptor[i].read_header_len >= + LSN_STORE_SIZE)) || + ((log_record_type_descriptor[i].compressed_LSN == 2) && + (log_record_type_descriptor[i].read_header_len >= + LSN_STORE_SIZE * 2))); + break; + case LOGRECTYPE_PSEUDOFIXEDLENGTH: + DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == + log_record_type_descriptor[i].read_header_len); + DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN > 0); + DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN <= 2); + break; + case LOGRECTYPE_FIXEDLENGTH: + DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == + log_record_type_descriptor[i].read_header_len); + DBUG_ASSERT(log_record_type_descriptor[i].compressed_LSN == 0); + break; + default: + DBUG_ASSERT(0); + } + } + for (i= num + 1; i < LOGREC_NUMBER_OF_TYPES; i++) + { + DBUG_ASSERT(log_record_type_descriptor[i].rclass == + LOGRECTYPE_NOT_ALLOWED); + } + DBUG_VOID_RETURN; +} +#else +#define translog_buffer_lock_assert_owner(B) {} +#define translog_lock_assert_owner() {} +#endif + +static LOG_DESC INIT_LOGREC_RESERVED_FOR_CHUNKS23= +{LOGRECTYPE_NOT_ALLOWED, 0, 0, NULL, NULL, NULL, 0, + "reserved", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL }; + +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_HEAD= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_insert_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_TAIL= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_insert_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_NEW_ROW_HEAD= 
+{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_new_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_NEW_ROW_TAIL= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_new_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOBS= +{LOGRECTYPE_VARIABLE_LENGTH, 0, FILEID_STORE_SIZE, NULL, + write_hook_for_redo, NULL, 0, + "redo_insert_row_blobs", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_HEAD= +{LOGRECTYPE_FIXEDLENGTH, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_purge_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_TAIL= +{LOGRECTYPE_FIXEDLENGTH, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_purge_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_FREE_BLOCKS= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_free_blocks", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL= +{LOGRECTYPE_FIXEDLENGTH, + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + NULL, write_hook_for_redo, NULL, 0, + "redo_free_head_or_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +/* not yet used; for when we have versioning */ +static LOG_DESC INIT_LOGREC_REDO_DELETE_ROW= +{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0, + "redo_delete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +/** @todo RECOVERY BUG unused, remove? 
*/ +static LOG_DESC INIT_LOGREC_REDO_UPDATE_ROW_HEAD= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0, + "redo_update_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INDEX= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0, + "redo_index", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INDEX_NEW_PAGE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1, + NULL, write_hook_for_redo, NULL, 0, + "redo_index_new_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INDEX_FREE_PAGE= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + NULL, write_hook_for_redo, NULL, 0, + "redo_index_free_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_UNDELETE_ROW= +{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0, + "redo_undelete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_CLR_END= +{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE + FILEID_STORE_SIZE + + CLR_TYPE_STORE_SIZE, NULL, write_hook_for_clr_end, NULL, 1, + "clr_end", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_PURGE_END= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, NULL, NULL, 1, + "purge_end", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_ROW_INSERT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_undo_row_insert, NULL, 1, + "undo_row_insert", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_ROW_DELETE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_undo_row_delete, NULL, 1, + "undo_row_delete", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC 
INIT_LOGREC_UNDO_ROW_UPDATE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + NULL, write_hook_for_undo_row_update, NULL, 1, + "undo_row_update", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE, + NULL, write_hook_for_undo_key_insert, NULL, 1, + "undo_key_insert", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +/* This will never be in the log, only in the clr */ +static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE, + NULL, write_hook_for_undo_key, NULL, 1, + "undo_key_insert_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE, + NULL, write_hook_for_undo_key_delete, NULL, 1, + "undo_key_delete", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE, + NULL, write_hook_for_undo_key_delete, NULL, 1, + "undo_key_delete_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_PREPARE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "prepare", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_PREPARE_WITH_UNDO_PURGE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE, NULL, NULL, NULL, 1, + "prepare_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_COMMIT= +{LOGRECTYPE_FIXEDLENGTH, 0, 0, NULL, + write_hook_for_commit, NULL, 0, "commit", LOGREC_IS_GROUP_ITSELF, NULL, + NULL}; + +static LOG_DESC INIT_LOGREC_COMMIT_WITH_UNDO_PURGE= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, write_hook_for_commit, NULL, 1, + "commit_with_undo_purge", 
LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_CHECKPOINT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "checkpoint", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_CREATE_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 1 + 2, NULL, NULL, NULL, 0, +"redo_create_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_RENAME_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "redo_rename_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_DROP_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "redo_drop_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_DELETE_ALL= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE, + NULL, write_hook_for_redo_delete_all, NULL, 0, + "redo_delete_all", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_REPAIR_TABLE= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + 8 + 8, FILEID_STORE_SIZE + 8 + 8, + NULL, NULL, NULL, 0, + "redo_repair_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_FILE_ID= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 2, NULL, write_hook_for_file_id, NULL, 0, + "file_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_LONG_TRANSACTION_ID= +{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0, + "long_transaction_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_INCOMPLETE_LOG= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE, + NULL, NULL, NULL, 0, + "incomplete_log", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_INCOMPLETE_GROUP= +{LOGRECTYPE_FIXEDLENGTH, 0, 0, + NULL, NULL, NULL, 0, + "incomplete_group", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_UNDO_BULK_INSERT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE, + NULL, write_hook_for_undo_bulk_insert, NULL, 1, + "undo_bulk_insert", 
LOGREC_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_BITMAP_NEW_PAGE= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + NULL, NULL, NULL, 0, + "redo_create_bitmap", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_IMPORTED_TABLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "imported_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_DEBUG_INFO= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, + "info", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; + +const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL; + +void translog_table_init() +{ + int i; + log_record_type_descriptor[LOGREC_RESERVED_FOR_CHUNKS23]= + INIT_LOGREC_RESERVED_FOR_CHUNKS23; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_HEAD]= + INIT_LOGREC_REDO_INSERT_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_TAIL]= + INIT_LOGREC_REDO_INSERT_ROW_TAIL; + log_record_type_descriptor[LOGREC_REDO_NEW_ROW_HEAD]= + INIT_LOGREC_REDO_NEW_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_NEW_ROW_TAIL]= + INIT_LOGREC_REDO_NEW_ROW_TAIL; + log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_BLOBS]= + INIT_LOGREC_REDO_INSERT_ROW_BLOBS; + log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_HEAD]= + INIT_LOGREC_REDO_PURGE_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_TAIL]= + INIT_LOGREC_REDO_PURGE_ROW_TAIL; + log_record_type_descriptor[LOGREC_REDO_FREE_BLOCKS]= + INIT_LOGREC_REDO_FREE_BLOCKS; + log_record_type_descriptor[LOGREC_REDO_FREE_HEAD_OR_TAIL]= + INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL; + log_record_type_descriptor[LOGREC_REDO_DELETE_ROW]= + INIT_LOGREC_REDO_DELETE_ROW; + log_record_type_descriptor[LOGREC_REDO_UPDATE_ROW_HEAD]= + INIT_LOGREC_REDO_UPDATE_ROW_HEAD; + log_record_type_descriptor[LOGREC_REDO_INDEX]= + INIT_LOGREC_REDO_INDEX; + log_record_type_descriptor[LOGREC_REDO_INDEX_NEW_PAGE]= + INIT_LOGREC_REDO_INDEX_NEW_PAGE; + 
log_record_type_descriptor[LOGREC_REDO_INDEX_FREE_PAGE]= + INIT_LOGREC_REDO_INDEX_FREE_PAGE; + log_record_type_descriptor[LOGREC_REDO_UNDELETE_ROW]= + INIT_LOGREC_REDO_UNDELETE_ROW; + log_record_type_descriptor[LOGREC_CLR_END]= + INIT_LOGREC_CLR_END; + log_record_type_descriptor[LOGREC_PURGE_END]= + INIT_LOGREC_PURGE_END; + log_record_type_descriptor[LOGREC_UNDO_ROW_INSERT]= + INIT_LOGREC_UNDO_ROW_INSERT; + log_record_type_descriptor[LOGREC_UNDO_ROW_DELETE]= + INIT_LOGREC_UNDO_ROW_DELETE; + log_record_type_descriptor[LOGREC_UNDO_ROW_UPDATE]= + INIT_LOGREC_UNDO_ROW_UPDATE; + log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT]= + INIT_LOGREC_UNDO_KEY_INSERT; + log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT_WITH_ROOT]= + INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT; + log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE]= + INIT_LOGREC_UNDO_KEY_DELETE; + log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE_WITH_ROOT]= + INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT; + log_record_type_descriptor[LOGREC_PREPARE]= + INIT_LOGREC_PREPARE; + log_record_type_descriptor[LOGREC_PREPARE_WITH_UNDO_PURGE]= + INIT_LOGREC_PREPARE_WITH_UNDO_PURGE; + log_record_type_descriptor[LOGREC_COMMIT]= + INIT_LOGREC_COMMIT; + log_record_type_descriptor[LOGREC_COMMIT_WITH_UNDO_PURGE]= + INIT_LOGREC_COMMIT_WITH_UNDO_PURGE; + log_record_type_descriptor[LOGREC_CHECKPOINT]= + INIT_LOGREC_CHECKPOINT; + log_record_type_descriptor[LOGREC_REDO_CREATE_TABLE]= + INIT_LOGREC_REDO_CREATE_TABLE; + log_record_type_descriptor[LOGREC_REDO_RENAME_TABLE]= + INIT_LOGREC_REDO_RENAME_TABLE; + log_record_type_descriptor[LOGREC_REDO_DROP_TABLE]= + INIT_LOGREC_REDO_DROP_TABLE; + log_record_type_descriptor[LOGREC_REDO_DELETE_ALL]= + INIT_LOGREC_REDO_DELETE_ALL; + log_record_type_descriptor[LOGREC_REDO_REPAIR_TABLE]= + INIT_LOGREC_REDO_REPAIR_TABLE; + log_record_type_descriptor[LOGREC_FILE_ID]= + INIT_LOGREC_FILE_ID; + log_record_type_descriptor[LOGREC_LONG_TRANSACTION_ID]= + INIT_LOGREC_LONG_TRANSACTION_ID; + 
log_record_type_descriptor[LOGREC_INCOMPLETE_LOG]= + INIT_LOGREC_INCOMPLETE_LOG; + log_record_type_descriptor[LOGREC_INCOMPLETE_GROUP]= + INIT_LOGREC_INCOMPLETE_GROUP; + log_record_type_descriptor[LOGREC_UNDO_BULK_INSERT]= + INIT_LOGREC_UNDO_BULK_INSERT; + log_record_type_descriptor[LOGREC_REDO_BITMAP_NEW_PAGE]= + INIT_LOGREC_REDO_BITMAP_NEW_PAGE; + log_record_type_descriptor[LOGREC_IMPORTED_TABLE]= + INIT_LOGREC_IMPORTED_TABLE; + log_record_type_descriptor[LOGREC_DEBUG_INFO]= + INIT_LOGREC_DEBUG_INFO; + + for (i= LOGREC_FIRST_FREE; i < LOGREC_NUMBER_OF_TYPES; i++) + log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED; +#ifndef DBUG_OFF + check_translog_description_table(LOGREC_FIRST_FREE -1); +#endif +} + + +/* all possible flags page overheads */ +static uint page_overhead[TRANSLOG_FLAGS_NUM]; + +typedef struct st_translog_validator_data +{ + TRANSLOG_ADDRESS *addr; + my_bool was_recovered; +} TRANSLOG_VALIDATOR_DATA; + + +/* + Check cursor/buffer consistence + + SYNOPSIS + translog_check_cursor + cursor cursor which will be checked +*/ + +static void translog_check_cursor(struct st_buffer_cursor *cursor + __attribute__((unused))) +{ + DBUG_ASSERT(cursor->chaser || + ((ulong) (cursor->ptr - cursor->buffer->buffer) == + cursor->buffer->size)); + DBUG_ASSERT(cursor->buffer->buffer_no == cursor->buffer_no); + DBUG_ASSERT((cursor->ptr -cursor->buffer->buffer) %TRANSLOG_PAGE_SIZE == + cursor->current_page_fill % TRANSLOG_PAGE_SIZE); + DBUG_ASSERT(cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); +} + + +/** + @brief switch the loghandler in read only mode in case of write error +*/ + +void translog_stop_writing() +{ + DBUG_ENTER("translog_stop_writing"); + DBUG_PRINT("error", ("errno: %d my_errno: %d", errno, my_errno)); + translog_status= (translog_status == TRANSLOG_SHUTDOWN ? 
+ TRANSLOG_UNINITED : + TRANSLOG_READONLY); + log_descriptor.is_everything_flushed= 1; + log_descriptor.open_flags= O_BINARY | O_RDONLY; + DBUG_ASSERT(0); + DBUG_VOID_RETURN; +} + + +/* + @brief Get file name of the log by log number + + @param file_no Number of the log we want to open + @param path Pointer to buffer where file name will be + stored (must be FN_REFLEN bytes at least) + + @return pointer to path +*/ + +char *translog_filename_by_fileno(uint32 file_no, char *path) +{ + char buff[11], *end; + uint length; + DBUG_ENTER("translog_filename_by_fileno"); + DBUG_ASSERT(file_no <= 0xfffffff); + + /* log_descriptor.directory is already formatted */ + end= strxmov(path, log_descriptor.directory, "aria_log.0000000", NullS); + length= (uint) (int10_to_str(file_no, buff, 10) - buff); + strmov(end - length +1, buff); + + DBUG_PRINT("info", ("Path: '%s' path: %p", path, path)); + DBUG_RETURN(path); +} + + +/** + @brief Create log file with given number without cache + + @param file_no Number of the log we want to open + + retval -1 error + retval # file descriptor number +*/ + +static File create_logfile_by_number_no_cache(uint32 file_no) +{ + File file; + char path[FN_REFLEN]; + DBUG_ENTER("create_logfile_by_number_no_cache"); + + if (translog_status != TRANSLOG_OK) + DBUG_RETURN(-1); + + /* TODO: add O_DIRECT to open flags (when buffer is aligned) */ + if ((file= mysql_file_create(key_file_translog, + translog_filename_by_fileno(file_no, path), + 0, O_BINARY | O_RDWR | O_CLOEXEC, MYF(MY_WME))) < 0) + { + DBUG_PRINT("error", ("Error %d during creating file '%s'", errno, path)); + translog_stop_writing(); + DBUG_RETURN(-1); + } + if (sync_log_dir >= TRANSLOG_SYNC_DIR_NEWFILE && + sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))) + { + DBUG_PRINT("error", ("Error %d during syncing directory '%s'", + errno, log_descriptor.directory)); + mysql_file_close(file, MYF(0)); + translog_stop_writing(); + DBUG_RETURN(-1); + } + DBUG_PRINT("info", ("File: 
'%s' handler: %d", path, file)); + DBUG_RETURN(file); +} + +/** + @brief Open (not create) log file with given number without cache + + @param file_no Number of the log we want to open + + retval -1 error + retval # file descriptor number +*/ + +static File open_logfile_by_number_no_cache(uint32 file_no) +{ + File file; + char path[FN_REFLEN]; + DBUG_ENTER("open_logfile_by_number_no_cache"); + + /* TODO: add O_DIRECT to open flags (when buffer is aligned) */ + /* TODO: use mysql_file_create() */ + if ((file= mysql_file_open(key_file_translog, + translog_filename_by_fileno(file_no, path), + log_descriptor.open_flags | O_CLOEXEC, + MYF(MY_WME))) < 0) + { + DBUG_PRINT("error", ("Error %d during opening file '%s'", errno, path)); + DBUG_RETURN(-1); + } + DBUG_PRINT("info", ("File: '%s' handler: %d", path, file)); + DBUG_RETURN(file); +} + + +/** + @brief get file descriptor by given number using cache + + @param file_no Number of the log we want to open + + retval # file descriptor + retval NULL file is not opened +*/ + +static TRANSLOG_FILE *get_logfile_by_number(uint32 file_no) +{ + TRANSLOG_FILE *file; + DBUG_ENTER("get_logfile_by_number"); + mysql_rwlock_rdlock(&log_descriptor.open_files_lock); + if (log_descriptor.max_file - file_no >= + log_descriptor.open_files.elements) + { + DBUG_PRINT("info", ("File #%u is not opened", file_no)); + mysql_rwlock_unlock(&log_descriptor.open_files_lock); + DBUG_RETURN(NULL); + } + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + DBUG_ASSERT(log_descriptor.max_file >= file_no); + DBUG_ASSERT(log_descriptor.min_file <= file_no); + + file= *dynamic_element(&log_descriptor.open_files, + log_descriptor.max_file - file_no, TRANSLOG_FILE **); + mysql_rwlock_unlock(&log_descriptor.open_files_lock); + DBUG_PRINT("info", ("File %p File no: %u, File handler: %d", + file, file_no, + (file ? 
file->handler.file : -1))); + DBUG_ASSERT(!file || file->number == file_no); + DBUG_RETURN(file); +} + + +/** + @brief get current file descriptor + + retval # file descriptor +*/ + +static TRANSLOG_FILE *get_current_logfile() +{ + TRANSLOG_FILE *file; + DBUG_ENTER("get_current_logfile"); + mysql_rwlock_rdlock(&log_descriptor.open_files_lock); + DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu", + (ulong) log_descriptor.max_file, + (ulong) log_descriptor.min_file, + (ulong) log_descriptor.open_files.elements)); + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **); + mysql_rwlock_unlock(&log_descriptor.open_files_lock); + DBUG_RETURN(file); +} + +uchar maria_trans_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 11, '\001', 'M', 'A', 'R', 'I', 'A', + 'L', 'O', 'G' }; +#define LOG_HEADER_DATA_SIZE (sizeof(maria_trans_file_magic) + \ + 8 + 4 + 4 + 4 + 2 + 3 + \ + LSN_STORE_SIZE) + + +/* + Write log file page header in the just opened new log file + + SYNOPSIS + translog_write_file_header(); + + NOTES + First page is just a marker page; We don't store any real log data in it. 
+ + RETURN + 0 OK + 1 ERROR +*/ + +static my_bool translog_write_file_header() +{ + TRANSLOG_FILE *file; + ulonglong timestamp; + uchar page_buff[TRANSLOG_PAGE_SIZE], *page= page_buff; + my_bool rc; + DBUG_ENTER("translog_write_file_header"); + + /* file tag */ + memcpy(page, maria_trans_file_magic, sizeof(maria_trans_file_magic)); + page+= sizeof(maria_trans_file_magic); + /* timestamp */ + timestamp= my_hrtime().val; + int8store(page, timestamp); + page+= 8; + /* maria version */ + int4store(page, TRANSLOG_VERSION_ID); + page+= 4; + /* mysql version (MYSQL_VERSION_ID) */ + int4store(page, log_descriptor.server_version); + page+= 4; + /* server ID */ + int4store(page, log_descriptor.server_id); + page+= 4; + /* loghandler page_size */ + int2store(page, TRANSLOG_PAGE_SIZE - 1); + page+= 2; + /* file number */ + int3store(page, LSN_FILE_NO(log_descriptor.horizon)); + page+= 3; + lsn_store(page, LSN_IMPOSSIBLE); + page+= LSN_STORE_SIZE; + memset(page, TRANSLOG_FILLER, sizeof(page_buff) - (page- page_buff)); + + file= get_current_logfile(); + rc= my_pwrite(file->handler.file, page_buff, sizeof(page_buff), 0, + log_write_flags) != 0; + /* + Dropping the flag in such way can make false alarm: signalling than the + file in not sync when it is sync, but the situation is quite rare and + protections with mutexes give much more overhead to the whole engine + */ + file->is_sync= 0; + DBUG_RETURN(rc); +} + +/* + @brief write the new LSN on the given file header + + @param file The file descriptor + @param lsn That LSN which should be written + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_max_lsn_to_header(File file, LSN lsn) +{ + uchar lsn_buff[LSN_STORE_SIZE]; + my_bool rc; + DBUG_ENTER("translog_max_lsn_to_header"); + DBUG_PRINT("enter", ("File descriptor: %ld " + "lsn: " LSN_FMT, + (long) file, + LSN_IN_PARTS(lsn))); + + lsn_store(lsn_buff, lsn); + + rc= (my_pwrite(file, lsn_buff, + LSN_STORE_SIZE, + (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE), + 
log_write_flags) != 0 || + mysql_file_sync(file, MYF(MY_WME)) != 0); + /* + We should not increase counter in case of error above, but it is so + unlikely that we can ignore this case + */ + translog_syncs++; + DBUG_RETURN(rc); +} + + +/* + @brief Extract hander file information from loghandler file page + + @param desc header information descriptor to be filled with information + @param page_buff buffer with the page content +*/ + +void translog_interpret_file_header(LOGHANDLER_FILE_INFO *desc, + uchar *page_buff) +{ + uchar *ptr; + + ptr= page_buff + sizeof(maria_trans_file_magic); + desc->timestamp= uint8korr(ptr); + ptr+= 8; + desc->maria_version= uint4korr(ptr); + ptr+= 4; + desc->mysql_version= uint4korr(ptr); + ptr+= 4; + desc->server_id= uint4korr(ptr); + ptr+= 4; + desc->page_size= uint2korr(ptr) + 1; + ptr+= 2; + desc->file_number= uint3korr(ptr); + ptr+= 3; + desc->max_lsn= lsn_korr(ptr); +} + + +/* + @brief Read hander file information from loghandler file + + @param desc header information descriptor to be filled with information + @param file file descriptor to read + + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_read_file_header(LOGHANDLER_FILE_INFO *desc, File file) +{ + uchar page_buff[LOG_HEADER_DATA_SIZE]; + DBUG_ENTER("translog_read_file_header"); + + if (mysql_file_pread(file, page_buff, + sizeof(page_buff), 0, MYF(MY_FNABP | MY_WME))) + { + DBUG_PRINT("info", ("log read fail error: %d", my_errno)); + DBUG_RETURN(1); + } + translog_interpret_file_header(desc, page_buff); + DBUG_PRINT("info", ("timestamp: %llu aria ver: %lu mysql ver: %lu " + "server id %lu page size %lu file number %lu " + "max lsn: " LSN_FMT, + (ulonglong) desc->timestamp, + (ulong) desc->maria_version, + (ulong) desc->mysql_version, + (ulong) desc->server_id, + desc->page_size, (ulong) desc->file_number, + LSN_IN_PARTS(desc->max_lsn))); + DBUG_RETURN(0); +} + + +/* + @brief set the lsn to the files from_file - to_file if it is greater + then written in the file + + 
@param from_file first file number (min) + @param to_file last file number (max) + @param lsn the lsn for writing + @param is_locked true if current thread locked the log handler + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_set_lsn_for_files(uint32 from_file, uint32 to_file, + LSN lsn, my_bool is_locked) +{ + uint32 file; + DBUG_ENTER("translog_set_lsn_for_files"); + DBUG_PRINT("enter", ("From: %lu to: %lu lsn: " LSN_FMT " locked: %d", + (ulong) from_file, (ulong) to_file, + LSN_IN_PARTS(lsn), + is_locked)); + DBUG_ASSERT(from_file <= to_file); + DBUG_ASSERT(from_file > 0); /* we have not file 0 */ + + /* Checks the current file (not finished yet file) */ + if (!is_locked) + translog_lock(); + if (to_file == (uint32) LSN_FILE_NO(log_descriptor.horizon)) + { + if (likely(cmp_translog_addr(lsn, log_descriptor.max_lsn) > 0)) + log_descriptor.max_lsn= lsn; + to_file--; + } + if (!is_locked) + translog_unlock(); + + /* Checks finished files if they are */ + mysql_mutex_lock(&log_descriptor.file_header_lock); + for (file= from_file; file <= to_file; file++) + { + LOGHANDLER_FILE_INFO info; + File fd; + + fd= open_logfile_by_number_no_cache(file); + if ((fd < 0) || + ((translog_read_file_header(&info, fd) || + (cmp_translog_addr(lsn, info.max_lsn) > 0 && + translog_max_lsn_to_header(fd, lsn))) | + mysql_file_close(fd, MYF(MY_WME)))) + { + translog_stop_writing(); + mysql_mutex_unlock(&log_descriptor.file_header_lock); + DBUG_RETURN(1); + } + } + mysql_mutex_unlock(&log_descriptor.file_header_lock); + + DBUG_RETURN(0); +} + + +/* descriptor of file in unfinished_files */ +struct st_file_counter +{ + uint32 file; /* file number */ + uint32 counter; /* counter for started writes */ +}; + + +/* + @brief mark file "in progress" (for multi-group records) + + @param file log file number +*/ + +static void translog_mark_file_unfinished(uint32 file) +{ + ssize_t place, i; + struct st_file_counter fc, *fc_ptr; + + DBUG_ENTER("translog_mark_file_unfinished"); + 
DBUG_PRINT("enter", ("file: %lu", (ulong) file)); + + fc.file= file; fc.counter= 1; + mysql_mutex_lock(&log_descriptor.unfinished_files_lock); + + if (log_descriptor.unfinished_files.elements == 0) + { + insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc); + DBUG_PRINT("info", ("The first element inserted")); + goto end; + } + + for (place= log_descriptor.unfinished_files.elements - 1; + place >= 0; + place--) + { + fc_ptr= dynamic_element(&log_descriptor.unfinished_files, + place, struct st_file_counter *); + if (fc_ptr->file <= file) + break; + } + + if (place >= 0 && fc_ptr->file == file) + { + fc_ptr->counter++; + DBUG_PRINT("info", ("counter increased")); + goto end; + } + + if (place == (ssize_t)log_descriptor.unfinished_files.elements) + { + insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc); + DBUG_PRINT("info", ("The last element inserted")); + goto end; + } + /* shift and assign new element */ + insert_dynamic(&log_descriptor.unfinished_files, + (uchar*) + dynamic_element(&log_descriptor.unfinished_files, + log_descriptor.unfinished_files.elements- 1, + struct st_file_counter *)); + for(i= log_descriptor.unfinished_files.elements - 1; i > place; i--) + { + /* we do not use set_dynamic() to avoid unneeded checks */ + memcpy(dynamic_element(&log_descriptor.unfinished_files, + i, struct st_file_counter *), + dynamic_element(&log_descriptor.unfinished_files, + i + 1, struct st_file_counter *), + sizeof(struct st_file_counter)); + } + memcpy(dynamic_element(&log_descriptor.unfinished_files, + place + 1, struct st_file_counter *), + &fc, sizeof(struct st_file_counter)); +end: + mysql_mutex_unlock(&log_descriptor.unfinished_files_lock); + DBUG_VOID_RETURN; +} + + +/* + @brief remove file mark "in progress" (for multi-group records) + + @param file log file number +*/ + +static void translog_mark_file_finished(uint32 file) +{ + int i; + struct st_file_counter *UNINIT_VAR(fc_ptr); + DBUG_ENTER("translog_mark_file_finished"); + 
DBUG_PRINT("enter", ("file: %lu", (ulong) file)); + + mysql_mutex_lock(&log_descriptor.unfinished_files_lock); + + DBUG_ASSERT(log_descriptor.unfinished_files.elements > 0); + for (i= 0; + i < (int) log_descriptor.unfinished_files.elements; + i++) + { + fc_ptr= dynamic_element(&log_descriptor.unfinished_files, + i, struct st_file_counter *); + if (fc_ptr->file == file) + { + break; + } + } + DBUG_ASSERT(i < (int) log_descriptor.unfinished_files.elements); + + if (! --fc_ptr->counter) + delete_dynamic_element(&log_descriptor.unfinished_files, i); + mysql_mutex_unlock(&log_descriptor.unfinished_files_lock); + DBUG_VOID_RETURN; +} + + +/* + @brief get max LSN of the record which parts stored in this file + + @param file file number + + @return requested LSN or LSN_IMPOSSIBLE/LSN_ERROR + @retval LSN_IMPOSSIBLE File is still not finished + @retval LSN_ERROR Error opening file + @retval # LSN of the record which parts stored in this file +*/ + +LSN translog_get_file_max_lsn_stored(uint32 file) +{ + uint32 limit= FILENO_IMPOSSIBLE; + DBUG_ENTER("translog_get_file_max_lsn_stored"); + DBUG_PRINT("enter", ("file: %lu", (ulong)file)); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + mysql_mutex_lock(&log_descriptor.unfinished_files_lock); + + /* find file with minimum file number "in progress" */ + if (log_descriptor.unfinished_files.elements > 0) + { + struct st_file_counter *fc_ptr; + fc_ptr= dynamic_element(&log_descriptor.unfinished_files, + 0, struct st_file_counter *); + limit= fc_ptr->file; /* minimal file number "in progress" */ + } + mysql_mutex_unlock(&log_descriptor.unfinished_files_lock); + + /* + if there is no "in progress file" then unfinished file is in progress + for sure + */ + if (limit == FILENO_IMPOSSIBLE) + { + TRANSLOG_ADDRESS horizon= translog_get_horizon(); + limit= LSN_FILE_NO(horizon); + } + + if (file >= limit) + { + DBUG_PRINT("info", ("The file in in progress")); + DBUG_RETURN(LSN_IMPOSSIBLE); + } + + { 
+ LOGHANDLER_FILE_INFO info; + File fd; + + fd= open_logfile_by_number_no_cache(file); + if(fd < 0) + { + DBUG_PRINT("error", ("Can't open file")); + DBUG_RETURN(LSN_ERROR); + } + + if (translog_read_file_header(&info, fd)) + { + DBUG_PRINT("error", ("Can't read file header")); + info.max_lsn= LSN_ERROR; + } + + if (mysql_file_close(fd, MYF(MY_WME))) + { + DBUG_PRINT("error", ("Can't close file")); + info.max_lsn= LSN_ERROR; + } + + DBUG_PRINT("info", ("Max lsn: " LSN_FMT, LSN_IN_PARTS(info.max_lsn))); + DBUG_RETURN(info.max_lsn); + } +} + +/* + Initialize transaction log file buffer + + SYNOPSIS + translog_buffer_init() + buffer The buffer to initialize + num Number of this buffer + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num) +{ + DBUG_ENTER("translog_buffer_init"); + buffer->pre_force_close_horizon= + buffer->prev_last_lsn= buffer->last_lsn= + LSN_IMPOSSIBLE; + DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: %p", + buffer)); + + buffer->buffer_no= (uint8) num; + /* This Buffer File */ + buffer->file= NULL; + buffer->overlay= 0; + /* cache for current log */ + memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER); + /* Buffer size */ + buffer->size= 0; + buffer->skipped_data= 0; + /* cond of thread which is waiting for buffer filling */ + if (mysql_cond_init(key_TRANSLOG_BUFFER_waiting_filling_buffer, + &buffer->waiting_filling_buffer, 0)) + DBUG_RETURN(1); + /* Number of records which are in copy progress */ + buffer->copy_to_buffer_in_progress= 0; + /* list of waiting buffer ready threads */ + buffer->waiting_flush= 0; + /* + Buffers locked by the following mutex. As far as buffers create logical + circle (after last buffer goes first) it trigger false alarm of deadlock + detect system, so we remove check of deadlock for this buffers. Indeed + all mutex locks concentrated around current buffer except flushing + thread (but it is only one thread). 
One thread can't take more then + 2 buffer locks at once. So deadlock is impossible here. + + To prevent false alarm of dead lock detection we switch dead lock + detection for one buffer in the middle of the buffers chain. Excluding + only one of eight buffers from deadlock detection hardly can hide other + possible problems which include this mutexes. + */ + + if (mysql_mutex_init(key_TRANSLOG_BUFFER_mutex, + &buffer->mutex, MY_MUTEX_INIT_FAST) || + mysql_cond_init(key_TRANSLOG_BUFFER_prev_sent_to_disk_cond, + &buffer->prev_sent_to_disk_cond, 0)) + DBUG_RETURN(1); + mysql_mutex_setflags(&buffer->mutex, MYF_NO_DEADLOCK_DETECTION); + buffer->is_closing_buffer= 0; + buffer->prev_sent_to_disk= LSN_IMPOSSIBLE; + buffer->prev_buffer_offset= LSN_IMPOSSIBLE; + buffer->ver= 0; + DBUG_RETURN(0); +} + + +/* + @brief close transaction log file by descriptor + + @param file pagegecache file descriptor reference + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_close_log_file(TRANSLOG_FILE *file) +{ + int rc= 0; + flush_pagecache_blocks(log_descriptor.pagecache, &file->handler, + FLUSH_RELEASE); + /* + Sync file when we close it + TODO: sync only we have changed the log + */ + if (!file->is_sync) + { + rc= mysql_file_sync(file->handler.file, MYF(MY_WME)); + translog_syncs++; + } + rc|= mysql_file_close(file->handler.file, MYF(MY_WME)); + my_free(file); + return MY_TEST(rc); +} + + +/** + @brief Initializes TRANSLOG_FILE structure + + @param file reference on the file to initialize + @param number file number + @param is_sync is file synced on disk +*/ + +static void translog_file_init(TRANSLOG_FILE *file, uint32 number, + my_bool is_sync) +{ + pagecache_file_set_null_hooks(&file->handler); + file->handler.post_read_hook= translog_page_validator; + file->handler.flush_log_callback= maria_flush_log_for_page_none; + file->handler.callback_data= (uchar*)file; + + file->number= number; + file->was_recovered= 0; + file->is_sync= is_sync; +} 
+ + +/** + @brief Create and fill header of new file. + + @note the caller must call it right after it has increased + log_descriptor.horizon to the new file + (log_descriptor.horizon+= LSN_ONE_FILE) + + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_create_new_file() +{ + TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_FILE), + MYF(0)); + + TRANSLOG_FILE *old= get_current_logfile(); + uint32 file_no= LSN_FILE_NO(log_descriptor.horizon); + DBUG_ENTER("translog_create_new_file"); + + if (file == NULL) + goto error; + + /* + Writes max_lsn to the file header before finishing it (there is no need + to lock file header buffer because it is still unfinished file, so only + one thread can finish the file and nobody interested of LSN of current + (unfinished) file, because no one can purge it). + */ + if (translog_max_lsn_to_header(old->handler.file, log_descriptor.max_lsn)) + goto error; + + mysql_rwlock_wrlock(&log_descriptor.open_files_lock); + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + DBUG_ASSERT(file_no == log_descriptor.max_file + 1); + if (allocate_dynamic(&log_descriptor.open_files, + log_descriptor.max_file - log_descriptor.min_file + 2)) + goto error_lock; + + /* this call just expand the array */ + if (insert_dynamic(&log_descriptor.open_files, (uchar*)&file)) + goto error_lock; + + if ((file->handler.file= create_logfile_by_number_no_cache(file_no)) == -1) + goto error_lock; + translog_file_init(file, file_no, 0); + + log_descriptor.max_file++; + { + char *start= (char*) dynamic_element(&log_descriptor.open_files, 0, + TRANSLOG_FILE**); + memmove(start + sizeof(TRANSLOG_FILE*), start, + sizeof(TRANSLOG_FILE*) * + (log_descriptor.max_file - log_descriptor.min_file + 1 - 1)); + } + /* can't fail we because we expanded array */ + set_dynamic(&log_descriptor.open_files, (uchar*)&file, 0); + DBUG_ASSERT(log_descriptor.max_file - 
log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + mysql_rwlock_unlock(&log_descriptor.open_files_lock); + + DBUG_PRINT("info", ("file_no: %lu", (ulong)file_no)); + + if (translog_write_file_header()) + goto error; + + if (ma_control_file_write_and_force(last_checkpoint_lsn, file_no, + max_trid_in_control_file, + recovery_failures)) + goto error; + + DBUG_RETURN(0); + +error_lock: + mysql_rwlock_unlock(&log_descriptor.open_files_lock); +error: + translog_stop_writing(); + my_free(file); + DBUG_RETURN(1); +} + + +/** + @brief Locks the loghandler buffer. + + @param buffer This buffer which should be locked + + @note See comment before buffer 'mutex' variable. + + @retval 0 OK + @retval 1 Error +*/ + +static void translog_buffer_lock(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_lock"); + DBUG_PRINT("enter", + ("Lock buffer #%u: %p", buffer->buffer_no, + buffer)); + mysql_mutex_lock(&buffer->mutex); + DBUG_VOID_RETURN; +} + + +/* + Unlock the loghandler buffer + + SYNOPSIS + translog_buffer_unlock() + buffer This buffer which should be unlocked +*/ + +static void translog_buffer_unlock(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_unlock"); + DBUG_PRINT("enter", ("Unlock buffer... 
#%u (%p)", + (uint) buffer->buffer_no, buffer)); + + mysql_mutex_unlock(&buffer->mutex); + DBUG_VOID_RETURN; +} + + +/* + Write a header on the page + + SYNOPSIS + translog_new_page_header() + horizon Where to write the page + cursor Where to write the page + + NOTE + - space for page header should be checked before +*/ + +static uchar translog_sector_random; + +static void translog_new_page_header(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + uchar *ptr; + + DBUG_ENTER("translog_new_page_header"); + DBUG_ASSERT(cursor->ptr); + + cursor->protected= 0; + + ptr= cursor->ptr; + /* Page number */ + int3store(ptr, LSN_OFFSET(*horizon) / TRANSLOG_PAGE_SIZE); + ptr+= 3; + /* File number */ + int3store(ptr, LSN_FILE_NO(*horizon)); + ptr+= 3; + DBUG_ASSERT(TRANSLOG_PAGE_FLAGS == (ptr - cursor->ptr)); + cursor->ptr[TRANSLOG_PAGE_FLAGS]= (uchar) log_descriptor.flags; + ptr++; + if (log_descriptor.flags & TRANSLOG_PAGE_CRC) + { +#ifndef DBUG_OFF + DBUG_PRINT("info", ("write 0x11223344 CRC to " LSN_FMT, + LSN_IN_PARTS(*horizon))); + /* This will be overwritten by real CRC; This is just for debugging */ + int4store(ptr, 0x11223344); +#endif + /* CRC will be put when page is finished */ + ptr+= CRC_SIZE; + } + if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION) + { + /* + translog_sector_randmo works like "random" values producer because + it is enough to have such "random" for this purpose and it will + not interfere with higher level pseudo random value generator + */ + ptr[0]= translog_sector_random++; + ptr+= TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + } + { + size_t len= (ptr - cursor->ptr); + (*horizon)+= len; /* increasing the offset part of the address */ + cursor->current_page_fill= (uint16)len; + if (!cursor->chaser) + cursor->buffer->size+= (translog_size_t)len; + } + cursor->ptr= ptr; + DBUG_PRINT("info", ("NewP buffer #%u: %p chaser: %d Size: %lu (%lu) " + "Horizon: " LSN_FMT, + (uint) cursor->buffer->buffer_no, cursor->buffer, + 
cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer), + LSN_IN_PARTS(*horizon))); + translog_check_cursor(cursor); + DBUG_VOID_RETURN; +} + + +/* + Put sector protection on the page image + + SYNOPSIS + translog_put_sector_protection() + page reference on the page content + cursor cursor of the buffer + + NOTES + We put a sector protection on all following sectors on the page, + except the first sector that is protected by page header. +*/ + +static void translog_put_sector_protection(uchar *page, + struct st_buffer_cursor *cursor) +{ + uchar *table= page + log_descriptor.page_overhead - + TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + uint i, offset; + uint16 last_protected_sector= ((cursor->previous_offset - 1) / + DISK_DRIVE_SECTOR_SIZE); + uint16 start_sector= cursor->previous_offset / DISK_DRIVE_SECTOR_SIZE; + uint8 value= table[0] + cursor->write_counter; + DBUG_ENTER("translog_put_sector_protection"); + + if (start_sector == 0) + { + /* First sector is protected by file & page numbers in the page header. 
*/ + start_sector= 1; + } + + DBUG_PRINT("enter", ("Write counter:%u value:%u offset:%u, " + "last protected:%u start sector:%u", + (uint) cursor->write_counter, + (uint) value, + (uint) cursor->previous_offset, + (uint) last_protected_sector, (uint) start_sector)); + if (last_protected_sector == start_sector) + { + i= last_protected_sector; + offset= last_protected_sector * DISK_DRIVE_SECTOR_SIZE; + /* restore data, because we modified sector which was protected */ + if (offset < cursor->previous_offset) + page[offset]= table[i]; + } + for (i= start_sector, offset= start_sector * DISK_DRIVE_SECTOR_SIZE; + i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + i++, (offset+= DISK_DRIVE_SECTOR_SIZE)) + { + DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x", + i, offset, (uint) page[offset])); + table[i]= page[offset]; + page[offset]= value; + DBUG_PRINT("info", ("sector:%u offset:%u data 0x%x", + i, offset, (uint) page[offset])); + } + DBUG_VOID_RETURN; +} + + +/* + Calculate CRC32 of given area + + SYNOPSIS + translog_crc() + area Pointer of the area beginning + length The Area length + + RETURN + CRC32 +*/ + +static uint32 translog_crc(uchar *area, uint length) +{ + DBUG_ENTER("translog_crc"); + DBUG_RETURN(my_checksum(0L, area, length)); +} + + +/* + Finish current page with zeros + + SYNOPSIS + translog_finish_page() + horizon \ horizon & buffer pointers + cursor / +*/ + +static void translog_finish_page(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + uint16 left= TRANSLOG_PAGE_SIZE - cursor->current_page_fill; + uchar *page= cursor->ptr - cursor->current_page_fill; + DBUG_ENTER("translog_finish_page"); + DBUG_PRINT("enter", ("Buffer: #%u %p " + "Buffer addr: " LSN_FMT " " + "Page addr: " LSN_FMT " " + "size:%u (%u) Pg:%u left:%u", + (uint) cursor->buffer_no, cursor->buffer, + LSN_IN_PARTS(cursor->buffer->offset), + (uint)LSN_FILE_NO(*horizon), + (uint)(LSN_OFFSET(*horizon) - + cursor->current_page_fill), + (uint) cursor->buffer->size, + (uint) 
(cursor->ptr -cursor->buffer->buffer), + (uint) cursor->current_page_fill, (uint) left)); + DBUG_ASSERT(LSN_FILE_NO(*horizon) == LSN_FILE_NO(cursor->buffer->offset) + || translog_status == TRANSLOG_UNINITED); + if ((LSN_FILE_NO(*horizon) != LSN_FILE_NO(cursor->buffer->offset))) + DBUG_VOID_RETURN; // everything wrong do not write to awoid more problems + translog_check_cursor(cursor); + if (cursor->protected) + { + DBUG_PRINT("info", ("Already protected and finished")); + DBUG_VOID_RETURN; + } + cursor->protected= 1; + + DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE); + if (left != 0) + { + DBUG_PRINT("info", ("left: %u", (uint) left)); + memset(cursor->ptr, TRANSLOG_FILLER, left); + cursor->ptr+= left; + (*horizon)+= left; /* offset increasing */ + if (!cursor->chaser) + cursor->buffer->size+= left; + /* We are finishing the page so reset the counter */ + cursor->current_page_fill= 0; + DBUG_PRINT("info", ("Finish Page buffer #%u: %p " + "chaser: %d Size: %lu (%lu)", + (uint) cursor->buffer->buffer_no, + cursor->buffer, cursor->chaser, + (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + translog_check_cursor(cursor); + } + /* + When we are finishing the page other thread might not finish the page + header yet (in case if we started from the middle of the page) so we + have to read log_descriptor.flags but not the flags from the page. 
+ */ + if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION) + { + translog_put_sector_protection(page, cursor); + DBUG_PRINT("info", ("drop write_counter")); + cursor->write_counter= 0; + cursor->previous_offset= 0; + } + if (log_descriptor.flags & TRANSLOG_PAGE_CRC) + { + uint32 crc= translog_crc(page + log_descriptor.page_overhead, + TRANSLOG_PAGE_SIZE - + log_descriptor.page_overhead); + DBUG_PRINT("info", ("CRC: %lx", (ulong) crc)); + /* We have page number, file number and flag before crc */ + int4store(page + 3 + 3 + 1, crc); + } + DBUG_VOID_RETURN; +} + + +/* + @brief Wait until all threads have finished closing this buffer. + + @param buffer This buffer should be check +*/ + +static void translog_wait_for_closing(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_wait_for_closing"); + DBUG_PRINT("enter", ("Buffer #%u %p copies in progress: %u " + "is closing %u File: %d size: %lu", + (uint) buffer->buffer_no, buffer, + (uint) buffer->copy_to_buffer_in_progress, + (uint) buffer->is_closing_buffer, + (buffer->file ? buffer->file->handler.file : -1), + (ulong) buffer->size)); + translog_buffer_lock_assert_owner(buffer); + + while (buffer->is_closing_buffer) + { + DBUG_PRINT("info", ("wait for writers... buffer: #%u %p", + (uint) buffer->buffer_no, buffer)); + DBUG_ASSERT(buffer->file != NULL); + mysql_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex); + DBUG_PRINT("info", ("wait for writers done buffer: #%u %p", + (uint) buffer->buffer_no, buffer)); + } + + DBUG_VOID_RETURN; +} + + +/* + @brief Wait until all threads have finished filling this buffer. 
+ + @param buffer This buffer should be check +*/ + +static void translog_wait_for_writers(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_wait_for_writers"); + DBUG_PRINT("enter", ("Buffer #%u %p copies in progress: %u " + "is closing %u File: %d size: %lu", + (uint) buffer->buffer_no, buffer, + (uint) buffer->copy_to_buffer_in_progress, + (uint) buffer->is_closing_buffer, + (buffer->file ? buffer->file->handler.file : -1), + (ulong) buffer->size)); + translog_buffer_lock_assert_owner(buffer); + + while (buffer->copy_to_buffer_in_progress) + { + DBUG_PRINT("info", ("wait for writers... buffer: #%u %p", + (uint) buffer->buffer_no, buffer)); + DBUG_ASSERT(buffer->file != NULL); + mysql_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex); + DBUG_PRINT("info", ("wait for writers done buffer: #%u %p", + (uint) buffer->buffer_no, buffer)); + } + + DBUG_VOID_RETURN; +} + + +/* + + Wait for buffer to become free + + SYNOPSIS + translog_wait_for_buffer_free() + buffer The buffer we are waiting for + + NOTE + - this buffer should be locked +*/ + +static void translog_wait_for_buffer_free(struct st_translog_buffer *buffer) +{ + TRANSLOG_ADDRESS offset= buffer->offset; + TRANSLOG_FILE *file= buffer->file; + uint8 ver= buffer->ver; + DBUG_ENTER("translog_wait_for_buffer_free"); + DBUG_PRINT("enter", ("Buffer #%u %p copies in progress: %u " + "is closing %u File: %d size: %lu", + (uint) buffer->buffer_no, buffer, + (uint) buffer->copy_to_buffer_in_progress, + (uint) buffer->is_closing_buffer, + (buffer->file ? buffer->file->handler.file : -1), + (ulong) buffer->size)); + + translog_wait_for_writers(buffer); + + if (offset != buffer->offset || file != buffer->file || ver != buffer->ver) + DBUG_VOID_RETURN; /* the buffer if already freed */ + + while (buffer->file != NULL) + { + DBUG_PRINT("info", ("wait for writers... 
buffer: #%u %p", + (uint) buffer->buffer_no, buffer)); + mysql_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex); + DBUG_PRINT("info", ("wait for writers done. buffer: #%u %p", + (uint) buffer->buffer_no, buffer)); + } + DBUG_ASSERT(buffer->copy_to_buffer_in_progress == 0); + DBUG_VOID_RETURN; +} + + +/* + Initialize the cursor for a buffer + + SYNOPSIS + translog_cursor_init() + buffer The buffer + cursor It's cursor + buffer_no Number of buffer +*/ + +static void translog_cursor_init(struct st_buffer_cursor *cursor, + struct st_translog_buffer *buffer, + uint8 buffer_no) +{ + DBUG_ENTER("translog_cursor_init"); + cursor->ptr= buffer->buffer; + cursor->buffer= buffer; + cursor->buffer_no= buffer_no; + cursor->current_page_fill= 0; + cursor->chaser= (cursor != &log_descriptor.bc); + cursor->write_counter= 0; + cursor->previous_offset= 0; + cursor->protected= 0; + DBUG_VOID_RETURN; +} + + +/* + @brief Initialize buffer for the current file, and a cursor for this buffer. + + @param buffer The buffer + @param cursor It's cursor + @param buffer_no Number of buffer +*/ + +static void translog_start_buffer(struct st_translog_buffer *buffer, + struct st_buffer_cursor *cursor, + uint buffer_no) +{ + DBUG_ENTER("translog_start_buffer"); + DBUG_PRINT("enter", + ("Assign buffer: #%u (%p) offset: 0x%x(%u)", + (uint) buffer->buffer_no, buffer, + (uint) LSN_OFFSET(log_descriptor.horizon), + (uint) LSN_OFFSET(log_descriptor.horizon))); + DBUG_ASSERT(buffer_no == buffer->buffer_no); + buffer->pre_force_close_horizon= + buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE; + DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: %p", + buffer)); + buffer->offset= log_descriptor.horizon; + buffer->next_buffer_offset= LSN_IMPOSSIBLE; + buffer->file= get_current_logfile(); + buffer->overlay= 0; + buffer->size= 0; + buffer->skipped_data= 0; + translog_cursor_init(cursor, buffer, buffer_no); + DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: %p " + "chaser: %d 
Size: %lu (%lu)", + (long) (buffer->file ? buffer->file->number : 0), + (buffer->file ? buffer->file->handler.file : -1), + (uint) cursor->buffer->buffer_no, cursor->buffer, + cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + translog_check_cursor(cursor); + mysql_mutex_lock(&log_descriptor.dirty_buffer_mask_lock); + log_descriptor.dirty_buffer_mask|= (1 << buffer->buffer_no); + mysql_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock); + + DBUG_VOID_RETURN; +} + + +/* + @brief Switch to the next buffer in a chain. + + @param horizon \ Pointers on current position in file and buffer + @param cursor / + @param new_file Also start new file + + @note + - loghandler should be locked + - after return new and old buffer still are locked + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_buffer_next(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor, + my_bool new_file) +{ + uint old_buffer_no= cursor->buffer_no; + uint new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO; + struct st_translog_buffer *new_buffer= log_descriptor.buffers + new_buffer_no; + my_bool chasing= cursor->chaser; + DBUG_ENTER("translog_buffer_next"); + + DBUG_PRINT("info", ("horizon: " LSN_FMT " chasing: %d", + LSN_IN_PARTS(log_descriptor.horizon), chasing)); + + DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, *horizon) >= 0); + + translog_finish_page(horizon, cursor); + + if (!chasing) + { + translog_buffer_lock(new_buffer); +#ifndef DBUG_OFF + { + TRANSLOG_ADDRESS offset= new_buffer->offset; + TRANSLOG_FILE *file= new_buffer->file; + uint8 ver= new_buffer->ver; + translog_lock_assert_owner(); +#endif + translog_wait_for_buffer_free(new_buffer); +#ifndef DBUG_OFF + /* We keep the handler locked so nobody can start this new buffer */ + DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL && + (file == NULL ? 
ver : (uint8)(ver + 1)) == new_buffer->ver); + } +#endif + } + else + DBUG_ASSERT(new_buffer->file != NULL); + + if (new_file) + { + /* move the horizon to the next file and its header page */ + (*horizon)+= LSN_ONE_FILE; + (*horizon)= LSN_REPLACE_OFFSET(*horizon, TRANSLOG_PAGE_SIZE); + if (!chasing && translog_create_new_file()) + { + DBUG_RETURN(1); + } + } + + /* prepare next page */ + if (chasing) + translog_cursor_init(cursor, new_buffer, new_buffer_no); + else + { + translog_lock_assert_owner(); + translog_start_buffer(new_buffer, cursor, new_buffer_no); + new_buffer->prev_buffer_offset= + log_descriptor.buffers[old_buffer_no].offset; + new_buffer->prev_last_lsn= + BUFFER_MAX_LSN(log_descriptor.buffers + old_buffer_no); + } + log_descriptor.buffers[old_buffer_no].next_buffer_offset= new_buffer->offset; + DBUG_PRINT("info", ("prev_last_lsn set to " LSN_FMT " buffer:%p", + LSN_IN_PARTS(new_buffer->prev_last_lsn), + new_buffer)); + translog_new_page_header(horizon, cursor); + DBUG_RETURN(0); +} + + +/* + Sets max LSN sent to file, and address from which data is only in the buffer + + SYNOPSIS + translog_set_sent_to_disk() + buffer buffer which we have sent to disk + + TODO: use atomic operations if possible (64bit architectures?) 
+*/ + +static void translog_set_sent_to_disk(struct st_translog_buffer *buffer) +{ + LSN lsn= buffer->last_lsn; + TRANSLOG_ADDRESS in_buffers= buffer->next_buffer_offset; + + DBUG_ENTER("translog_set_sent_to_disk"); + mysql_mutex_lock(&log_descriptor.sent_to_disk_lock); + DBUG_PRINT("enter", ("lsn: " LSN_FMT " in_buffers: " LSN_FMT " " + "in_buffers_only: " LSN_FMT " start: " LSN_FMT " " + "sent_to_disk: " LSN_FMT, + LSN_IN_PARTS(lsn), + LSN_IN_PARTS(in_buffers), + LSN_IN_PARTS(log_descriptor.log_start), + LSN_IN_PARTS(log_descriptor.in_buffers_only), + LSN_IN_PARTS(log_descriptor.sent_to_disk))); + /* + We write sequentially (first part of following assert) but we rewrite + the same page in case we started mysql and shut it down immediately + (second part of the following assert) + */ + DBUG_ASSERT(cmp_translog_addr(lsn, log_descriptor.sent_to_disk) >= 0 || + cmp_translog_addr(lsn, log_descriptor.log_start) < 0); + log_descriptor.sent_to_disk= lsn; + /* LSN_IMPOSSIBLE == 0 => it will work for very first time */ + if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0) + { + log_descriptor.in_buffers_only= in_buffers; + DBUG_PRINT("info", ("set new in_buffers_only")); + } + mysql_mutex_unlock(&log_descriptor.sent_to_disk_lock); + DBUG_VOID_RETURN; +} + + +/* + Sets address from which data is only in the buffer + + SYNOPSIS + translog_set_only_in_buffers() + lsn LSN to assign + in_buffers to assign to in_buffers_only +*/ + +static void translog_set_only_in_buffers(TRANSLOG_ADDRESS in_buffers) +{ + DBUG_ENTER("translog_set_only_in_buffers"); + mysql_mutex_lock(&log_descriptor.sent_to_disk_lock); + DBUG_PRINT("enter", ("in_buffers: " LSN_FMT " " + "in_buffers_only: " LSN_FMT, + LSN_IN_PARTS(in_buffers), + LSN_IN_PARTS(log_descriptor.in_buffers_only))); + /* LSN_IMPOSSIBLE == 0 => it will work for very first time */ + if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0) + { + if (translog_status != TRANSLOG_OK) + goto end; + 
log_descriptor.in_buffers_only= in_buffers; + DBUG_PRINT("info", ("set new in_buffers_only")); + } +end: + mysql_mutex_unlock(&log_descriptor.sent_to_disk_lock); + DBUG_VOID_RETURN; +} + + +/* + Gets address from which data is only in the buffer + + SYNOPSIS + translog_only_in_buffers() + + RETURN + address from which data is only in the buffer +*/ + +static TRANSLOG_ADDRESS translog_only_in_buffers() +{ + register TRANSLOG_ADDRESS addr; + DBUG_ENTER("translog_only_in_buffers"); + mysql_mutex_lock(&log_descriptor.sent_to_disk_lock); + addr= log_descriptor.in_buffers_only; + mysql_mutex_unlock(&log_descriptor.sent_to_disk_lock); + DBUG_RETURN(addr); +} + + +/* + Get max LSN sent to file + + SYNOPSIS + translog_get_sent_to_disk() + + RETURN + max LSN send to file +*/ + +static LSN translog_get_sent_to_disk() +{ + register LSN lsn; + DBUG_ENTER("translog_get_sent_to_disk"); + mysql_mutex_lock(&log_descriptor.sent_to_disk_lock); + lsn= log_descriptor.sent_to_disk; + DBUG_PRINT("info", ("sent to disk up to " LSN_FMT, LSN_IN_PARTS(lsn))); + mysql_mutex_unlock(&log_descriptor.sent_to_disk_lock); + DBUG_RETURN(lsn); +} + + +/* + Get first chunk address on the given page + + SYNOPSIS + translog_get_first_chunk_offset() + page The page where to find first chunk + + RETURN + first chunk offset +*/ + +static my_bool translog_get_first_chunk_offset(uchar *page) +{ + DBUG_ENTER("translog_get_first_chunk_offset"); + DBUG_ASSERT(page[TRANSLOG_PAGE_FLAGS] < TRANSLOG_FLAGS_NUM); + DBUG_RETURN(page_overhead[page[TRANSLOG_PAGE_FLAGS]]); +} + + +/* + Write coded length of record + + SYNOPSIS + translog_write_variable_record_1group_code_len + dst Destination buffer pointer + length Length which should be coded + header_len Calculated total header length +*/ + +static void +translog_write_variable_record_1group_code_len(uchar *dst, + translog_size_t length, + uint16 header_len) +{ + switch (header_len) { + case 6: /* (5 + 1) */ + DBUG_ASSERT(length <= 250); + *dst= (uint8) length; + 
return; + case 8: /* (5 + 3) */ + DBUG_ASSERT(length <= 0xFFFF); + *dst= 251; + int2store(dst + 1, length); + return; + case 9: /* (5 + 4) */ + DBUG_ASSERT(length <= (ulong) 0xFFFFFF); + *dst= 252; + int3store(dst + 1, length); + return; + case 10: /* (5 + 5) */ + *dst= 253; + int4store(dst + 1, length); + return; + default: + DBUG_ASSERT(0); + } + return; +} + + +/* + Decode record data length and advance given pointer to the next field + + SYNOPSIS + translog_variable_record_1group_decode_len() + src The pointer to the pointer to the length beginning + + RETURN + decoded length +*/ + +static translog_size_t translog_variable_record_1group_decode_len(uchar **src) +{ + uint8 first= (uint8) (**src); + switch (first) { + case 251: + (*src)+= 3; + return (uint2korr((*src) - 2)); + case 252: + (*src)+= 4; + return (uint3korr((*src) - 3)); + case 253: + (*src)+= 5; + return (uint4korr((*src) - 4)); + case 254: + case 255: + DBUG_ASSERT(0); /* reserved for future use */ + return (0); + default: + (*src)++; + return (first); + } +} + + +/* + Get total length of this chunk (not only body) + + SYNOPSIS + translog_get_total_chunk_length() + page The page where chunk placed + offset Offset of the chunk on this place + + RETURN + total length of the chunk +*/ + +static uint16 translog_get_total_chunk_length(uchar *page, uint16 offset) +{ + DBUG_ENTER("translog_get_total_chunk_length"); + switch (page[offset] & TRANSLOG_CHUNK_TYPE) { + case TRANSLOG_CHUNK_LSN: + { + /* 0 chunk referred as LSN (head or tail) */ + translog_size_t rec_len; + uchar *start= page + offset; + uchar *ptr= start + 1 + 2; /* chunk type and short trid */ + uint16 chunk_len, header_len, page_rest; + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN")); + rec_len= translog_variable_record_1group_decode_len(&ptr); + chunk_len= uint2korr(ptr); + header_len= (uint16) (ptr -start) + 2; + DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u", + (ulong) rec_len, (uint) chunk_len, (uint) header_len)); + if 
(chunk_len) + { + DBUG_PRINT("info", ("chunk len: %u + %u = %u", + (uint) header_len, (uint) chunk_len, + (uint) (chunk_len + header_len))); + DBUG_RETURN(chunk_len + header_len); + } + page_rest= TRANSLOG_PAGE_SIZE - offset; + DBUG_PRINT("info", ("page_rest %u", (uint) page_rest)); + if (rec_len + header_len < page_rest) + DBUG_RETURN(rec_len + header_len); + DBUG_RETURN(page_rest); + } + case TRANSLOG_CHUNK_FIXED: + { + uchar *ptr; + uint type= page[offset] & TRANSLOG_REC_TYPE; + uint length; + int i; + /* 1 (pseudo)fixed record (also LSN) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED")); + DBUG_ASSERT(log_record_type_descriptor[type].rclass == + LOGRECTYPE_FIXEDLENGTH || + log_record_type_descriptor[type].rclass == + LOGRECTYPE_PSEUDOFIXEDLENGTH); + if (log_record_type_descriptor[type].rclass == LOGRECTYPE_FIXEDLENGTH) + { + DBUG_PRINT("info", + ("Fixed length: %u", + (uint) (log_record_type_descriptor[type].fixed_length + 3))); + DBUG_RETURN(log_record_type_descriptor[type].fixed_length + 3); + } + + ptr= page + offset + 3; /* first compressed LSN */ + length= log_record_type_descriptor[type].fixed_length + 3; + for (i= 0; i < log_record_type_descriptor[type].compressed_LSN; i++) + { + /* first 2 bits is length - 2 */ + uint len= (((uint8) (*ptr)) >> 6) + 2; + if (ptr[0] == 0 && ((uint8) ptr[1]) == 1) + len+= LSN_STORE_SIZE; /* case of full LSN storing */ + ptr+= len; + /* subtract saved bytes */ + length-= (LSN_STORE_SIZE - len); + } + DBUG_PRINT("info", ("Pseudo-fixed length: %u", length)); + DBUG_RETURN(length); + } + case TRANSLOG_CHUNK_NOHDR: + /* 2 no header chunk (till page end) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR length: %u", + (uint) (TRANSLOG_PAGE_SIZE - offset))); + DBUG_RETURN(TRANSLOG_PAGE_SIZE - offset); + case TRANSLOG_CHUNK_LNGTH: /* 3 chunk with chunk length */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH")); + DBUG_ASSERT(TRANSLOG_PAGE_SIZE - offset >= 3); + DBUG_PRINT("info", ("length: %u", uint2korr(page + offset + 1) + 3)); + 
DBUG_RETURN(uint2korr(page + offset + 1) + 3); + default: + DBUG_ASSERT(0); + DBUG_RETURN(0); + } +} + +/* + @brief Waits previous buffer flush finish + + @param buffer buffer for check + + @retval 0 previous buffer flushed and this thread have to flush this one + @retval 1 previous buffer flushed and this buffer flushed by other thread too +*/ + +my_bool translog_prev_buffer_flush_wait(struct st_translog_buffer *buffer) +{ + TRANSLOG_ADDRESS offset= buffer->offset; + TRANSLOG_FILE *file= buffer->file; + uint8 ver= buffer->ver; + DBUG_ENTER("translog_prev_buffer_flush_wait"); + DBUG_PRINT("enter", ("buffer: %p #%u offset: " LSN_FMT " " + "prev sent: " LSN_FMT " prev offset: " LSN_FMT, + buffer, (uint) buffer->buffer_no, + LSN_IN_PARTS(buffer->offset), + LSN_IN_PARTS(buffer->prev_sent_to_disk), + LSN_IN_PARTS(buffer->prev_buffer_offset))); + translog_buffer_lock_assert_owner(buffer); + if (buffer->prev_buffer_offset != buffer->prev_sent_to_disk) + { + do { + mysql_cond_wait(&buffer->prev_sent_to_disk_cond, &buffer->mutex); + if (buffer->file != file || buffer->offset != offset || + buffer->ver != ver) + DBUG_RETURN(1); /* some the thread flushed the buffer already */ + } while(buffer->prev_buffer_offset != buffer->prev_sent_to_disk); + } + DBUG_RETURN(0); +} + + +/* + Flush given buffer + + SYNOPSIS + translog_buffer_flush() + buffer This buffer should be flushed + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_buffer_flush(struct st_translog_buffer *buffer) +{ + uint32 i, pg; + TRANSLOG_ADDRESS offset= buffer->offset; + TRANSLOG_FILE *file= buffer->file; + uint8 ver= buffer->ver; + uint skipped_data; + DBUG_ENTER("translog_buffer_flush"); + DBUG_PRINT("enter", + ("Buffer: #%u %p file: %d offset: " LSN_FMT " size: %lu", + (uint) buffer->buffer_no, buffer, + buffer->file->handler.file, + LSN_IN_PARTS(buffer->offset), + (ulong) buffer->size)); + translog_buffer_lock_assert_owner(buffer); + + if (buffer->file == NULL) + DBUG_RETURN(0); + + 
translog_wait_for_writers(buffer); + + if (buffer->file != file || buffer->offset != offset || buffer->ver != ver) + DBUG_RETURN(0); /* some the thread flushed the buffer already */ + + if (buffer->is_closing_buffer) + { + /* some other flush in progress */ + translog_wait_for_closing(buffer); + if (buffer->file != file || buffer->offset != offset || buffer->ver != ver) + DBUG_RETURN(0); /* some the thread flushed the buffer already */ + } + + if (buffer->overlay && translog_prev_buffer_flush_wait(buffer)) + DBUG_RETURN(0); /* some the thread flushed the buffer already */ + + /* + Send page by page in the pagecache what we are going to write on the + disk + */ + file= buffer->file; + skipped_data= buffer->skipped_data; + DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE); + for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE; + i < buffer->size; + i+= TRANSLOG_PAGE_SIZE, pg++) + { +#ifdef DBUG_TRACE + TRANSLOG_ADDRESS addr= (buffer->offset + i); +#endif + DBUG_PRINT("info", ("send log form %lu till %lu address: " LSN_FMT " " + "page #: %lu buffer size: %lu buffer: %p", + (ulong) i, (ulong) (i + TRANSLOG_PAGE_SIZE), + LSN_IN_PARTS(addr), (ulong) pg, (ulong) buffer->size, + buffer)); + DBUG_ASSERT(log_descriptor.pagecache->block_size == TRANSLOG_PAGE_SIZE); + DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size); + if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN) + DBUG_RETURN(1); + if (pagecache_write_part(log_descriptor.pagecache, + &file->handler, pg, 3, + buffer->buffer + i, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DONE, 0, + LSN_IMPOSSIBLE, + skipped_data, + TRANSLOG_PAGE_SIZE - skipped_data)) + { + DBUG_PRINT("error", + ("Can't write page " LSN_FMT " to pagecache, error: %d", + buffer->file->number, + (uint)(LSN_OFFSET(buffer->offset)+ i), + my_errno)); + translog_stop_writing(); + DBUG_RETURN(1); + } + skipped_data= 0; + } + file->is_sync= 0; + if 
(my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data, + buffer->size - buffer->skipped_data, + LSN_OFFSET(buffer->offset) + buffer->skipped_data, + log_write_flags)) + { + DBUG_PRINT("error", ("Can't write buffer " LSN_FMT " size %lu " + "to the disk (%d)", + (uint) file->handler.file, + (uint) LSN_OFFSET(buffer->offset), + (ulong) buffer->size, errno)); + translog_stop_writing(); + DBUG_RETURN(1); + } + /* + Dropping the flag in such way can make false alarm: signalling than the + file in not sync when it is sync, but the situation is quite rare and + protections with mutexes give much more overhead to the whole engine + */ + file->is_sync= 0; + + if (LSN_OFFSET(buffer->last_lsn) != 0) /* if buffer->last_lsn is set */ + { + if (translog_prev_buffer_flush_wait(buffer)) + DBUG_RETURN(0); /* some the thread flushed the buffer already */ + translog_set_sent_to_disk(buffer); + } + else + translog_set_only_in_buffers(buffer->next_buffer_offset); + + /* say to next buffer that we are finished */ + { + struct st_translog_buffer *next_buffer= + log_descriptor.buffers + ((buffer->buffer_no + 1) % TRANSLOG_BUFFERS_NO); + if (likely(translog_status == TRANSLOG_OK)){ + translog_buffer_lock(next_buffer); + next_buffer->prev_sent_to_disk= buffer->offset; + translog_buffer_unlock(next_buffer); + mysql_cond_broadcast(&next_buffer->prev_sent_to_disk_cond); + } + else + { + /* + It is shutdown => + 1) there is only one thread + 2) mutexes of other buffers can be destroyed => we can't use them + */ + next_buffer->prev_sent_to_disk= buffer->offset; + } + } + /* Free buffer */ + buffer->file= NULL; + buffer->overlay= 0; + buffer->ver++; + mysql_mutex_lock(&log_descriptor.dirty_buffer_mask_lock); + log_descriptor.dirty_buffer_mask&= ~(1 << buffer->buffer_no); + mysql_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock); + mysql_cond_broadcast(&buffer->waiting_filling_buffer); + DBUG_RETURN(0); +} + + +/* + Recover page with sector protection (wipe out failed chunks) + + 
SYNOPSYS + translog_recover_page_up_to_sector() + page reference on the page + offset offset of failed sector + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_recover_page_up_to_sector(uchar *page, uint16 offset) +{ + uint16 chunk_offset= translog_get_first_chunk_offset(page), valid_chunk_end; + DBUG_ENTER("translog_recover_page_up_to_sector"); + DBUG_PRINT("enter", ("offset: %u first chunk: %u", + (uint) offset, (uint) chunk_offset)); + + while (chunk_offset < offset && page[chunk_offset] != TRANSLOG_FILLER) + { + uint16 chunk_length; + if ((chunk_length= + translog_get_total_chunk_length(page, chunk_offset)) == 0) + { + DBUG_PRINT("error", ("cant get chunk length (offset %u)", + (uint) chunk_offset)); + DBUG_RETURN(1); + } + DBUG_PRINT("info", ("chunk: offset: %u length %u", + (uint) chunk_offset, (uint) chunk_length)); + if (((ulong) chunk_offset) + ((ulong) chunk_length) > TRANSLOG_PAGE_SIZE) + { + DBUG_PRINT("error", ("damaged chunk (offset %u) in trusted area", + (uint) chunk_offset)); + DBUG_RETURN(1); + } + chunk_offset+= chunk_length; + } + + valid_chunk_end= chunk_offset; + /* end of trusted area - sector parsing */ + while (page[chunk_offset] != TRANSLOG_FILLER) + { + uint16 chunk_length; + if ((chunk_length= + translog_get_total_chunk_length(page, chunk_offset)) == 0) + break; + + DBUG_PRINT("info", ("chunk: offset: %u length %u", + (uint) chunk_offset, (uint) chunk_length)); + if (((ulong) chunk_offset) + ((ulong) chunk_length) > + (uint) (offset + DISK_DRIVE_SECTOR_SIZE)) + break; + + chunk_offset+= chunk_length; + valid_chunk_end= chunk_offset; + } + DBUG_PRINT("info", ("valid chunk end offset: %u", (uint) valid_chunk_end)); + + memset(page + valid_chunk_end, TRANSLOG_FILLER, + TRANSLOG_PAGE_SIZE - valid_chunk_end); + + DBUG_RETURN(0); +} + + +/** + @brief Checks and removes sector protection. + + @param page reference on the page content. + @param file transaction log descriptor. 
+ + @retvat 0 OK + @retval 1 Error +*/ + +static my_bool +translog_check_sector_protection(uchar *page, TRANSLOG_FILE *file) +{ + uint i, offset; + uchar *table= page + page_overhead[page[TRANSLOG_PAGE_FLAGS]] - + TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + uint8 current= table[0]; + DBUG_ENTER("translog_check_sector_protection"); + + for (i= 1, offset= DISK_DRIVE_SECTOR_SIZE; + i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + i++, offset+= DISK_DRIVE_SECTOR_SIZE) + { + /* + TODO: add chunk counting for "suspecting" sectors (difference is + more than 1-2), if difference more then present chunks then it is + the problem. + */ + uint8 test= page[offset]; + DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx " + "read: 0x%x stored: 0x%x%x", + i, offset, (ulong) current, + (uint) uint2korr(page + offset), (uint) table[i], + (uint) table[i + 1])); + /* + 3 is minimal possible record length. So we can have "distance" + between 2 sectors value more then DISK_DRIVE_SECTOR_SIZE / 3 + only if it is old value, i.e. the sector was not written. + */ + if (((test < current) && + ((uint)(0xFFL - current + test) > DISK_DRIVE_SECTOR_SIZE / 3)) || + ((test >= current) && + ((uint)(test - current) > DISK_DRIVE_SECTOR_SIZE / 3))) + { + if (translog_recover_page_up_to_sector(page, offset)) + DBUG_RETURN(1); + file->was_recovered= 1; + DBUG_RETURN(0); + } + + /* Restore value on the page */ + page[offset]= table[i]; + current= test; + DBUG_PRINT("info", ("sector: #%u offset: %u current: %lx " + "read: 0x%x stored: 0x%x", + i, offset, (ulong) current, + (uint) page[offset], (uint) table[i])); + } + DBUG_RETURN(0); +} + + +/** + @brief Log page validator (read callback) + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr Read callback data pointer (pointer to TRANSLOG_FILE) + + @todo: add turning loghandler to read-only mode after merging with + that patch. 
+ + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_page_validator(int res, PAGECACHE_IO_HOOK_ARGS *args) +{ + uchar *page= args->page; + pgcache_page_no_t page_no= args->pageno; + uint this_page_page_overhead; + uint flags; + uchar *page_pos; + TRANSLOG_FILE *data= (TRANSLOG_FILE *) args->data; +#ifdef DBUG_TRACE + pgcache_page_no_t offset= page_no * TRANSLOG_PAGE_SIZE; +#endif + DBUG_ENTER("translog_page_validator"); + + data->was_recovered= 0; + + if (res) + { + DBUG_RETURN(1); + } + + if ((pgcache_page_no_t) uint3korr(page) != page_no || + (uint32) uint3korr(page + 3) != data->number) + { + DBUG_PRINT("error", ("Page " LSN_FMT ": " + "page address written in the page is incorrect: " + "File %lu instead of %lu or page %lu instead of %lu", + (uint)data->number, (uint)offset, + (ulong) uint3korr(page + 3), (ulong) data->number, + (ulong) uint3korr(page), + (ulong) page_no)); + DBUG_RETURN(1); + } + flags= (uint)(page[TRANSLOG_PAGE_FLAGS]); + this_page_page_overhead= page_overhead[flags]; + if (flags & ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | + TRANSLOG_RECORD_CRC)) + { + DBUG_PRINT("error", ("Page " LSN_FMT ": " + "Garbage in the page flags field detected : %x", + (uint) data->number, (uint) offset, + (uint) flags)); + DBUG_RETURN(1); + } + page_pos= page + (3 + 3 + 1); + if (flags & TRANSLOG_PAGE_CRC) + { + uint32 crc= translog_crc(page + this_page_page_overhead, + TRANSLOG_PAGE_SIZE - + this_page_page_overhead); + if (crc != uint4korr(page_pos)) + { + DBUG_PRINT("error", ("Page " LSN_FMT ": " + "CRC mismatch: calculated: %lx on the page %lx", + (uint) data->number, (uint) offset, + (ulong) crc, (ulong) uint4korr(page_pos))); + DBUG_RETURN(1); + } + page_pos+= CRC_SIZE; /* Skip crc */ + } + if (flags & TRANSLOG_SECTOR_PROTECTION && + translog_check_sector_protection(page, data)) + { + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +/** + @brief Locks the loghandler. 
*/

void translog_lock()
{
  uint8 current_buffer;
  DBUG_ENTER("translog_lock");

  /*
    Locking the loghandler means locking the current buffer, but the current
    buffer can change while we wait for the lock, so we re-check it after
    the lock is taken and retry with the new buffer if it changed.
  */
  for (;;)
  {
    /*
      log_descriptor.bc.buffer_no is only one byte so its reading is
      an atomic operation
    */
    current_buffer= log_descriptor.bc.buffer_no;
    translog_buffer_lock(log_descriptor.buffers + current_buffer);
    if (log_descriptor.bc.buffer_no == current_buffer)
      break;
    translog_buffer_unlock(log_descriptor.buffers + current_buffer);
  }
  DBUG_VOID_RETURN;
}


/*
  Unlock the loghandler

  SYNOPSIS
    translog_unlock()

  NOTES
    Unlocks the buffer that log_descriptor.bc.buffer points to.
    The function returns nothing.
*/

void translog_unlock()
{
  translog_buffer_unlock(log_descriptor.bc.buffer);
}


/**
  @brief Get log page by file number and offset of the beginning of the page

  The page is taken from the in-memory log buffers when its address is not
  below translog_only_in_buffers(); otherwise it is read through the page
  cache.

  @param data            validator data, which contains the page address;
                         data->was_recovered is set on the page cache path
  @param buffer          buffer for page placing
                         (might not be used in some cache implementations)
  @param direct_link     if it is not NULL then caller can accept direct
                         link to the page cache

  @retval NULL Error
  @retval #    pointer to the page cache which should be used to read this page
*/

static uchar *translog_get_page(TRANSLOG_VALIDATOR_DATA *data, uchar *buffer,
                                PAGECACHE_BLOCK_LINK **direct_link)
{
  TRANSLOG_ADDRESS addr= *(data->addr), in_buffers;
  uint32 file_no= LSN_FILE_NO(addr);
  TRANSLOG_FILE *file;
  DBUG_ENTER("translog_get_page");
  DBUG_PRINT("enter", ("File: %u Offset: %u(0x%x)",
                       file_no,
                       (uint) LSN_OFFSET(addr),
                       (uint) LSN_OFFSET(addr)));

  /* it is really page address */
  DBUG_ASSERT(LSN_OFFSET(addr) % TRANSLOG_PAGE_SIZE == 0);
  if (direct_link)
    *direct_link= NULL;

restart:

  /* First check without the lock; it is repeated below under the lock */
  in_buffers= translog_only_in_buffers();
  DBUG_PRINT("info", ("in_buffers: " LSN_FMT,
                      LSN_IN_PARTS(in_buffers)));
  if (in_buffers != LSN_IMPOSSIBLE &&
      cmp_translog_addr(addr, in_buffers) >= 0)
  {
    translog_lock();
    DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0);
    /* recheck with locked loghandler */
    in_buffers= translog_only_in_buffers();
    if (cmp_translog_addr(addr, in_buffers) >= 0)
    {
      uint16 buffer_no= log_descriptor.bc.buffer_no;
#ifdef DBUG_ASSERT_EXISTS
      uint16 buffer_start= buffer_no;
#endif
      /* buffer_unlock always names the buffer whose lock we hold now */
      struct st_translog_buffer *buffer_unlock= log_descriptor.bc.buffer;
      struct st_translog_buffer *curr_buffer= log_descriptor.bc.buffer;
      /* Walk the buffers in a ring, starting from the current one */
      for (;;)
      {
        /*
          if the page is in the buffer and it is the last version of the
          page (in case of division the page by buffer flush)
        */
        if (curr_buffer->file != NULL &&
            cmp_translog_addr(addr, curr_buffer->offset) >= 0 &&
            cmp_translog_addr(addr,
                              (curr_buffer->next_buffer_offset ?
                               curr_buffer->next_buffer_offset:
                               curr_buffer->offset + curr_buffer->size)) < 0)
        {
          /* Remember the buffer identity to detect reuse during the wait */
          TRANSLOG_ADDRESS offset= curr_buffer->offset;
          TRANSLOG_FILE *fl= curr_buffer->file;
          uchar *from, *table= NULL;
          int is_last_unfinished_page;
          uint last_protected_sector= 0;
          uint skipped_data= curr_buffer->skipped_data;
          TRANSLOG_FILE file_copy;
          uint8 ver= curr_buffer->ver;
          translog_wait_for_writers(curr_buffer);
          if (offset != curr_buffer->offset || fl != curr_buffer->file ||
              ver != curr_buffer->ver)
          {
            /* The buffer changed while we waited: start from scratch */
            DBUG_ASSERT(buffer_unlock == curr_buffer);
            translog_buffer_unlock(buffer_unlock);
            goto restart;
          }
          DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
          from= curr_buffer->buffer + (addr - curr_buffer->offset);
          if (skipped_data && addr == curr_buffer->offset)
          {
            /*
              We read page part of which is not present in buffer,
              so we should read absent part from file (page cache actually)
            */
            file= get_logfile_by_number(file_no);
            DBUG_ASSERT(file != NULL);
            /*
              it's ok to not lock the page because:
                - The log handler has its own page cache.
                - There is only one thread that can access the log
                  cache at a time
            */
            /*
              NOTE(review): this early return leaves the buffer lock held
              (buffer_unlock is not released on this path) - confirm that
              this is intended.
            */
            if (!(buffer= pagecache_read(log_descriptor.pagecache,
                                         &file->handler,
                                         LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
                                         3, buffer,
                                         PAGECACHE_PLAIN_PAGE,
                                         PAGECACHE_LOCK_LEFT_UNLOCKED,
                                         NULL)))
              DBUG_RETURN(NULL);
          }
          else
            skipped_data= 0;  /* Read after skipped in buffer data */
          /*
            Now we have correct data in buffer up to 'skipped_data'. The
            following memcpy() will move the data from the internal buffer
            that was not yet on disk.
          */
          memcpy(buffer + skipped_data, from + skipped_data,
                 TRANSLOG_PAGE_SIZE - skipped_data);
          /*
            We can use copy then in translog_page_validator() because it
            do not put it permanently somewhere.
            We have to use copy because after releasing log lock we can't
            guaranty that the file still be present (in real life it will be
            present but theoretically possible that it will be released
            already from last files cache);
          */
          file_copy= *(curr_buffer->file);
          file_copy.handler.callback_data= (uchar*) &file_copy;
          /*
            The page is "last unfinished" when the write cursor
            (log_descriptor.bc.ptr) of the current buffer is inside it.
          */
          is_last_unfinished_page= ((log_descriptor.bc.buffer ==
                                     curr_buffer) &&
                                    (log_descriptor.bc.ptr >= from) &&
                                    (log_descriptor.bc.ptr <
                                     from + TRANSLOG_PAGE_SIZE));
          if (is_last_unfinished_page &&
              (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION))
          {
            last_protected_sector= ((log_descriptor.bc.previous_offset - 1) /
                                    DISK_DRIVE_SECTOR_SIZE);
            table= buffer + log_descriptor.page_overhead -
              TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
          }

          /* Everything below works on the private copy in 'buffer' */
          DBUG_ASSERT(buffer_unlock == curr_buffer);
          translog_buffer_unlock(buffer_unlock);
          if (is_last_unfinished_page)
          {
            uint i;
            /*
              This is last unfinished page => we should not check CRC and
              remove only that protection which already installed (no need
              to check it)

              We do not check the flag of sector protection, because if
              (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) is
              not set then last_protected_sector will be 0 so following loop
              will be never executed
            */
            DBUG_PRINT("info", ("This is last unfinished page, "
                                "last protected sector %u",
                                last_protected_sector));
            for (i= 1; i <= last_protected_sector; i++)
            {
              uint offset= i * DISK_DRIVE_SECTOR_SIZE;
              DBUG_PRINT("info", ("Sector %u: 0x%02x <- 0x%02x",
                                  i, buffer[offset],
                                  table[i]));
              buffer[offset]= table[i];
            }
          }
          else
          {
            /*
              The validator is expected to succeed here because we use
              in-memory data which is supposed to be correct; a failure
              triggers the assert and NULL is returned.
            */
            PAGECACHE_IO_HOOK_ARGS args;
            args.page= buffer;
            args.pageno= LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE;
            args.data= (uchar*) &file_copy;
            if (translog_page_validator(0, &args))
            {
              DBUG_ASSERT(0);
              buffer= NULL;
            }
          }
          DBUG_RETURN(buffer);
        }
        /* Not in this buffer: lock the next one before releasing this one */
        buffer_no= (buffer_no + 1) % TRANSLOG_BUFFERS_NO;
        curr_buffer= log_descriptor.buffers + buffer_no;
        translog_buffer_lock(curr_buffer);
        translog_buffer_unlock(buffer_unlock);
        buffer_unlock= curr_buffer;
        /* we can't make a full circle */
        DBUG_ASSERT(buffer_start != buffer_no);
      }
    }
    translog_unlock();
  }
  /* The page is not in the buffers: read it through the page cache */
  file= get_logfile_by_number(file_no);
  DBUG_ASSERT(file != NULL);
  buffer= pagecache_read(log_descriptor.pagecache, &file->handler,
                         LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
                         3, (direct_link ? NULL : buffer),
                         PAGECACHE_PLAIN_PAGE,
                         (direct_link ?
                          PAGECACHE_LOCK_READ :
                          PAGECACHE_LOCK_LEFT_UNLOCKED),
                         direct_link);
  DBUG_PRINT("info", ("Direct link is assigned to : %p * %p",
                      direct_link,
                      (direct_link ? *direct_link : NULL)));
  data->was_recovered= file->was_recovered;
  DBUG_RETURN(buffer);
}


/**
  @brief free direct log page link

  Releases the read lock and the pin on the page cache block; does nothing
  if the link is NULL.

  @param direct_link the direct log page link to be freed

*/

static void translog_free_link(PAGECACHE_BLOCK_LINK *direct_link)
{
  DBUG_ENTER("translog_free_link");
  DBUG_PRINT("info", ("Direct link: %p",
                      direct_link));
  if (direct_link)
    pagecache_unlock_by_link(log_descriptor.pagecache, direct_link,
                             PAGECACHE_LOCK_READ_UNLOCK, PAGECACHE_UNPIN,
                             LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, 0, FALSE);
  DBUG_VOID_RETURN;
}


/**
  @brief Finds last full page of the given log file.

  @param addr            address structure to fill with data, which contain
                         file number of the log file; on success it is
                         replaced with the address of the found page
  @param last_page_ok    Result of the check whether last page OK.
                         (for now we only check that the file length is
                         divisible by the page length).
  @param no_errors       suppress messages about non-critical errors

  @retval 0 OK
  @retval 1 Error
*/

static my_bool translog_get_last_page_addr(TRANSLOG_ADDRESS *addr,
                                           my_bool *last_page_ok,
                                           my_bool no_errors)
{
  char path[FN_REFLEN];
  uint32 rec_offset;
  my_off_t file_size;
  uint32 file_no= LSN_FILE_NO(*addr);
  TRANSLOG_FILE *file;
#ifdef DBUG_TRACE
  char buff[21];
#endif
  DBUG_ENTER("translog_get_last_page_addr");

  if (likely((file= get_logfile_by_number(file_no)) != NULL))
  {
    /*
      This function used only during initialization of loghandler or in
      scanner (which mean we need read that part of the log), so the
      requested log file have to be opened and can't be freed after
      returning pointer on it (file_size).
    */
    file_size= mysql_file_seek(file->handler.file, 0, SEEK_END, MYF(0));
  }
  else
  {
    /*
      This branch is used only during very early initialization
      when files are not opened.
    */
    File fd;
    if ((fd= mysql_file_open(key_file_translog,
                             translog_filename_by_fileno(file_no, path),
                             O_RDONLY | O_CLOEXEC,
                             (no_errors ? MYF(0) : MYF(MY_WME)))) < 0)
    {
      my_errno= errno;
      DBUG_PRINT("error", ("Error %d during opening file #%d",
                           errno, file_no));
      DBUG_RETURN(1);
    }
    file_size= mysql_file_seek(fd, 0, SEEK_END, MYF(0));
    mysql_file_close(fd, MYF(0));
  }
  DBUG_PRINT("info", ("File size: %s", llstr(file_size, buff)));
  if (file_size == MY_FILEPOS_ERROR)
    DBUG_RETURN(1);
  DBUG_ASSERT(file_size < 0xffffffffULL);
  if (((uint32)file_size) > TRANSLOG_PAGE_SIZE)
  {
    /* Offset of the last page that is completely present in the file */
    rec_offset= (((((uint32)file_size) / TRANSLOG_PAGE_SIZE) - 1) *
                 TRANSLOG_PAGE_SIZE);
    /* OK only if the file ends exactly at the end of that page */
    *last_page_ok= (((uint32)file_size) == rec_offset + TRANSLOG_PAGE_SIZE);
  }
  else
  {
    /* The file holds at most one page: report offset 0 and "not OK" */
    *last_page_ok= 0;
    rec_offset= 0;
  }
  *addr= MAKE_LSN(file_no, rec_offset);
  DBUG_PRINT("info", ("Last page: 0x%lx ok: %d", (ulong) rec_offset,
                      *last_page_ok));
  DBUG_RETURN(0);
}


/**
  @brief Get number bytes for record length storing

  @param length          Record length which will be encoded

  @return 1,3,4,5 - number of bytes to store given length
    (1 for lengths below 250, 3 below 0xFFFF, 4 below 0xFFFFFF, 5 otherwise)
*/

static uint translog_variable_record_length_bytes(translog_size_t length)
{
  if (length < 250)
    return 1;
  if (length < 0xFFFF)
    return 3;
  if (length < (ulong) 0xFFFFFF)
    return 4;
  return 5;
}


/**
  @brief Gets header of this chunk.
+ + @param chunk The pointer to the chunk beginning + + @retval # total length of the chunk + @retval 0 Error +*/ + +static uint16 translog_get_chunk_header_length(uchar *chunk) +{ + DBUG_ENTER("translog_get_chunk_header_length"); + switch (*chunk & TRANSLOG_CHUNK_TYPE) { + case TRANSLOG_CHUNK_LSN: + { + /* 0 chunk referred as LSN (head or tail) */ + translog_size_t rec_len __attribute__((unused)); + uchar *start= chunk; + uchar *ptr= start + 1 + 2; + uint16 chunk_len, header_len; + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN")); + rec_len= translog_variable_record_1group_decode_len(&ptr); + chunk_len= uint2korr(ptr); + header_len= (uint16) (ptr - start) +2; + DBUG_PRINT("info", ("rec len: %lu chunk len: %u header len: %u", + (ulong) rec_len, (uint) chunk_len, (uint) header_len)); + if (chunk_len) + { + /* TODO: fine header end */ + /* + The last chunk of multi-group record can be base for it header + calculation (we skip to the first group to read the header) so if we + stuck here something is wrong. + */ + DBUG_ASSERT(0); + DBUG_RETURN(0); /* Keep compiler happy */ + } + DBUG_RETURN(header_len); + } + case TRANSLOG_CHUNK_FIXED: + { + /* 1 (pseudo)fixed record (also LSN) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED = 3")); + DBUG_RETURN(3); + } + case TRANSLOG_CHUNK_NOHDR: + /* 2 no header chunk (till page end) */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR = 1")); + DBUG_RETURN(1); + break; + case TRANSLOG_CHUNK_LNGTH: + /* 3 chunk with chunk length */ + DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH = 3")); + DBUG_RETURN(3); + break; + } + DBUG_ASSERT(0); + DBUG_RETURN(0); /* Keep compiler happy */ +} + + +/** + @brief Truncate the log to the given address. Used during the startup if the + end of log if corrupted. 
+ + @param addr new horizon + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_truncate_log(TRANSLOG_ADDRESS addr) +{ + uchar *page; + TRANSLOG_ADDRESS current_page; + uint32 next_page_offset, page_rest; + uint32 i; + File fd; + int rc; + TRANSLOG_VALIDATOR_DATA data; + char path[FN_REFLEN]; + uchar page_buff[TRANSLOG_PAGE_SIZE]; + DBUG_ENTER("translog_truncate_log"); + /* TODO: write warning to the client */ + DBUG_PRINT("warning", ("removing all records from " LSN_FMT " " + "till " LSN_FMT, + LSN_IN_PARTS(addr), + LSN_IN_PARTS(log_descriptor.horizon))); + DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0); + /* remove files between the address and horizon */ + for (i= LSN_FILE_NO(addr) + 1; i <= LSN_FILE_NO(log_descriptor.horizon); i++) + if (mysql_file_delete(key_file_translog, + translog_filename_by_fileno(i, path), MYF(MY_WME))) + { + translog_unlock(); + DBUG_RETURN(1); + } + + /* truncate the last file up to the last page */ + next_page_offset= LSN_OFFSET(addr); + next_page_offset= (next_page_offset - + ((next_page_offset - 1) % TRANSLOG_PAGE_SIZE + 1) + + TRANSLOG_PAGE_SIZE); + page_rest= next_page_offset - LSN_OFFSET(addr); + memset(page_buff, TRANSLOG_FILLER, page_rest); + rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 || + ((mysql_file_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) || + (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr), + log_write_flags)) || + mysql_file_sync(fd, MYF(MY_WME))))); + translog_syncs++; + rc|= (fd > 0 && mysql_file_close(fd, MYF(MY_WME))); + if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS) + { + rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD)); + translog_syncs++; + } + if (rc) + DBUG_RETURN(1); + + /* fix the horizon */ + log_descriptor.horizon= addr; + /* fix the buffer data */ + current_page= MAKE_LSN(LSN_FILE_NO(addr), (next_page_offset - + TRANSLOG_PAGE_SIZE)); + data.addr= ¤t_page; + if ((page= translog_get_page(&data, 
log_descriptor.buffers->buffer, NULL)) == + NULL) + DBUG_RETURN(1); + if (page != log_descriptor.buffers->buffer) + memcpy(log_descriptor.buffers->buffer, page, TRANSLOG_PAGE_SIZE); + log_descriptor.bc.buffer->offset= current_page; + log_descriptor.bc.buffer->size= LSN_OFFSET(addr) - LSN_OFFSET(current_page); + log_descriptor.bc.ptr= + log_descriptor.buffers->buffer + log_descriptor.bc.buffer->size; + log_descriptor.bc.current_page_fill= log_descriptor.bc.buffer->size; + DBUG_RETURN(0); +} + + +/** + Applies function 'callback' to all files (in a directory) which + name looks like a log's name (aria_log.[0-9]{7}). + If 'callback' returns TRUE this interrupts the walk and returns + TRUE. Otherwise FALSE is returned after processing all log files. + It cannot just use log_descriptor.directory because that may not yet have + been initialized. + + @param directory directory to scan + @param callback function to apply; is passed directory and base + name of found file +*/ + +my_bool translog_walk_filenames(const char *directory, + my_bool (*callback)(const char *, + const char *)) +{ + MY_DIR *dirp; + size_t i; + my_bool rc= FALSE; + + /* Finds and removes transaction log files */ + if (!(dirp = my_dir(directory, MYF(MY_DONT_SORT)))) + return FALSE; + + for (i= 0; i < dirp->number_of_files; i++) + { + char *file= dirp->dir_entry[i].name; + if (strncmp(file, "aria_log.", 10) == 0 && + file[10] >= '0' && file[10] <= '9' && + file[11] >= '0' && file[11] <= '9' && + file[12] >= '0' && file[12] <= '9' && + file[13] >= '0' && file[13] <= '9' && + file[14] >= '0' && file[14] <= '9' && + file[15] >= '0' && file[15] <= '9' && + file[16] >= '0' && file[16] <= '9' && + file[17] >= '0' && file[17] <= '9' && + file[18] == '\0' && (*callback)(directory, file)) + { + rc= TRUE; + break; + } + } + my_dirend(dirp); + return rc; +} + + +/** + @brief Fills table of dependence length of page header from page flags +*/ + +void translog_fill_overhead_table() +{ + uint i; + for (i= 0; i < 
TRANSLOG_FLAGS_NUM; i++) + { + page_overhead[i]= 7; + if (i & TRANSLOG_PAGE_CRC) + page_overhead[i]+= CRC_SIZE; + if (i & TRANSLOG_SECTOR_PROTECTION) + page_overhead[i]+= TRANSLOG_PAGE_SIZE / + DISK_DRIVE_SECTOR_SIZE; + } +} + + +/** + Callback to find first log in directory. +*/ + +static my_bool translog_callback_search_first(const char *directory + __attribute__((unused)), + const char *filename + __attribute__((unused))) +{ + return TRUE; +} + + +/** + @brief Checks that chunk is LSN one + + @param type type of the chunk + + @retval 1 the chunk is LNS + @retval 0 the chunk is not LSN +*/ + +static my_bool translog_is_LSN_chunk(uchar type) +{ + DBUG_ENTER("translog_is_LSN_chunk"); + DBUG_PRINT("info", ("byte: %x chunk type: %u record type: %u", + type, type >> 6, type & TRANSLOG_REC_TYPE)); + DBUG_RETURN(((type & TRANSLOG_CHUNK_TYPE) == TRANSLOG_CHUNK_FIXED) || + (((type & TRANSLOG_CHUNK_TYPE) == TRANSLOG_CHUNK_LSN) && + ((type & TRANSLOG_REC_TYPE)) != TRANSLOG_CHUNK_0_CONT)); +} + + +/** + @brief Initialize transaction log + + @param directory Directory where log files are put + @param log_file_max_size max size of one log size (for new logs creation) + @param server_version version of MySQL server (MYSQL_VERSION_ID) + @param server_id server ID (replication & Co) + @param pagecache Page cache for the log reads + @param flags flags (TRANSLOG_PAGE_CRC, TRANSLOG_SECTOR_PROTECTION + TRANSLOG_RECORD_CRC) + @param read_only Put transaction log in read-only mode + @param init_table_func function to initialize record descriptors table + @param no_errors suppress messages about non-critical errors + + @todo + Free used resources in case of error. 
+ + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_init_with_table(const char *directory, + uint32 log_file_max_size, + uint32 server_version, + uint32 server_id, PAGECACHE *pagecache, + uint flags, my_bool readonly, + void (*init_table_func)(), + my_bool no_errors) +{ + int i; + int old_log_was_recovered= 0, logs_found= 0; + uint old_flags= flags; + uint32 start_file_num= 1; + TRANSLOG_ADDRESS UNINIT_VAR(sure_page), last_page, last_valid_page, + checkpoint_lsn; + my_bool version_changed= 0; + DBUG_ENTER("translog_init_with_table"); + + translog_syncs= 0; + flush_start= 0; + id_to_share= NULL; + log_purge_disabled= 0; + + log_descriptor.directory_fd= -1; + log_descriptor.is_everything_flushed= 1; + log_descriptor.flush_in_progress= 0; + log_descriptor.flush_no= 0; + log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE; + + /* Normally in Aria this this calls translog_table_init() */ + (*init_table_func)(); + compile_time_assert(sizeof(log_descriptor.dirty_buffer_mask) * 8 >= + TRANSLOG_BUFFERS_NO); + log_descriptor.dirty_buffer_mask= 0; + if (readonly) + log_descriptor.open_flags= O_BINARY | O_RDONLY; + else + log_descriptor.open_flags= O_BINARY | O_RDWR; + if (mysql_mutex_init(key_TRANSLOG_BUFFER_mutex, + &log_descriptor.sent_to_disk_lock, MY_MUTEX_INIT_FAST) || + mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_file_header_lock, + &log_descriptor.file_header_lock, MY_MUTEX_INIT_FAST) || + mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_unfinished_files_lock, + &log_descriptor.unfinished_files_lock, MY_MUTEX_INIT_FAST) || + mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_purger_lock, + &log_descriptor.purger_lock, MY_MUTEX_INIT_FAST) || + mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_log_flush_lock, + &log_descriptor.log_flush_lock, MY_MUTEX_INIT_FAST) || + mysql_mutex_init(key_TRANSLOG_DESCRIPTOR_dirty_buffer_mask_lock, + &log_descriptor.dirty_buffer_mask_lock, MY_MUTEX_INIT_FAST) || + mysql_cond_init(key_TRANSLOG_DESCRIPTOR_log_flush_cond, + &log_descriptor.log_flush_cond, 0) || + 
mysql_cond_init(key_TRANSLOG_DESCRIPTOR_new_goal_cond, + &log_descriptor.new_goal_cond, 0) || + mysql_rwlock_init(key_TRANSLOG_DESCRIPTOR_open_files_lock, + &log_descriptor.open_files_lock) || + my_init_dynamic_array(PSI_INSTRUMENT_ME, &log_descriptor.open_files, + sizeof(TRANSLOG_FILE*), 10, 10, MYF(0)) || + my_init_dynamic_array(PSI_INSTRUMENT_ME, &log_descriptor.unfinished_files, + sizeof(struct st_file_counter), + 10, 10, MYF(0))) + goto err; + log_descriptor.min_need_file= 0; + log_descriptor.min_file_number= 0; + log_descriptor.last_lsn_checked= LSN_IMPOSSIBLE; + + /* Directory to store files */ + unpack_dirname(log_descriptor.directory, directory); +#ifndef _WIN32 + if ((log_descriptor.directory_fd= my_open(log_descriptor.directory, + O_RDONLY, MYF(MY_WME))) < 0) + { + my_errno= errno; + DBUG_PRINT("error", ("Error %d during opening directory '%s'", + errno, log_descriptor.directory)); + goto err; + } +#endif + log_descriptor.in_buffers_only= LSN_IMPOSSIBLE; + DBUG_ASSERT(log_file_max_size % TRANSLOG_PAGE_SIZE == 0 && + log_file_max_size >= TRANSLOG_MIN_FILE_SIZE); + /* max size of one log size (for new logs creation) */ + log_file_size= log_descriptor.log_file_max_size= + log_file_max_size; + /* server version */ + log_descriptor.server_version= server_version; + /* server ID */ + log_descriptor.server_id= server_id; + /* Page cache for the log reads */ + log_descriptor.pagecache= pagecache; + /* Flags */ + DBUG_ASSERT((flags & + ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | + TRANSLOG_RECORD_CRC)) == 0); + log_descriptor.flags= flags; + translog_fill_overhead_table(); + log_descriptor.page_overhead= page_overhead[flags]; + log_descriptor.page_capacity_chunk_2= + TRANSLOG_PAGE_SIZE - log_descriptor.page_overhead - 1; + compile_time_assert(TRANSLOG_WRITE_BUFFER % TRANSLOG_PAGE_SIZE == 0); + log_descriptor.buffer_capacity_chunk_2= + (TRANSLOG_WRITE_BUFFER / TRANSLOG_PAGE_SIZE) * + log_descriptor.page_capacity_chunk_2; + 
log_descriptor.half_buffer_capacity_chunk_2= + log_descriptor.buffer_capacity_chunk_2 / 2; + DBUG_PRINT("info", + ("Overhead: %u pc2: %u bc2: %u, bc2/2: %u", + log_descriptor.page_overhead, + log_descriptor.page_capacity_chunk_2, + log_descriptor.buffer_capacity_chunk_2, + log_descriptor.half_buffer_capacity_chunk_2)); + + /* Just to init it somehow (hack for bootstrap)*/ + { + TRANSLOG_FILE *file= 0; + log_descriptor.min_file = log_descriptor.max_file= 1; + insert_dynamic(&log_descriptor.open_files, (uchar *)&file); + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + pop_dynamic(&log_descriptor.open_files); + } + + /* Buffers for log writing */ + for (i= 0; i < TRANSLOG_BUFFERS_NO; i++) + { + if (translog_buffer_init(log_descriptor.buffers + i, i)) + goto err; + DBUG_PRINT("info", ("translog_buffer buffer #%u:%p", + i, log_descriptor.buffers + i)); + } + + /* + last_logno and last_checkpoint_lsn were set in + ma_control_file_create_or_open() + */ + logs_found= (last_logno != FILENO_IMPOSSIBLE); + + translog_status= (readonly ? 
TRANSLOG_READONLY : TRANSLOG_OK); + checkpoint_lsn= last_checkpoint_lsn; + + if (logs_found) + { + my_bool pageok; + DBUG_PRINT("info", ("log found...")); + /* + TODO: scan directory for aria_log.XXXXXXXX files and find + highest XXXXXXXX & set logs_found + TODO: check that last checkpoint within present log addresses space + + find the log end + */ + if (LSN_FILE_NO(last_checkpoint_lsn) == FILENO_IMPOSSIBLE) + { + DBUG_ASSERT(LSN_OFFSET(last_checkpoint_lsn) == 0); + /* only last log needs to be checked */ + sure_page= MAKE_LSN(last_logno, TRANSLOG_PAGE_SIZE); + } + else + { + sure_page= last_checkpoint_lsn; + DBUG_ASSERT(LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE != 0); + sure_page-= LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE; + } + /* Set horizon to the beginning of the last file first */ + log_descriptor.horizon= last_page= MAKE_LSN(last_logno, 0); + if (translog_get_last_page_addr(&last_page, &pageok, no_errors)) + { + if (!translog_walk_filenames(log_descriptor.directory, + &translog_callback_search_first)) + { + /* + Files was deleted, just start from the next log number, so that + existing tables are in the past. + */ + start_file_num= last_logno + 1; + checkpoint_lsn= LSN_IMPOSSIBLE; /* no log so no checkpoint */ + logs_found= 0; + } + else + goto err; + } + else if (LSN_OFFSET(last_page) == 0) + { + if (LSN_FILE_NO(last_page) == 1) + { + logs_found= 0; /* file #1 has no pages */ + DBUG_PRINT("info", ("log found. 
But is is empty => no log assumed")); + } + else + { + last_page-= LSN_ONE_FILE; + if (translog_get_last_page_addr(&last_page, &pageok, 0)) + goto err; + } + } + if (logs_found) + { + uint32 i; + log_descriptor.min_file= translog_first_file(log_descriptor.horizon, 1); + log_descriptor.max_file= last_logno; + /* Open all files */ + if (allocate_dynamic(&log_descriptor.open_files, + log_descriptor.max_file - + log_descriptor.min_file + 1)) + goto err; + for (i = log_descriptor.max_file; i >= log_descriptor.min_file; i--) + { + /* + We can't allocate all file together because they will be freed + one by one + */ + TRANSLOG_FILE *file= (TRANSLOG_FILE *)my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_FILE), + MYF(0)); + + compile_time_assert(MY_FILEPOS_ERROR > 0xffffffffULL); + if (file == NULL || + (file->handler.file= + open_logfile_by_number_no_cache(i)) < 0 || + mysql_file_seek(file->handler.file, 0, SEEK_END, MYF(0)) >= + 0xffffffffULL) + { + int j; + for (j= i - log_descriptor.min_file - 1; j > 0; j--) + { + TRANSLOG_FILE *el= + *dynamic_element(&log_descriptor.open_files, j, + TRANSLOG_FILE **); + mysql_file_close(el->handler.file, MYF(MY_WME)); + my_free(el); + } + if (file) + { + free(file); + goto err; + } + else + goto err; + } + translog_file_init(file, i, 1); + /* we allocated space so it can't fail */ + insert_dynamic(&log_descriptor.open_files, (uchar *)&file); + } + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + } + } + else if (readonly) + { + /* There is no logs and there is read-only mode => nothing to read */ + DBUG_PRINT("error", ("No logs and read-only mode")); + goto err; + } + + if (logs_found) + { + TRANSLOG_ADDRESS current_page= sure_page; + my_bool pageok; + + DBUG_PRINT("info", ("The log is really present")); + if (sure_page > last_page) + { + my_printf_error(HA_ERR_GENERIC, "Aria engine: log data error\n" + "last_log_page: " LSN_FMT " is less than\n" + "checkpoint page: " LSN_FMT, 
MYF(0), + LSN_IN_PARTS(last_page), LSN_IN_PARTS(sure_page)); + goto err; + } + + /* TODO: check page size */ + + last_valid_page= LSN_IMPOSSIBLE; + /* + Scans and validate pages. We need it to show "outside" only for sure + valid part of the log. If the log was damaged then fixed we have to + cut off damaged part before some other process start write something + in the log. + */ + do + { + TRANSLOG_ADDRESS current_file_last_page; + current_file_last_page= current_page; + if (translog_get_last_page_addr(¤t_file_last_page, &pageok, 0)) + goto err; + if (!pageok) + { + DBUG_PRINT("error", ("File %lu have no complete last page", + (ulong) LSN_FILE_NO(current_file_last_page))); + old_log_was_recovered= 1; + /* This file is not written till the end so it should be last */ + last_page= current_file_last_page; + /* TODO: issue warning */ + } + do + { + TRANSLOG_VALIDATOR_DATA data; + TRANSLOG_PAGE_SIZE_BUFF psize_buff; + uchar *page; + data.addr= ¤t_page; + if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL) + goto err; + if (data.was_recovered) + { + DBUG_PRINT("error", ("file no: %lu (%d) " + "rec_offset: 0x%lx (%lu) (%d)", + (ulong) LSN_FILE_NO(current_page), + (uint3korr(page + 3) != + LSN_FILE_NO(current_page)), + (ulong) LSN_OFFSET(current_page), + (ulong) (LSN_OFFSET(current_page) / + TRANSLOG_PAGE_SIZE), + (uint3korr(page) != + LSN_OFFSET(current_page) / + TRANSLOG_PAGE_SIZE))); + old_log_was_recovered= 1; + break; + } + old_flags= page[TRANSLOG_PAGE_FLAGS]; + last_valid_page= current_page; + current_page+= TRANSLOG_PAGE_SIZE; /* increase offset */ + } while (current_page <= current_file_last_page); + current_page+= LSN_ONE_FILE; + current_page= LSN_REPLACE_OFFSET(current_page, TRANSLOG_PAGE_SIZE); + } while (LSN_FILE_NO(current_page) <= LSN_FILE_NO(last_page) && + !old_log_was_recovered); + if (last_valid_page == LSN_IMPOSSIBLE) + { + /* Panic!!! 
Even page which should be valid is invalid */ + /* TODO: issue error */ + goto err; + } + DBUG_PRINT("info", ("Last valid page is in file: %lu " + "offset: %lu (0x%lx) " + "Logs found: %d was recovered: %d " + "flags match: %d", + (ulong) LSN_FILE_NO(last_valid_page), + (ulong) LSN_OFFSET(last_valid_page), + (ulong) LSN_OFFSET(last_valid_page), + logs_found, old_log_was_recovered, + (old_flags == flags))); + + /* TODO: check server ID */ + if (logs_found && !old_log_was_recovered && old_flags == flags) + { + TRANSLOG_VALIDATOR_DATA data; + TRANSLOG_PAGE_SIZE_BUFF psize_buff; + uchar *page; + uint16 chunk_offset; + data.addr= &last_valid_page; + /* continue old log */ + DBUG_ASSERT(LSN_FILE_NO(last_valid_page)== + LSN_FILE_NO(log_descriptor.horizon)); + if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL || + (chunk_offset= translog_get_first_chunk_offset(page)) == 0) + goto err; + + /* Puts filled part of old page in the buffer */ + log_descriptor.horizon= last_valid_page; + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + /* + Free space if filled with TRANSLOG_FILLER and first uchar of + real chunk can't be TRANSLOG_FILLER + */ + while (chunk_offset < TRANSLOG_PAGE_SIZE && + page[chunk_offset] != TRANSLOG_FILLER) + { + uint16 chunk_length; + if ((chunk_length= + translog_get_total_chunk_length(page, chunk_offset)) == 0) + goto err; + DBUG_PRINT("info", ("chunk: offset: %u length: %u", + (uint) chunk_offset, (uint) chunk_length)); + chunk_offset+= chunk_length; + + /* chunk can't cross the page border */ + DBUG_ASSERT(chunk_offset <= TRANSLOG_PAGE_SIZE); + } + memcpy(log_descriptor.buffers->buffer, page, chunk_offset); + log_descriptor.bc.buffer->size+= chunk_offset; + log_descriptor.bc.ptr+= chunk_offset; + log_descriptor.bc.current_page_fill= chunk_offset; + log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, + (chunk_offset + + LSN_OFFSET(last_valid_page))); + DBUG_PRINT("info", ("Move Page #%u: %p chaser: 
%d Size: %lu (%lu)", + (uint) log_descriptor.bc.buffer_no, + log_descriptor.bc.buffer, + log_descriptor.bc.chaser, + (ulong) log_descriptor.bc.buffer->size, + (ulong) (log_descriptor.bc.ptr - log_descriptor.bc. + buffer->buffer))); + translog_check_cursor(&log_descriptor.bc); + } + if (!old_log_was_recovered && old_flags == flags) + { + LOGHANDLER_FILE_INFO info; + + /* + Accessing &log_descriptor.open_files without mutex is safe + because it is initialization + */ + if (translog_read_file_header(&info, + (*dynamic_element(&log_descriptor. + open_files, + 0, TRANSLOG_FILE **))-> + handler.file)) + goto err; + version_changed= (info.maria_version != TRANSLOG_VERSION_ID); + } + } + DBUG_PRINT("info", ("Logs found: %d was recovered: %d", + logs_found, old_log_was_recovered)); + if (!logs_found) + { + TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(PSI_INSTRUMENT_ME, + sizeof(TRANSLOG_FILE), MYF(MY_WME)); + DBUG_PRINT("info", ("The log is not found => we will create new log")); + if (file == NULL) + goto err; + /* Start new log system from scratch */ + log_descriptor.horizon= MAKE_LSN(start_file_num, + TRANSLOG_PAGE_SIZE); /* header page */ + translog_file_init(file, start_file_num, 0); + if (insert_dynamic(&log_descriptor.open_files, (uchar*)&file)) + { + my_free(file); + goto err; + } + if ((file->handler.file= + create_logfile_by_number_no_cache(start_file_num)) == -1) + goto err; + log_descriptor.min_file= log_descriptor.max_file= start_file_num; + if (translog_write_file_header()) + goto err; + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + + if (ma_control_file_write_and_force(checkpoint_lsn, start_file_num, + max_trid_in_control_file, + recovery_failures)) + goto err; + /* assign buffer 0 */ + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc); + } + else if ((old_log_was_recovered || old_flags != flags || 
version_changed) && + !readonly) + { + /* leave the damaged file untouched */ + log_descriptor.horizon+= LSN_ONE_FILE; + /* header page */ + log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, + TRANSLOG_PAGE_SIZE); + if (translog_create_new_file()) + goto err; + /* + Buffer system left untouched after recovery => we should init it + (starting from buffer 0) + */ + translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0); + translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc); + } + + /* all LSNs that are on disk are flushed */ + log_descriptor.log_start= log_descriptor.sent_to_disk= + log_descriptor.flushed= log_descriptor.horizon; + log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset; + log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */ + /* + Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially) + address of the next LSN and we want indicate that all LSNs that are + already on the disk are flushed so we need decrease horizon on 1 (we are + sure that there is no LSN on the disk which is greater then 'flushed' + and there will not be LSN created that is equal or less then the value + of the 'flushed'). 
+ */ + log_descriptor.flushed--; /* offset decreased */ + log_descriptor.sent_to_disk--; /* offset decreased */ + /* + Log records will refer to a MARIA_SHARE by a unique 2-byte id; set up + structures for generating 2-byte ids: + */ + id_to_share= (MARIA_SHARE **) my_malloc(PSI_INSTRUMENT_ME, SHARE_ID_MAX * sizeof(MARIA_SHARE*), + MYF(MY_WME | MY_ZEROFILL)); + if (unlikely(!id_to_share)) + goto err; + id_to_share--; /* min id is 1 */ + + /* Check the last LSN record integrity */ + if (logs_found) + { + TRANSLOG_SCANNER_DATA scanner; + TRANSLOG_ADDRESS page_addr; + LSN last_lsn= LSN_IMPOSSIBLE; + /* + take very last page address and try to find LSN record on it + if it fail take address of previous page and so on + */ + page_addr= (log_descriptor.horizon - + ((log_descriptor.horizon - 1) % TRANSLOG_PAGE_SIZE + 1)); + if (translog_scanner_init(page_addr, 1, &scanner, 1)) + goto err; + scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]]; + for (;;) + { + uint chunk_1byte; + chunk_1byte= scanner.page[scanner.page_offset]; + while (!translog_is_LSN_chunk(chunk_1byte) && + scanner.page != END_OF_LOG && + scanner.page[scanner.page_offset] != TRANSLOG_FILLER && + scanner.page_addr == page_addr) + { + if (translog_get_next_chunk(&scanner)) + { + translog_destroy_scanner(&scanner); + goto err; + } + if (scanner.page != END_OF_LOG) + chunk_1byte= scanner.page[scanner.page_offset]; + } + if (translog_is_LSN_chunk(chunk_1byte)) + { + last_lsn= scanner.page_addr + scanner.page_offset; + if (translog_get_next_chunk(&scanner)) + { + translog_destroy_scanner(&scanner); + goto err; + } + if (scanner.page == END_OF_LOG) + break; /* it was the last record */ + chunk_1byte= scanner.page[scanner.page_offset]; + continue; /* try to find other record on this page */ + } + + if (last_lsn != LSN_IMPOSSIBLE) + break; /* there is no more records on the page */ + + /* We have to make step back */ + if (unlikely(LSN_OFFSET(page_addr) == TRANSLOG_PAGE_SIZE)) + { + uint32 
file_no= LSN_FILE_NO(page_addr); + my_bool last_page_ok; + /* it is beginning of the current file */ + if (unlikely(file_no == 1)) + { + /* + It is beginning of the log => there is no LSNs in the log => + There is no harm in leaving it "as-is". + */ + log_descriptor.previous_flush_horizon= log_descriptor.horizon; + DBUG_PRINT("info", ("previous_flush_horizon: " LSN_FMT, + LSN_IN_PARTS(log_descriptor. + previous_flush_horizon))); + DBUG_RETURN(0); + } + file_no--; + page_addr= MAKE_LSN(file_no, TRANSLOG_PAGE_SIZE); + translog_get_last_page_addr(&page_addr, &last_page_ok, 0); + /* page should be OK as it is not the last file */ + DBUG_ASSERT(last_page_ok); + } + else + { + page_addr-= TRANSLOG_PAGE_SIZE; + } + translog_destroy_scanner(&scanner); + if (translog_scanner_init(page_addr, 1, &scanner, 1)) + goto err; + scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]]; + } + translog_destroy_scanner(&scanner); + + /* Now scanner points to the last LSN chunk, lets check it */ + { + TRANSLOG_HEADER_BUFFER rec; + translog_size_t rec_len; + int len; + uchar buffer[1]; + DBUG_PRINT("info", ("going to check the last found record " LSN_FMT, + LSN_IN_PARTS(last_lsn))); + + len= + translog_read_record_header(last_lsn, &rec); + if (unlikely (len == RECHEADER_READ_ERROR || + len == RECHEADER_READ_EOF)) + { + DBUG_PRINT("error", ("unexpected end of log or record during " + "reading record header: " LSN_FMT " len: %d", + LSN_IN_PARTS(last_lsn), len)); + if (readonly) + log_descriptor.log_start= log_descriptor.horizon= last_lsn; + else if (translog_truncate_log(last_lsn)) + { + translog_free_record_header(&rec); + goto err; + } + } + else + { + DBUG_ASSERT(last_lsn == rec.lsn); + if (likely(rec.record_length != 0)) + { + /* + Reading the last byte of record will trigger scanning all + record chunks for now + */ + rec_len= translog_read_record(rec.lsn, rec.record_length - 1, 1, + buffer, NULL); + if (rec_len != 1) + { + DBUG_PRINT("error", ("unexpected end of log or 
record during " + "reading record body: " LSN_FMT " len: %d", + LSN_IN_PARTS(rec.lsn), + len)); + if (readonly) + log_descriptor.log_start= log_descriptor.horizon= last_lsn; + + else if (translog_truncate_log(last_lsn)) + { + translog_free_record_header(&rec); + goto err; + } + } + } + } + translog_free_record_header(&rec); + } + } + log_descriptor.previous_flush_horizon= log_descriptor.horizon; + DBUG_PRINT("info", ("previous_flush_horizon: " LSN_FMT, + LSN_IN_PARTS(log_descriptor.previous_flush_horizon))); + DBUG_RETURN(0); +err: + ma_message_no_user(0, "log initialization failed"); + DBUG_RETURN(1); +} + + +/* + @brief Free transaction log file buffer. + + @param buffer_no The buffer to free +*/ + +static void translog_buffer_destroy(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_destroy"); + DBUG_PRINT("enter", + ("Buffer #%u: %p file: %d offset: " LSN_FMT " size: %lu", + (uint) buffer->buffer_no, buffer, + (buffer->file ? buffer->file->handler.file : -1), + LSN_IN_PARTS(buffer->offset), + (ulong) buffer->size)); + if (buffer->file != NULL) + { + /* + We ignore errors here, because we can't do something about it + (it is shutting down) + + We also have to take the locks even if there can't be any other + threads running, because translog_buffer_flush() + requires that we have the buffer locked. 
+ */ + translog_buffer_lock(buffer); + translog_buffer_flush(buffer); + translog_buffer_unlock(buffer); + } + DBUG_PRINT("info", ("Destroy mutex: %p", &buffer->mutex)); + mysql_mutex_destroy(&buffer->mutex); + mysql_cond_destroy(&buffer->waiting_filling_buffer); + DBUG_VOID_RETURN; +} + + +/* + Free log handler resources + + SYNOPSIS + translog_destroy() +*/ + +void translog_destroy() +{ + TRANSLOG_FILE **file; + uint i; + uint8 current_buffer; + DBUG_ENTER("translog_destroy"); + + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + translog_lock(); + current_buffer= log_descriptor.bc.buffer_no; + translog_status= (translog_status == TRANSLOG_READONLY ? + TRANSLOG_UNINITED : + TRANSLOG_SHUTDOWN); + if (log_descriptor.bc.buffer->file != NULL) + translog_finish_page(&log_descriptor.horizon, &log_descriptor.bc); + translog_unlock(); + + for (i= 0; i < TRANSLOG_BUFFERS_NO; i++) + { + struct st_translog_buffer *buffer= (log_descriptor.buffers + + ((i + current_buffer + 1) % + TRANSLOG_BUFFERS_NO)); + translog_buffer_destroy(buffer); + } + translog_status= TRANSLOG_UNINITED; + + /* close files */ + while ((file= (TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files))) + translog_close_log_file(*file); + mysql_mutex_destroy(&log_descriptor.sent_to_disk_lock); + mysql_mutex_destroy(&log_descriptor.file_header_lock); + mysql_mutex_destroy(&log_descriptor.unfinished_files_lock); + mysql_mutex_destroy(&log_descriptor.purger_lock); + mysql_mutex_destroy(&log_descriptor.log_flush_lock); + mysql_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock); + mysql_cond_destroy(&log_descriptor.log_flush_cond); + mysql_cond_destroy(&log_descriptor.new_goal_cond); + mysql_rwlock_destroy(&log_descriptor.open_files_lock); + delete_dynamic(&log_descriptor.open_files); + delete_dynamic(&log_descriptor.unfinished_files); + + if (log_descriptor.directory_fd >= 0) + mysql_file_close(log_descriptor.directory_fd, MYF(MY_WME)); + if (id_to_share != NULL) + 
my_free(id_to_share + 1); + DBUG_VOID_RETURN; +} + + +/* + @brief Starts new page. + + @param horizon \ Position in file and buffer where we are + @param cursor / + @param prev_buffer Buffer which should be flushed will be assigned here. + This is always set (to NULL if nothing to flush). + + @note We do not want to flush the buffer immediately because we want to + let caller of this function first advance 'horizon' pointer and unlock the + loghandler and only then flush the log which can take some time. + + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_page_next(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor, + struct st_translog_buffer **prev_buffer) +{ + struct st_translog_buffer *buffer= cursor->buffer; + DBUG_ENTER("translog_page_next"); + + *prev_buffer= NULL; + if ((cursor->ptr + TRANSLOG_PAGE_SIZE > + cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER) || + (LSN_OFFSET(*horizon) > + log_descriptor.log_file_max_size - TRANSLOG_PAGE_SIZE)) + { + DBUG_PRINT("info", ("Switch to next buffer Buffer Size: %lu (%lu) => %d " + "File size: %lu max: %lu => %d", + (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer), + (cursor->ptr + TRANSLOG_PAGE_SIZE > + cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER), + (ulong) LSN_OFFSET(*horizon), + (ulong) log_descriptor.log_file_max_size, + (LSN_OFFSET(*horizon) > + (log_descriptor.log_file_max_size - + TRANSLOG_PAGE_SIZE)))); + if (translog_buffer_next(horizon, cursor, + LSN_OFFSET(*horizon) > + (log_descriptor.log_file_max_size - + TRANSLOG_PAGE_SIZE))) + DBUG_RETURN(1); + *prev_buffer= buffer; + DBUG_PRINT("info", ("Buffer #%u (%p): have to be flushed", + (uint) buffer->buffer_no, buffer)); + } + else + { + DBUG_PRINT("info", ("Use the same buffer #%u (%p): " + "Buffer Size: %lu (%lu)", + (uint) buffer->buffer_no, + buffer, + (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + translog_finish_page(horizon, cursor); + 
translog_new_page_header(horizon, cursor); + } + DBUG_RETURN(0); +} + + +/* + Write data of given length to the current page + + SYNOPSIS + translog_write_data_on_page() + horizon \ Pointers on file and buffer + cursor / + length IN length of the chunk + buffer buffer with data + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_write_data_on_page(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor, + translog_size_t length, + uchar *buffer) +{ + DBUG_ENTER("translog_write_data_on_page"); + DBUG_PRINT("enter", ("Chunk length: %lu Page size %u", + (ulong) length, (uint) cursor->current_page_fill)); + DBUG_ASSERT(length > 0); + DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); + DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer + + TRANSLOG_WRITE_BUFFER); + + memcpy(cursor->ptr, buffer, length); + cursor->ptr+= length; + (*horizon)+= length; /* adds offset */ + cursor->current_page_fill+= length; + if (!cursor->chaser) + cursor->buffer->size+= length; + DBUG_PRINT("info", ("Write data buffer #%u: %p " + "chaser: %d Size: %lu (%lu)", + (uint) cursor->buffer->buffer_no, cursor->buffer, + cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + translog_check_cursor(cursor); + + DBUG_RETURN(0); +} + + +/* + Write data from parts of given length to the current page + + SYNOPSIS + translog_write_parts_on_page() + horizon \ Pointers on file and buffer + cursor / + length IN length of the chunk + parts IN/OUT chunk source + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_write_parts_on_page(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor, + translog_size_t length, + struct st_translog_parts *parts) +{ + translog_size_t left= length; + uint cur= (uint) parts->current; + DBUG_ENTER("translog_write_parts_on_page"); + DBUG_PRINT("enter", ("Chunk length: %lu parts: %u of %u. 
Page size: %u " + "Buffer size: %lu (%lu)", + (ulong) length, + (uint) (cur + 1), (uint) parts->elements, + (uint) cursor->current_page_fill, + (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer))); + DBUG_ASSERT(length > 0); + DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE); + DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer + + TRANSLOG_WRITE_BUFFER); + + do + { + translog_size_t len; + LEX_CUSTRING *part; + const uchar *buff; + + DBUG_ASSERT(cur < parts->elements); + part= parts->parts + cur; + buff= part->str; + DBUG_PRINT("info", ("Part: %u Length: %lu left: %lu buff: %p", + (uint) (cur + 1), (ulong) part->length, (ulong) left, + buff)); + + if (part->length > left) + { + /* we should write less then the current part */ + len= left; + part->length-= len; + part->str+= len; + DBUG_PRINT("info", ("Set new part: %u Length: %lu", + (uint) (cur + 1), (ulong) part->length)); + } + else + { + len= (translog_size_t) part->length; + cur++; + DBUG_PRINT("info", ("moved to next part (len: %lu)", (ulong) len)); + } + DBUG_PRINT("info", ("copy: %p <- %p %u", + cursor->ptr, buff, len)); + if (likely(len)) + { + memcpy(cursor->ptr, buff, len); + left-= len; + cursor->ptr+= len; + } + } while (left); + + DBUG_PRINT("info", ("Horizon: " LSN_FMT " Length %u(0x%x)", + LSN_IN_PARTS(*horizon), + length, length)); + parts->current= cur; + (*horizon)+= length; /* offset increasing */ + cursor->current_page_fill+= length; + if (!cursor->chaser) + cursor->buffer->size+= length; + /* + We do not not updating parts->total_record_length here because it is + need only before writing record to have total length + */ + DBUG_PRINT("info", ("Write parts buffer #%u: %p " + "chaser: %d Size: %lu (%lu) " + "Horizon: " LSN_FMT " buff offset: 0x%x", + (uint) cursor->buffer->buffer_no, cursor->buffer, + cursor->chaser, (ulong) cursor->buffer->size, + (ulong) (cursor->ptr - cursor->buffer->buffer), + LSN_IN_PARTS(*horizon), + (uint) 
(LSN_OFFSET(cursor->buffer->offset) + + cursor->buffer->size))); + translog_check_cursor(cursor); + + DBUG_RETURN(0); +} + + +/* + Put 1 group chunk type 0 header into parts array + + SYNOPSIS + translog_write_variable_record_1group_header() + parts Descriptor of record source parts + type The log record type + short_trid Short transaction ID or 0 if it has no sense + header_length Calculated header length of chunk type 0 + chunk0_header Buffer for the chunk header writing +*/ + +static void +translog_write_variable_record_1group_header(struct st_translog_parts *parts, + enum translog_record_type type, + SHORT_TRANSACTION_ID short_trid, + uint16 header_length, + uchar *chunk0_header) +{ + LEX_CUSTRING *part; + DBUG_ASSERT(parts->current != 0); /* first part is left for header */ + part= parts->parts + (--parts->current); + parts->total_record_length+= (translog_size_t) (part->length= header_length); + part->str= chunk0_header; + /* puts chunk type */ + *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN); + int2store(chunk0_header + 1, short_trid); + /* puts record length */ + translog_write_variable_record_1group_code_len(chunk0_header + 3, + parts->record_length, + header_length); + /* puts 0 as chunk length which indicate 1 group record */ + int2store(chunk0_header + header_length - 2, 0); +} + + +/* + Increase number of writers for this buffer + + SYNOPSIS + translog_buffer_increase_writers() + buffer target buffer +*/ + +static inline void +translog_buffer_increase_writers(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_increase_writers"); + translog_buffer_lock_assert_owner(buffer); + buffer->copy_to_buffer_in_progress++; + DBUG_PRINT("info", ("copy_to_buffer_in_progress. 
Buffer #%u %p progress: %d", + (uint) buffer->buffer_no, buffer, + buffer->copy_to_buffer_in_progress)); + DBUG_VOID_RETURN; +} + + +/* + Decrease number of writers for this buffer + + SYNOPSIS + translog_buffer_decrease_writers() + buffer target buffer +*/ + +static void translog_buffer_decrease_writers(struct st_translog_buffer *buffer) +{ + DBUG_ENTER("translog_buffer_decrease_writers"); + translog_buffer_lock_assert_owner(buffer); + buffer->copy_to_buffer_in_progress--; + DBUG_PRINT("info", + ("copy_to_buffer_in_progress. Buffer #%u %p progress: %d", + (uint) buffer->buffer_no, buffer, + buffer->copy_to_buffer_in_progress)); + if (buffer->copy_to_buffer_in_progress == 0) + mysql_cond_broadcast(&buffer->waiting_filling_buffer); + DBUG_VOID_RETURN; +} + + +/** + @brief Skip to the next page for chaser (thread which advanced horizon + pointer and now feeling the buffer) + + @param horizon \ Pointers on file position and buffer + @param cursor / + + @retval 1 OK + @retval 0 Error +*/ + +static my_bool translog_chaser_page_next(TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + struct st_translog_buffer *buffer_to_flush; + my_bool rc; + DBUG_ENTER("translog_chaser_page_next"); + DBUG_ASSERT(cursor->chaser); + rc= translog_page_next(horizon, cursor, &buffer_to_flush); + if (buffer_to_flush != NULL) + { + translog_buffer_lock(buffer_to_flush); + translog_buffer_decrease_writers(buffer_to_flush); + used_buffs_register_unlock(&cursor->buffs, buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + DBUG_RETURN(rc); +} + +/* + Put chunk 2 from new page beginning + + SYNOPSIS + translog_write_variable_record_chunk2_page() + parts Descriptor of record source parts + horizon \ Pointers on file position and buffer + cursor / + + RETURN + 0 OK + 1 Error +*/ + +static my_bool +translog_write_variable_record_chunk2_page(struct st_translog_parts *parts, + TRANSLOG_ADDRESS *horizon, + struct 
st_buffer_cursor *cursor) +{ + uchar chunk2_header[1]; + DBUG_ENTER("translog_write_variable_record_chunk2_page"); + chunk2_header[0]= TRANSLOG_CHUNK_NOHDR; + + if (translog_chaser_page_next(horizon, cursor)) + DBUG_RETURN(1); + + /* Puts chunk type */ + translog_write_data_on_page(horizon, cursor, 1, chunk2_header); + /* Puts chunk body */ + translog_write_parts_on_page(horizon, cursor, + log_descriptor.page_capacity_chunk_2, parts); + DBUG_RETURN(0); +} + + +/* + Put chunk 3 of requested length in the buffer from new page beginning + + SYNOPSIS + translog_write_variable_record_chunk3_page() + parts Descriptor of record source parts + length Length of this chunk + horizon \ Pointers on file position and buffer + cursor / + + RETURN + 0 OK + 1 Error +*/ + +static my_bool +translog_write_variable_record_chunk3_page(struct st_translog_parts *parts, + uint16 length, + TRANSLOG_ADDRESS *horizon, + struct st_buffer_cursor *cursor) +{ + LEX_CUSTRING *part; + uchar chunk3_header[1 + 2]; + DBUG_ENTER("translog_write_variable_record_chunk3_page"); + + if (translog_chaser_page_next(horizon, cursor)) + DBUG_RETURN(1); + + if (length == 0) + { + /* It was call to write page header only (no data for chunk 3) */ + DBUG_PRINT("info", ("It is a call to make page header only")); + DBUG_RETURN(0); + } + + DBUG_ASSERT(parts->current != 0); /* first part is left for header */ + part= parts->parts + (--parts->current); + parts->total_record_length+= (translog_size_t) (part->length= 1 + 2); + part->str= chunk3_header; + /* Puts chunk type */ + *chunk3_header= (uchar) (TRANSLOG_CHUNK_LNGTH); + /* Puts chunk length */ + int2store(chunk3_header + 1, length); + + translog_write_parts_on_page(horizon, cursor, length + 1 + 2, parts); + DBUG_RETURN(0); +} + +/* + Move log pointer (horizon) on given number pages starting from next page, + and given offset on the last page + + SYNOPSIS + translog_advance_pointer() + pages Number of full pages starting from the next one + last_page_data Plus this 
data on the last page + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_advance_pointer(int pages, uint16 last_page_data, + TRUNSLOG_USED_BUFFERS *buffs) +{ + translog_size_t last_page_offset= (log_descriptor.page_overhead + + last_page_data); + translog_size_t offset= (TRANSLOG_PAGE_SIZE - + log_descriptor.bc.current_page_fill + + pages * TRANSLOG_PAGE_SIZE + last_page_offset); + translog_size_t buffer_end_offset, file_end_offset, min_offset; + DBUG_ENTER("translog_advance_pointer"); + DBUG_PRINT("enter", ("Pointer: " LSN_FMT " + %u + %u pages + %u + %u", + LSN_IN_PARTS(log_descriptor.horizon), + (uint) (TRANSLOG_PAGE_SIZE - + log_descriptor.bc.current_page_fill), + pages, (uint) log_descriptor.page_overhead, + (uint) last_page_data)); + translog_lock_assert_owner(); + + used_buffs_init(buffs); + + if (pages == -1) + { + /* + It is special case when we advance the pointer on the same page. + It can happened when we write last part of multi-group record. + */ + DBUG_ASSERT(last_page_data + log_descriptor.bc.current_page_fill <= + TRANSLOG_PAGE_SIZE); + offset= last_page_data; + last_page_offset= log_descriptor.bc.current_page_fill + last_page_data; + goto end; + } + DBUG_PRINT("info", ("last_page_offset %lu", (ulong) last_page_offset)); + DBUG_ASSERT(last_page_offset <= TRANSLOG_PAGE_SIZE); + + /* + The loop will be executed 1-3 times. Usually we advance the + pointer to fill only the current buffer (if we have more then 1/2 of + buffer free or 2 buffers (rest of current and all next). In case of + really huge record end where we write last group with "table of + content" of all groups and ignore buffer borders we can occupy + 3 buffers. 
+ */ + for (;;) + { + uint8 new_buffer_no; + struct st_translog_buffer *new_buffer; + struct st_translog_buffer *old_buffer; + buffer_end_offset= TRANSLOG_WRITE_BUFFER - log_descriptor.bc.buffer->size; + if (likely(log_descriptor.log_file_max_size >= + LSN_OFFSET(log_descriptor.horizon))) + file_end_offset= (log_descriptor.log_file_max_size - + LSN_OFFSET(log_descriptor.horizon)); + else + { + /* + We already have written more then current file limit allow, + So we will finish this page and start new file + */ + file_end_offset= (TRANSLOG_PAGE_SIZE - + log_descriptor.bc.current_page_fill); + } + DBUG_PRINT("info", ("offset: %u buffer_end_offs: %u, " + "file_end_offs: %u", + offset, buffer_end_offset, + file_end_offset)); + DBUG_PRINT("info", ("Buff #%u %u (%p) offset 0x%x + size 0x%x = " + "0x%x (0x%x)", + log_descriptor.bc.buffer->buffer_no, + log_descriptor.bc.buffer_no, + log_descriptor.bc.buffer, + (uint) LSN_OFFSET(log_descriptor.bc.buffer->offset), + log_descriptor.bc.buffer->size, + (uint) (LSN_OFFSET(log_descriptor.bc.buffer->offset) + + log_descriptor.bc.buffer->size), + (uint) LSN_OFFSET(log_descriptor.horizon))); + DBUG_ASSERT(LSN_OFFSET(log_descriptor.bc.buffer->offset) + + log_descriptor.bc.buffer->size == + LSN_OFFSET(log_descriptor.horizon)); + + if (offset <= buffer_end_offset && offset <= file_end_offset) + break; + old_buffer= log_descriptor.bc.buffer; + new_buffer_no= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO; + new_buffer= log_descriptor.buffers + new_buffer_no; + + translog_buffer_lock(new_buffer); +#ifndef DBUG_OFF + { + TRANSLOG_ADDRESS offset= new_buffer->offset; + TRANSLOG_FILE *file= new_buffer->file; + uint8 ver= new_buffer->ver; + translog_lock_assert_owner(); +#endif + translog_wait_for_buffer_free(new_buffer); +#ifndef DBUG_OFF + /* We keep the handler locked so nobody can start this new buffer */ + DBUG_ASSERT((offset == new_buffer->offset && new_buffer->file == NULL && + (file == NULL ? 
ver : (uint8)(ver + 1)) == + new_buffer->ver) || + translog_status == TRANSLOG_READONLY); + } +#endif + + min_offset= MY_MIN(buffer_end_offset, file_end_offset); + /* TODO: check is it ptr or size enough */ + log_descriptor.bc.buffer->size+= min_offset; + log_descriptor.bc.ptr+= min_offset; + DBUG_PRINT("info", ("NewP buffer #%u: %p chaser: %d Size: %lu (%lu)", + (uint) log_descriptor.bc.buffer->buffer_no, + log_descriptor.bc.buffer, + log_descriptor.bc.chaser, + (ulong) log_descriptor.bc.buffer->size, + (ulong) (log_descriptor.bc.ptr -log_descriptor.bc. + buffer->buffer))); + DBUG_ASSERT((ulong) (log_descriptor.bc.ptr - + log_descriptor.bc.buffer->buffer) == + log_descriptor.bc.buffer->size); + DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no == + log_descriptor.bc.buffer_no); + translog_buffer_increase_writers(log_descriptor.bc.buffer); + // register for case of error + used_buffs_add(buffs, log_descriptor.bc.buffer); + + if (file_end_offset <= buffer_end_offset) + { + log_descriptor.horizon+= LSN_ONE_FILE; + log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon, + TRANSLOG_PAGE_SIZE); + DBUG_PRINT("info", ("New file: %lu", + (ulong) LSN_FILE_NO(log_descriptor.horizon))); + if (translog_create_new_file()) + { + struct st_translog_buffer *ob= log_descriptor.bc.buffer; + translog_buffer_unlock(ob); + used_buffs_urgent_unlock(buffs); + translog_buffer_lock(ob); + DBUG_RETURN(1); + } + } + else + { + DBUG_PRINT("info", ("The same file")); + log_descriptor.horizon+= min_offset; /* offset increasing */ + } + translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no); + old_buffer->next_buffer_offset= new_buffer->offset; + new_buffer->prev_buffer_offset= old_buffer->offset; + translog_buffer_unlock(old_buffer); + offset-= min_offset; + } + DBUG_PRINT("info", ("drop write_counter")); + log_descriptor.bc.write_counter= 0; + log_descriptor.bc.previous_offset= 0; +end: + log_descriptor.bc.ptr+= offset; + log_descriptor.bc.buffer->size+= offset; + 
translog_buffer_increase_writers(log_descriptor.bc.buffer); + used_buffs_add(buffs, log_descriptor.bc.buffer); + log_descriptor.horizon+= offset; /* offset increasing */ + log_descriptor.bc.current_page_fill= last_page_offset; + DBUG_PRINT("info", ("NewP buffer #%u: %p chaser: %d Size: %lu (%lu) " + "offset: %u last page: %u", + (uint) log_descriptor.bc.buffer->buffer_no, + log_descriptor.bc.buffer, + log_descriptor.bc.chaser, + (ulong) log_descriptor.bc.buffer->size, + (ulong) (log_descriptor.bc.ptr - + log_descriptor.bc.buffer-> + buffer), (uint) offset, + (uint) last_page_offset)); + DBUG_PRINT("info", + ("pointer moved to: " LSN_FMT, + LSN_IN_PARTS(log_descriptor.horizon))); + translog_check_cursor(&log_descriptor.bc); + log_descriptor.bc.protected= 0; + DBUG_RETURN(0); +} + +static void +used_buffs_add(TRUNSLOG_USED_BUFFERS *buffs, + struct st_translog_buffer *buff) +{ + DBUG_ENTER("used_buffs_add"); + DBUG_PRINT("enter", ("ADD buffs: %p unlk %u (%p) wrt_ptr: %u (%p)" + " buff %p (%u)", + buffs, + buffs->wrt_ptr, buffs->buff[buffs->wrt_ptr], + buffs->unlck_ptr, buffs->buff[buffs->unlck_ptr], + buff, buff->buffer_no)); + DBUG_ASSERT(buffs->wrt_ptr < MAX_TRUNSLOG_USED_BUFFERS); + buffs->buff[buffs->wrt_ptr++]= buff; + DBUG_VOID_RETURN; +} + +static void +used_buffs_register_unlock(TRUNSLOG_USED_BUFFERS *buffs, + struct st_translog_buffer *buff + __attribute__((unused)) ) +{ + DBUG_ENTER("used_buffs_register_unlock"); + DBUG_PRINT("enter", ("SUB buffs: %p unlk %u (%p) wrt_ptr: %u (%p)" + " buff %p (%u)", + buffs, + buffs->wrt_ptr, buffs->buff[buffs->wrt_ptr], + buffs->unlck_ptr, buffs->buff[buffs->unlck_ptr], + buff, buff->buffer_no)); + DBUG_ASSERT(buffs->buff[buffs->unlck_ptr] == buff); + buffs->unlck_ptr++; + DBUG_VOID_RETURN; +} +static void used_buffs_urgent_unlock(TRUNSLOG_USED_BUFFERS *buffs) +{ + uint i; + DBUG_ENTER("used_buffs_urgent_unlock"); + translog_lock(); + translog_stop_writing(); + translog_unlock(); + for (i= buffs->unlck_ptr; i < 
buffs->wrt_ptr; i++) + { + struct st_translog_buffer *buf= buffs->buff[i]; + translog_buffer_lock(buf); + translog_buffer_decrease_writers(buf); + translog_buffer_unlock(buf); + buffs->buff[i]= NULL; + } + used_buffs_init(buffs); + DBUG_VOID_RETURN; +} + +/* + Get page rest + + SYNOPSIS + translog_get_current_page_rest() + + NOTE loghandler should be locked + + RETURN + number of bytes left on the current page +*/ + +static uint translog_get_current_page_rest() +{ + return (TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill); +} + + +/* + Get buffer rest in full pages + + SYNOPSIS + translog_get_current_buffer_rest() + + NOTE loghandler should be locked + + RETURN + number of full pages left on the current buffer +*/ + +static uint translog_get_current_buffer_rest() +{ + return (uint)((log_descriptor.bc.buffer->buffer + TRANSLOG_WRITE_BUFFER - + log_descriptor.bc.ptr) / + TRANSLOG_PAGE_SIZE); +} + +/* + Calculate possible group size without first (current) page + + SYNOPSIS + translog_get_current_group_size() + + NOTE loghandler should be locked + + RETURN + group size without first (current) page +*/ + +static translog_size_t translog_get_current_group_size() +{ + /* buffer rest in full pages */ + translog_size_t buffer_rest= translog_get_current_buffer_rest(); + DBUG_ENTER("translog_get_current_group_size"); + DBUG_PRINT("info", ("buffer_rest in pages: %u", buffer_rest)); + + buffer_rest*= log_descriptor.page_capacity_chunk_2; + /* in case of only half of buffer free we can write this and next buffer */ + if (buffer_rest < log_descriptor.half_buffer_capacity_chunk_2) + { + DBUG_PRINT("info", ("buffer_rest: %lu -> add %lu", + (ulong) buffer_rest, + (ulong) log_descriptor.buffer_capacity_chunk_2)); + buffer_rest+= log_descriptor.buffer_capacity_chunk_2; + } + + DBUG_PRINT("info", ("buffer_rest: %lu", (ulong) buffer_rest)); + + DBUG_RETURN(buffer_rest); +} + + +static inline void set_lsn(LSN *lsn, LSN value) +{ + DBUG_ENTER("set_lsn"); + 
translog_lock_assert_owner(); + *lsn= value; + /* we generate LSN so something is not flushed in log */ + log_descriptor.is_everything_flushed= 0; + DBUG_PRINT("info", ("new LSN appeared: " LSN_FMT, LSN_IN_PARTS(value))); + DBUG_VOID_RETURN; +} + + +/** + @brief Write variable record in 1 group. + + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param buffer_to_flush Buffer which have to be flushed if it is not 0 + @param header_length Calculated header length of chunk type 0 + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. + + @note + We must have a translog_lock() when entering this function + We must have buffer_to_flush locked (if not null) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool +translog_write_variable_record_1group(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + struct st_translog_buffer + *buffer_to_flush, uint16 header_length, + TRN *trn, void *hook_arg) +{ + TRANSLOG_ADDRESS horizon; + struct st_buffer_cursor cursor; + int rc= 0; + uint i; + translog_size_t record_rest, full_pages, first_page; + uint additional_chunk3_page= 0; + uchar chunk0_header[1 + 2 + 5 + 2]; + DBUG_ENTER("translog_write_variable_record_1group"); + translog_lock_assert_owner(); + if (buffer_to_flush) + translog_buffer_lock_assert_owner(buffer_to_flush); + + set_lsn(lsn, horizon= log_descriptor.horizon); + if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn), + *lsn, TRUE) || + (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info, + lsn, hook_arg))) + { + translog_unlock(); + if 
(buffer_to_flush != NULL) + { + translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + DBUG_RETURN(1); + } + cursor= log_descriptor.bc; + cursor.chaser= 1; + + /* Advance pointer to be able unlock the loghandler */ + first_page= translog_get_current_page_rest(); + record_rest= parts->record_length - (first_page - header_length); + full_pages= record_rest / log_descriptor.page_capacity_chunk_2; + record_rest= (record_rest % log_descriptor.page_capacity_chunk_2); + + if (record_rest + 1 == log_descriptor.page_capacity_chunk_2) + { + DBUG_PRINT("info", ("2 chunks type 3 is needed")); + /* We will write 2 chunks type 3 at the end of this group */ + additional_chunk3_page= 1; + record_rest= 1; + } + + DBUG_PRINT("info", ("first_page: %u (%u) full_pages: %u (%lu) " + "additional: %u (%u) rest %u = %u", + first_page, first_page - header_length, + full_pages, + (ulong) full_pages * + log_descriptor.page_capacity_chunk_2, + additional_chunk3_page, + additional_chunk3_page * + (log_descriptor.page_capacity_chunk_2 - 1), + record_rest, parts->record_length)); + /* record_rest + 3 is chunk type 3 overhead + record_rest */ + rc= translog_advance_pointer((int)(full_pages + additional_chunk3_page), + (record_rest ? 
record_rest + 3 : 0), + &cursor.buffs); + log_descriptor.bc.buffer->last_lsn= *lsn; + DBUG_PRINT("info", ("last_lsn set to " LSN_FMT " buffer: %p", + LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn), + log_descriptor.bc.buffer)); + + translog_unlock(); + + /* + Check if we switched buffer and need process it (current buffer is + unlocked already => we will not delay other threads + */ + if (buffer_to_flush != NULL) + { + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + if (rc) + { + //translog_advance_pointer decreased writers so it is OK + DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); + DBUG_RETURN(1); + } + + translog_write_variable_record_1group_header(parts, type, short_trid, + header_length, chunk0_header); + + /* fill the pages */ + translog_write_parts_on_page(&horizon, &cursor, first_page, parts); + + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon))); + + for (i= 0; i < full_pages; i++) + { + if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) + goto error; + + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon))); + } + + if (additional_chunk3_page) + { + if (translog_write_variable_record_chunk3_page(parts, + log_descriptor. 
+ page_capacity_chunk_2 - 2, + &horizon, &cursor)) + goto error; + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon))); + DBUG_ASSERT(cursor.current_page_fill == TRANSLOG_PAGE_SIZE); + } + + if (translog_write_variable_record_chunk3_page(parts, + record_rest, + &horizon, &cursor)) + goto error; + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, + (uint) LSN_FILE_NO(log_descriptor.horizon), + (uint) LSN_OFFSET(log_descriptor.horizon), + (uint) LSN_FILE_NO(horizon), + (uint) LSN_OFFSET(horizon))); + + translog_buffer_lock(cursor.buffer); + translog_buffer_decrease_writers(cursor.buffer); + used_buffs_register_unlock(&cursor.buffs, cursor.buffer); + translog_buffer_unlock(cursor.buffer); + DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); + DBUG_RETURN(0); +error: + used_buffs_urgent_unlock(&cursor.buffs); + DBUG_RETURN(1); +} + + +/** + @brief Write variable record in 1 chunk. + + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param buffer_to_flush Buffer which have to be flushed if it is not 0 + @param header_length Calculated header length of chunk type 0 + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. 
+ + @note + We must have a translog_lock() when entering this function + We must have buffer_to_flush locked (if not null) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool +translog_write_variable_record_1chunk(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + struct st_translog_buffer + *buffer_to_flush, uint16 header_length, + TRN *trn, void *hook_arg) +{ + int rc; + uchar chunk0_header[1 + 2 + 5 + 2]; + DBUG_ENTER("translog_write_variable_record_1chunk"); + translog_lock_assert_owner(); + if (buffer_to_flush) + translog_buffer_lock_assert_owner(buffer_to_flush); + + translog_write_variable_record_1group_header(parts, type, short_trid, + header_length, chunk0_header); + set_lsn(lsn, log_descriptor.horizon); + if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn), + *lsn, TRUE) || + (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info, + lsn, hook_arg))) + { + translog_unlock(); + rc= 1; + goto err; + } + + rc= translog_write_parts_on_page(&log_descriptor.horizon, + &log_descriptor.bc, + parts->total_record_length, parts); + log_descriptor.bc.buffer->last_lsn= *lsn; + DBUG_PRINT("info", ("last_lsn set to " LSN_FMT " buffer: %p", + LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn), + log_descriptor.bc.buffer)); + translog_unlock(); + + /* + check if we switched buffer and need process it (current buffer is + unlocked already => we will not delay other threads + */ +err: + if (buffer_to_flush != NULL) + { + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + + DBUG_RETURN(rc); +} + + +/* + @brief Calculates and write LSN difference (compressed LSN). + + @param base_lsn LSN from which we calculate difference + @param lsn LSN for codding + @param dst Result will be written to dst[-pack_length] .. 
dst[-1] + + @note To store an LSN in a compact way we will use the following compression: + If a log record has LSN1, and it contains the LSN2 as a back reference, + Instead of LSN2 we write LSN1-LSN2, encoded as: + two bits the number N (see below) + 14 bits + N bytes + That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2 + is stored in the first two bits. + + @note function made to write the result in backward direction with no + special sense or tricks both directions are equal in complicity + + @retval # pointer on coded LSN +*/ + +static uchar *translog_put_LSN_diff(LSN base_lsn, LSN lsn, uchar *dst) +{ + uint64 diff; + DBUG_ENTER("translog_put_LSN_diff"); + DBUG_PRINT("enter", ("Base: " LSN_FMT " val: " LSN_FMT " dst: %p", + LSN_IN_PARTS(base_lsn), LSN_IN_PARTS(lsn), + dst)); + DBUG_ASSERT(base_lsn > lsn); + diff= base_lsn - lsn; + DBUG_PRINT("info", ("Diff: 0x%llx", (ulonglong) diff)); + if (diff <= 0x3FFF) + { + dst-= 2; + /* + Note we store this high uchar first to ensure that first uchar has + 0 in the 3 upper bits. 
+ */ + dst[0]= (uchar)(diff >> 8); + dst[1]= (uchar)(diff & 0xFF); + } + else if (diff <= 0x3FFFFFL) + { + dst-= 3; + dst[0]= (uchar)(0x40 | (diff >> 16)); + int2store(dst + 1, diff & 0xFFFF); + } + else if (diff <= 0x3FFFFFFFL) + { + dst-= 4; + dst[0]= (uchar)(0x80 | (diff >> 24)); + int3store(dst + 1, diff & 0xFFFFFFL); + } + else if (diff <= 0x3FFFFFFFFFLL) + + { + dst-= 5; + dst[0]= (uchar)(0xC0 | (diff >> 32)); + int4store(dst + 1, diff & 0xFFFFFFFFL); + } + else + { + /* + It is full LSN after special 1 diff (which is impossible + in real life) + */ + dst-= 2 + LSN_STORE_SIZE; + dst[0]= 0; + dst[1]= 1; + lsn_store(dst + 2, lsn); + } + DBUG_PRINT("info", ("new dst: %p", dst)); + DBUG_RETURN(dst); +} + + +/* + Get LSN from LSN-difference (compressed LSN) + + SYNOPSIS + translog_get_LSN_from_diff() + base_lsn LSN from which we calculate difference + src pointer to coded lsn + dst pointer to buffer where to write 7byte LSN + + NOTE: + To store an LSN in a compact way we will use the following compression: + + If a log record has LSN1, and it contains the lSN2 as a back reference, + Instead of LSN2 we write LSN1-LSN2, encoded as: + + two bits the number N (see below) + 14 bits + N bytes + + That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2 + is stored in the first two bits. 
+ + RETURN + pointer to buffer after decoded LSN +*/ + +static uchar *translog_get_LSN_from_diff(LSN base_lsn, uchar *src, uchar *dst) +{ + LSN lsn; + uint32 diff; + uint32 first_byte; + uint32 file_no, rec_offset; + uint8 code; + DBUG_ENTER("translog_get_LSN_from_diff"); + DBUG_PRINT("enter", ("Base: " LSN_FMT " src:%p dst %p", + LSN_IN_PARTS(base_lsn), src, dst)); + first_byte= *((uint8*) src); + code= first_byte >> 6; /* Length is in 2 most significant bits */ + first_byte&= 0x3F; + src++; /* Skip length + encode */ + file_no= LSN_FILE_NO(base_lsn); /* Assume relative */ + DBUG_PRINT("info", ("code: %u first byte: %lu", + (uint) code, (ulong) first_byte)); + switch (code) { + case 0: + if (first_byte == 0 && *((uint8*)src) == 1) + { + /* + It is full LSN after special 1 diff (which is impossible + in real life) + */ + memcpy(dst, src + 1, LSN_STORE_SIZE); + DBUG_PRINT("info", ("Special case of full LSN, new src:%p", + src + 1 + LSN_STORE_SIZE)); + DBUG_RETURN(src + 1 + LSN_STORE_SIZE); + } + rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 8) | *((uint8*)src)); + break; + case 1: + diff= uint2korr(src); + rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 16) | diff); + break; + case 2: + diff= uint3korr(src); + rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 24) | diff); + break; + case 3: + { + ulonglong base_offset= LSN_OFFSET(base_lsn); + diff= uint4korr(src); + if (diff > LSN_OFFSET(base_lsn)) + { + /* take 1 from file offset */ + first_byte++; + base_offset+= 0x100000000LL; + } + file_no= LSN_FILE_NO(base_lsn) - first_byte; + DBUG_ASSERT(base_offset - diff <= UINT_MAX); + rec_offset= (uint32)(base_offset - diff); + break; + } + default: + DBUG_ASSERT(0); + DBUG_RETURN(NULL); + } + lsn= MAKE_LSN(file_no, rec_offset); + src+= code + 1; + lsn_store(dst, lsn); + DBUG_PRINT("info", ("new src:%p", src)); + DBUG_RETURN(src); +} + + +/** + @brief Encodes relative LSNs listed in the parameters. 
+ + @param parts Parts list with encoded LSN(s) + @param base_lsn LSN which is base for encoding + @param lsns number of LSN(s) to encode + @param compressed_LSNs buffer which can be used for storing compressed LSN(s) +*/ + +static void translog_relative_LSN_encode(struct st_translog_parts *parts, + LSN base_lsn, + uint lsns, uchar *compressed_LSNs) +{ + LEX_CUSTRING *part; + uint lsns_len= lsns * LSN_STORE_SIZE; + uchar buffer_src[MAX_NUMBER_OF_LSNS_PER_RECORD * LSN_STORE_SIZE]; + uchar *buffer= buffer_src; + const uchar *cbuffer; + + DBUG_ENTER("translog_relative_LSN_encode"); + + DBUG_ASSERT(parts->current != 0); + part= parts->parts + parts->current; + + /* collect all LSN(s) in one chunk if it (they) is (are) divided */ + if (part->length < lsns_len) + { + size_t copied= part->length; + LEX_CUSTRING *next_part; + DBUG_PRINT("info", ("Using buffer:%p", compressed_LSNs)); + memcpy(buffer, part->str, part->length); + next_part= parts->parts + parts->current + 1; + do + { + DBUG_ASSERT(next_part < parts->parts + parts->elements); + if ((next_part->length + copied) < lsns_len) + { + memcpy(buffer + copied, next_part->str, + next_part->length); + copied+= next_part->length; + next_part->length= 0; next_part->str= 0; + /* delete_dynamic_element(&parts->parts, parts->current + 1); */ + next_part++; + parts->current++; + part= parts->parts + parts->current; + } + else + { + size_t len= lsns_len - copied; + memcpy(buffer + copied, next_part->str, len); + copied= lsns_len; + next_part->str+= len; + next_part->length-= len; + } + } while (copied < lsns_len); + cbuffer= buffer; + } + else + { + cbuffer= part->str; + part->str+= lsns_len; + part->length-= lsns_len; + parts->current--; + part= parts->parts + parts->current; + } + + { + /* Compress */ + LSN ref; + int economy; + const uchar *src_ptr; + uchar *dst_ptr= compressed_LSNs + (MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE); + /* + We write the result in backward direction with no special sense or + 
tricks both directions are equal in complicity + */ + for (src_ptr= cbuffer + lsns_len - LSN_STORE_SIZE; + src_ptr >= (const uchar*)cbuffer; + src_ptr-= LSN_STORE_SIZE) + { + ref= lsn_korr(src_ptr); + dst_ptr= translog_put_LSN_diff(base_lsn, ref, dst_ptr); + } + part->length= (size_t)((compressed_LSNs + + (MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE)) - + dst_ptr); + economy= lsns_len - (uint)part->length; + parts->record_length-= economy; + DBUG_PRINT("info", ("new length of LSNs: %lu economy: %d", + (ulong)part->length, economy)); + parts->total_record_length-= economy; + part->str= dst_ptr; + } + DBUG_VOID_RETURN; +} + + +/** + @brief Write multi-group variable-size record. + + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param buffer_to_flush Buffer which have to be flushed if it is not 0 + @param header_length Header length calculated for 1 group + @param buffer_rest Beginning from which we plan to write in full pages + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. + + @note + We must have a translog_lock() when entering this function + + We must have buffer_to_flush locked (if not null) + buffer_to_flush should *NOT* be locked when calling this function. 
+ (This is note is here as this is different from most other + translog_write...() functions which require the buffer to be locked) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool +translog_write_variable_record_mgroup(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + struct st_translog_buffer + *buffer_to_flush, + uint16 header_length, + translog_size_t buffer_rest, + TRN *trn, void *hook_arg) +{ + TRANSLOG_ADDRESS horizon; + struct st_buffer_cursor cursor; + int rc= 0; + size_t i, curr_group= 0; + uint chunk2_page, full_pages; + translog_size_t record_rest, first_page, chunk3_pages, chunk0_pages= 1; + translog_size_t done= 0; + struct st_translog_group_descriptor group; + DYNAMIC_ARRAY groups; + uint16 chunk3_size; + uint16 page_capacity= log_descriptor.page_capacity_chunk_2 + 1; + uint16 last_page_capacity; + my_bool new_page_before_chunk0= 1, first_chunk0= 1; + uchar chunk0_header[1 + 2 + 5 + 2 + 2], group_desc[7 + 1]; + uchar chunk2_header[1]; + uint header_fixed_part= header_length + 2; + uint groups_per_page= (page_capacity - header_fixed_part) / (7 + 1); + uint file_of_the_first_group; + int pages_to_skip; + struct st_translog_buffer *buffer_of_last_lsn; + my_bool external_buffer_to_flush= TRUE; + DBUG_ENTER("translog_write_variable_record_mgroup"); + translog_lock_assert_owner(); + + used_buffs_init(&cursor.buffs); + chunk2_header[0]= TRANSLOG_CHUNK_NOHDR; + + if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &groups, + sizeof(struct st_translog_group_descriptor), + 10, 10, MYF(0))) + { + translog_unlock(); + if (buffer_to_flush != NULL) + { + translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + DBUG_PRINT("error", ("init array failed")); + DBUG_RETURN(1); + } + + first_page= translog_get_current_page_rest(); + record_rest= parts->record_length - (first_page - 1); + DBUG_PRINT("info", ("Record Rest: 
%lu", (ulong) record_rest)); + + if (record_rest < buffer_rest) + { + /* + The record (group 1 type) is larger than the free space on the page + - we need to split it in two. But when we split it in two, the first + part is big enough to hold all the data of the record (because the + header of the first part of the split is smaller than the header of + the record as a whole when it takes only one chunk) + */ + DBUG_PRINT("info", ("too many free space because changing header")); + buffer_rest-= log_descriptor.page_capacity_chunk_2; + DBUG_ASSERT(record_rest >= buffer_rest); + } + + file_of_the_first_group= LSN_FILE_NO(log_descriptor.horizon); + translog_mark_file_unfinished(file_of_the_first_group); + do + { + DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); + group.addr= horizon= log_descriptor.horizon; + cursor= log_descriptor.bc; + cursor.chaser= 1; + if ((full_pages= buffer_rest / log_descriptor.page_capacity_chunk_2) > 255) + { + /* sizeof(uint8) == 256 is max number of chunk in multi-chunks group */ + full_pages= 255; + buffer_rest= full_pages * log_descriptor.page_capacity_chunk_2; + } + /* + group chunks = + full pages + first page (which actually can be full, too). 
+ But here we assign number of chunks - 1 + */ + group.num= full_pages; + if (insert_dynamic(&groups, (uchar*) &group)) + { + DBUG_PRINT("error", ("insert into array failed")); + goto err_unlock; + } + + DBUG_PRINT("info", ("chunk: #%u first_page: %u (%u) " + "full_pages: %lu (%lu) " + "Left %lu", + groups.elements, + first_page, first_page - 1, + (ulong) full_pages, + (ulong) (full_pages * + log_descriptor.page_capacity_chunk_2), + (ulong)(parts->record_length - (first_page - 1 + + buffer_rest) - + done))); + rc= translog_advance_pointer((int)full_pages, 0, &cursor.buffs); + + translog_unlock(); + + if (buffer_to_flush != NULL) + { + if (!external_buffer_to_flush) + translog_buffer_decrease_writers(buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + buffer_to_flush= NULL; + } + external_buffer_to_flush= FALSE; + + if (rc) + { + DBUG_PRINT("error", ("flush of unlock buffer failed")); + //translog_advance_pointer decreased writers so it is OK + DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); + goto err; + } + + translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header); + translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts); + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT " " + "Left %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + done))); + + for (i= 0; i < full_pages; i++) + { + if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) + goto err; + + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " " + "local: " LSN_FMT " " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + i * log_descriptor.page_capacity_chunk_2 - + done))); + } + + done+= (first_page - 1 + buffer_rest); + + if (translog_chaser_page_next(&horizon, &cursor)) + { + DBUG_PRINT("error", ("flush of 
unlock buffer failed")); + goto err; + } + translog_buffer_lock(cursor.buffer); + translog_buffer_decrease_writers(cursor.buffer); + used_buffs_register_unlock(&cursor.buffs, cursor.buffer); + translog_buffer_unlock(cursor.buffer); + + translog_lock(); + + /* Check that we have place for chunk type 2 */ + first_page= translog_get_current_page_rest(); + if (first_page <= 1) + { + if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, + &buffer_to_flush)) + goto err_unlock; + first_page= translog_get_current_page_rest(); + } + buffer_rest= translog_get_current_group_size(); + + if (buffer_to_flush) + used_buffs_register_unlock(&cursor.buffs, + buffer_to_flush); // will be unlocked + + } while ((translog_size_t)(first_page + buffer_rest) < + (translog_size_t)(parts->record_length - done)); + + group.addr= horizon= log_descriptor.horizon; + cursor= log_descriptor.bc; + cursor.chaser= 1; + group.num= 0; /* 0 because it does not matter */ + if (insert_dynamic(&groups, (uchar*) &group)) + { + DBUG_PRINT("error", ("insert into array failed")); + goto err_unlock; + } + record_rest= parts->record_length - done; + DBUG_PRINT("info", ("Record rest: %lu", (ulong) record_rest)); + if (first_page > record_rest + 1) + { + /* + We have not so much data to fill all first page + (no speaking about full pages) + so it will be: + <chunk0 <data>> + or + <chunk0>...<chunk0><chunk0 <data>> + or + <chunk3 <data>><chunk0>...<chunk0><chunk0 <possible data of 1 byte>> + */ + chunk2_page= full_pages= 0; + last_page_capacity= first_page; + pages_to_skip= -1; + } + else + { + /* + We will have: + <chunk2 <data>>...<chunk2 <data>><chunk0 <data>> + or + <chunk2 <data>>...<chunk2 <data>><chunk0>...<chunk0><chunk0 <data>> + or + <chunk3 <data>><chunk0>...<chunk0><chunk0 <possible data of 1 byte>> + */ + chunk2_page= 1; + record_rest-= (first_page - 1); + pages_to_skip= full_pages= + record_rest / log_descriptor.page_capacity_chunk_2; + record_rest= (record_rest % 
log_descriptor.page_capacity_chunk_2); + last_page_capacity= page_capacity; + } + chunk3_size= 0; + chunk3_pages= 0; + if (last_page_capacity > record_rest + 1 && record_rest != 0) + { + if (last_page_capacity > + record_rest + header_fixed_part + groups.elements * (7 + 1)) + { + /* 1 record of type 0 */ + chunk3_pages= 0; + } + else + { + pages_to_skip++; + chunk3_pages= 1; + if (record_rest + 2 == last_page_capacity) + { + chunk3_size= record_rest - 1; + record_rest= 1; + } + else + { + chunk3_size= record_rest; + record_rest= 0; + } + } + } + /* + A first non-full page will hold type 0 chunk only if it fit in it with + all its headers + */ + while (page_capacity < + record_rest + header_fixed_part + + (groups.elements - groups_per_page * (chunk0_pages - 1)) * (7 + 1)) + chunk0_pages++; + DBUG_PRINT("info", ("chunk0_pages: %u groups %u groups per full page: %u " + "Group on last page: %u", + chunk0_pages, groups.elements, + groups_per_page, + (groups.elements - + ((page_capacity - header_fixed_part) / (7 + 1)) * + (chunk0_pages - 1)))); + DBUG_PRINT("info", ("first_page: %u chunk2: %u full_pages: %u (%lu) " + "chunk3: %u (%u) rest: %u", + first_page, + chunk2_page, full_pages, + (ulong) full_pages * + log_descriptor.page_capacity_chunk_2, + chunk3_pages, (uint) chunk3_size, (uint) record_rest)); + + DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); + rc= translog_advance_pointer(pages_to_skip + (int)(chunk0_pages - 1), + (uint16)(record_rest + header_fixed_part + + ((uint)groups.elements - + ((page_capacity - + header_fixed_part) / (7 + 1)) * + (chunk0_pages - 1)) * (7 + 1)), + &cursor.buffs); + buffer_of_last_lsn= log_descriptor.bc.buffer; + translog_unlock(); + + if (buffer_to_flush != NULL) + { + DBUG_ASSERT(!external_buffer_to_flush); + translog_buffer_decrease_writers(buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + buffer_to_flush= NULL; + } + if (rc) + { + DBUG_PRINT("error", 
("flush of unlock buffer failed")); + goto err; + } + + if (rc) + goto err; + + if (chunk2_page) + { + DBUG_PRINT("info", ("chunk 2 to finish first page")); + translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header); + translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts); + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT " " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + done))); + } + else if (chunk3_pages) + { + uchar chunk3_header[3]; + DBUG_PRINT("info", ("chunk 3")); + DBUG_ASSERT(full_pages == 0); + chunk3_pages= 0; + chunk3_header[0]= TRANSLOG_CHUNK_LNGTH; + int2store(chunk3_header + 1, chunk3_size); + translog_write_data_on_page(&horizon, &cursor, 3, chunk3_header); + translog_write_parts_on_page(&horizon, &cursor, chunk3_size, parts); + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT " " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - chunk3_size - done))); + } + else + { + DBUG_PRINT("info", ("no new_page_before_chunk0")); + new_page_before_chunk0= 0; + } + + for (i= 0; i < full_pages; i++) + { + DBUG_ASSERT(chunk2_page != 0); + if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor)) + goto err; + + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT " " + "Left: %lu", + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon), + (ulong) (parts->record_length - (first_page - 1) - + i * log_descriptor.page_capacity_chunk_2 - + done))); + } + + if (chunk3_pages && + translog_write_variable_record_chunk3_page(parts, + chunk3_size, + &horizon, &cursor)) + goto err; + DBUG_PRINT("info", ("absolute horizon: " LSN_FMT " local: " LSN_FMT, + LSN_IN_PARTS(log_descriptor.horizon), + LSN_IN_PARTS(horizon))); + + *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN); + int2store(chunk0_header + 1, short_trid); + 
translog_write_variable_record_1group_code_len(chunk0_header + 3, + parts->record_length, + header_length); + do + { + size_t limit; + if (new_page_before_chunk0 && + translog_chaser_page_next(&horizon, &cursor)) + { + DBUG_PRINT("error", ("flush of unlock buffer failed")); + goto err; + } + new_page_before_chunk0= 1; + + if (first_chunk0) + { + first_chunk0= 0; + + /* + We can drop "log_descriptor.is_everything_flushed" earlier when have + lock on loghandler and assign initial value of "horizon" variable or + before unlocking loghandler (because we will increase writers + counter on the buffer and every thread which wanted flush the buffer + will wait till we finish with it). But IMHO better here take short + lock and do not bother other threads with waiting. + */ + translog_lock(); + set_lsn(lsn, horizon); + buffer_of_last_lsn->last_lsn= *lsn; + DBUG_PRINT("info", ("last_lsn set to " LSN_FMT " buffer: %p", + LSN_IN_PARTS(buffer_of_last_lsn->last_lsn), + buffer_of_last_lsn)); + if (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook) (type, trn, + tbl_info, + lsn, hook_arg)) + goto err_unlock; + translog_unlock(); + } + + /* + A first non-full page will hold type 0 chunk only if it fit in it with + all its headers => the fist page is full or number of groups less then + possible number of full page. + */ + limit= (groups_per_page < groups.elements - curr_group ? 
+ groups_per_page : groups.elements - curr_group); + DBUG_PRINT("info", ("Groups: %zu curr: %zu limit: %zu", + groups.elements, curr_group, limit)); + + if (chunk0_pages == 1) + { + DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) + %u = %u", + (uint) limit, (uint) record_rest, + (uint) (2 + limit * (7 + 1) + record_rest))); + int2store(chunk0_header + header_length - 2, + 2 + limit * (7 + 1) + record_rest); + } + else + { + DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) = %u", + (uint) limit, (uint) (2 + limit * (7 + 1)))); + int2store(chunk0_header + header_length - 2, 2 + limit * (7 + 1)); + } + int2store(chunk0_header + header_length, groups.elements - curr_group); + translog_write_data_on_page(&horizon, &cursor, header_fixed_part, + chunk0_header); + for (i= curr_group; i < limit + curr_group; i++) + { + struct st_translog_group_descriptor *grp_ptr; + grp_ptr= dynamic_element(&groups, i, + struct st_translog_group_descriptor *); + lsn_store(group_desc, grp_ptr->addr); + group_desc[7]= grp_ptr->num; + translog_write_data_on_page(&horizon, &cursor, (7 + 1), group_desc); + } + + if (chunk0_pages == 1 && record_rest != 0) + translog_write_parts_on_page(&horizon, &cursor, record_rest, parts); + + chunk0_pages--; + curr_group+= limit; + /* put special type to indicate that it is not LSN chunk */ + *chunk0_header= (uchar) (TRANSLOG_CHUNK_LSN | TRANSLOG_CHUNK_0_CONT); + } while (chunk0_pages != 0); + translog_buffer_lock(cursor.buffer); + translog_buffer_decrease_writers(cursor.buffer); + used_buffs_register_unlock(&cursor.buffs, cursor.buffer); + translog_buffer_unlock(cursor.buffer); + rc= 0; + DBUG_ASSERT(cursor.buffs.unlck_ptr == cursor.buffs.wrt_ptr); + + if (translog_set_lsn_for_files(file_of_the_first_group, LSN_FILE_NO(*lsn), + *lsn, FALSE)) + goto err; + + translog_mark_file_finished(file_of_the_first_group); + + delete_dynamic(&groups); + DBUG_RETURN(0); + +err_unlock: + + translog_unlock(); + +err: + + if (cursor.buffs.unlck_ptr != cursor.buffs.wrt_ptr) + 
used_buffs_urgent_unlock(&cursor.buffs); + + if (buffer_to_flush != NULL) + { + /* This is to prevent locking buffer forever in case of error */ + if (!external_buffer_to_flush) + translog_buffer_decrease_writers(buffer_to_flush); + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + buffer_to_flush= NULL; + } + + + translog_mark_file_finished(file_of_the_first_group); + + delete_dynamic(&groups); + DBUG_RETURN(1); +} + + +/** + @brief Write the variable length log record. + + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_write_variable_record(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + TRN *trn, void *hook_arg) +{ + struct st_translog_buffer *buffer_to_flush= NULL; + uint header_length1= 1 + 2 + 2 + + translog_variable_record_length_bytes(parts->record_length); + ulong buffer_rest; + uint page_rest; + /* Max number of such LSNs per record is 2 */ + uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE]; + my_bool res; + DBUG_ENTER("translog_write_variable_record"); + + translog_lock(); + DBUG_PRINT("info", ("horizon: " LSN_FMT, + LSN_IN_PARTS(log_descriptor.horizon))); + page_rest= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill; + DBUG_PRINT("info", ("header length: %u page_rest: %u", + header_length1, page_rest)); + + /* + header and part which we should read have to fit in one chunk + TODO: allow to divide readable header + */ + if (page_rest < + 
(header_length1 + log_record_type_descriptor[type].read_header_len)) + { + DBUG_PRINT("info", + ("Next page, size: %u header: %u + %u", + log_descriptor.bc.current_page_fill, + header_length1, + log_record_type_descriptor[type].read_header_len)); + translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, + &buffer_to_flush); + /* Chunk 2 header is 1 byte, so full page capacity will be one uchar more */ + page_rest= log_descriptor.page_capacity_chunk_2 + 1; + DBUG_PRINT("info", ("page_rest: %u", page_rest)); + } + + /* + To minimize compressed size we will compress always relative to + very first chunk address (log_descriptor.horizon for now) + */ + if (log_record_type_descriptor[type].compressed_LSN > 0) + { + translog_relative_LSN_encode(parts, log_descriptor.horizon, + log_record_type_descriptor[type]. + compressed_LSN, compressed_LSNs); + /* recalculate header length after compression */ + header_length1= 1 + 2 + 2 + + translog_variable_record_length_bytes(parts->record_length); + DBUG_PRINT("info", ("after compressing LSN(s) header length: %u " + "record length: %lu", + header_length1, (ulong)parts->record_length)); + } + + /* TODO: check space on current page for header + few bytes */ + if (page_rest >= parts->record_length + header_length1) + { + /* following function makes translog_unlock(); */ + res= translog_write_variable_record_1chunk(lsn, type, tbl_info, + short_trid, + parts, buffer_to_flush, + header_length1, trn, hook_arg); + DBUG_RETURN(res); + } + + buffer_rest= translog_get_current_group_size(); + + if (buffer_rest >= parts->record_length + header_length1 - page_rest) + { + /* following function makes translog_unlock(); */ + res= translog_write_variable_record_1group(lsn, type, tbl_info, + short_trid, + parts, buffer_to_flush, + header_length1, trn, hook_arg); + DBUG_RETURN(res); + } + /* following function makes translog_unlock(); */ + res= translog_write_variable_record_mgroup(lsn, type, tbl_info, + short_trid, + parts, buffer_to_flush, + 
header_length1, + buffer_rest, trn, hook_arg); + DBUG_RETURN(res); +} + + +/** + @brief Write the fixed and pseudo-fixed log record. + + @param lsn LSN of the record will be written here + @param type the log record type + @param short_trid Short transaction ID or 0 if it has no sense + @param parts Descriptor of record source parts + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param hook_arg Argument which will be passed to pre-write and + in-write hooks of this record. + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_write_fixed_record(LSN *lsn, + enum translog_record_type type, + MARIA_HA *tbl_info, + SHORT_TRANSACTION_ID short_trid, + struct st_translog_parts *parts, + TRN *trn, void *hook_arg) +{ + struct st_translog_buffer *buffer_to_flush= NULL; + uchar chunk1_header[1 + 2]; + /* Max number of such LSNs per record is 2 */ + uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD * + COMPRESSED_LSN_MAX_STORE_SIZE]; + LEX_CUSTRING *part; + int rc= 1; + DBUG_ENTER("translog_write_fixed_record"); + DBUG_ASSERT((log_record_type_descriptor[type].rclass == + LOGRECTYPE_FIXEDLENGTH && + parts->record_length == + log_record_type_descriptor[type].fixed_length) || + (log_record_type_descriptor[type].rclass == + LOGRECTYPE_PSEUDOFIXEDLENGTH && + parts->record_length == + log_record_type_descriptor[type].fixed_length)); + + translog_lock(); + DBUG_PRINT("info", ("horizon: " LSN_FMT, + LSN_IN_PARTS(log_descriptor.horizon))); + + DBUG_ASSERT(log_descriptor.bc.current_page_fill <= TRANSLOG_PAGE_SIZE); + DBUG_PRINT("info", + ("Page size: %u record: %u next cond: %d", + log_descriptor.bc.current_page_fill, + (parts->record_length + + log_record_type_descriptor[type].compressed_LSN * 2 + 3), + ((((uint) log_descriptor.bc.current_page_fill) + + (parts->record_length + + log_record_type_descriptor[type].compressed_LSN * 2 + 3)) > + TRANSLOG_PAGE_SIZE))); + /* + check that there is enough place 
on current page. + NOTE: compressing may increase page LSN size on two bytes for every LSN + */ + if ((((uint) log_descriptor.bc.current_page_fill) + + (parts->record_length + + log_record_type_descriptor[type].compressed_LSN * 2 + 3)) > + TRANSLOG_PAGE_SIZE) + { + DBUG_PRINT("info", ("Next page")); + if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc, + &buffer_to_flush)) + goto err; /* rc == 1 */ + if (buffer_to_flush) + translog_buffer_lock_assert_owner(buffer_to_flush); + } + + set_lsn(lsn, log_descriptor.horizon); + if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn), + *lsn, TRUE) || + (log_record_type_descriptor[type].inwrite_hook && + (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info, + lsn, hook_arg))) + goto err; + + /* compress LSNs */ + if (log_record_type_descriptor[type].rclass == + LOGRECTYPE_PSEUDOFIXEDLENGTH) + { + DBUG_ASSERT(log_record_type_descriptor[type].compressed_LSN > 0); + translog_relative_LSN_encode(parts, *lsn, + log_record_type_descriptor[type]. 
+ compressed_LSN, compressed_LSNs); + } + + /* + Write the whole record at once (we know that there is enough place on + the destination page) + */ + DBUG_ASSERT(parts->current != 0); /* first part is left for header */ + part= parts->parts + (--parts->current); + parts->total_record_length+= (translog_size_t) (part->length= 1 + 2); + part->str= chunk1_header; + *chunk1_header= (uchar) (type | TRANSLOG_CHUNK_FIXED); + int2store(chunk1_header + 1, short_trid); + + rc= translog_write_parts_on_page(&log_descriptor.horizon, + &log_descriptor.bc, + parts->total_record_length, parts); + + log_descriptor.bc.buffer->last_lsn= *lsn; + DBUG_PRINT("info", ("last_lsn set to " LSN_FMT " buffer: %p", + LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn), + log_descriptor.bc.buffer)); + +err: + translog_unlock(); + + /* + check if we switched buffer and need process it (current buffer is + unlocked already => we will not delay other threads + */ + if (buffer_to_flush != NULL) + { + if (!rc) + rc= translog_buffer_flush(buffer_to_flush); + translog_buffer_unlock(buffer_to_flush); + } + + DBUG_RETURN(rc); +} + + +/** + @brief Writes the log record + + If share has no 2-byte-id yet, gives an id to the share and logs + LOGREC_FILE_ID. If transaction has not logged LOGREC_LONG_TRANSACTION_ID + yet, logs it. 
+ + @param lsn LSN of the record will be written here + @param type the log record type + @param trn Transaction structure pointer for hooks by + record log type, for short_id + @param tbl_info MARIA_HA of table or NULL + @param rec_len record length or 0 (count it) + @param part_no number of parts or 0 (count it) + @param parts_data zero ended (in case of number of parts is 0) + array of LEX_STRINGs (parts), first + TRANSLOG_INTERNAL_PARTS positions in the log + should be unused (need for loghandler) + @param store_share_id if tbl_info!=NULL then share's id will + automatically be stored in the two first bytes + pointed (so pointer is assumed to be !=NULL) + @param hook_arg argument which will be passed to pre-write and + in-write hooks of this record. + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_write_record(LSN *lsn, + enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + translog_size_t rec_len, + uint part_no, + LEX_CUSTRING *parts_data, + uchar *store_share_id, + void *hook_arg) +{ + struct st_translog_parts parts; + LEX_CUSTRING *part; + int rc; + uint short_trid= trn->short_id; + DBUG_ENTER("translog_write_record"); + DBUG_PRINT("enter", ("type: %u (%s) ShortTrID: %u rec_len: %lu", + (uint) type, log_record_type_descriptor[type].name, + (uint) short_trid, (ulong) rec_len)); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + DBUG_ASSERT(type != 0); + DBUG_SLOW_ASSERT((uint)type <= max_allowed_translog_type); + if (unlikely(translog_status != TRANSLOG_OK)) + { + DBUG_PRINT("error", ("Transaction log is write protected")); + DBUG_RETURN(1); + } + + if (tbl_info && type != LOGREC_FILE_ID) + { + MARIA_SHARE *share= tbl_info->s; + DBUG_ASSERT(share->now_transactional); + if (unlikely(share->id == 0)) + { + /* + First log write for this MARIA_SHARE; give it a short id. 
+ When the lock manager is enabled and needs a short id, it should be + assigned in the lock manager (because row locks will be taken before + log records are written; for example SELECT FOR UPDATE takes locks but + writes no log record. + */ + if (unlikely(translog_assign_id_to_share(tbl_info, trn))) + DBUG_RETURN(1); + } + fileid_store(store_share_id, share->id); + } + if (unlikely(!(trn->first_undo_lsn & TRANSACTION_LOGGED_LONG_ID))) + { + LSN dummy_lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[6]; + DBUG_ASSERT(trn->undo_lsn == LSN_IMPOSSIBLE); + int6store(log_data, trn->trid); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; /* no recursion */ + if (unlikely(translog_write_record(&dummy_lsn, LOGREC_LONG_TRANSACTION_ID, + trn, NULL, sizeof(log_data), + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL))) + DBUG_RETURN(1); + } + + parts.parts= parts_data; + + /* count parts if they are not counted by upper level */ + if (part_no == 0) + { + for (part_no= TRANSLOG_INTERNAL_PARTS; + parts_data[part_no].length != 0; + part_no++); + } + parts.elements= part_no; + parts.current= TRANSLOG_INTERNAL_PARTS; + + /* clear TRANSLOG_INTERNAL_PARTS */ + compile_time_assert(TRANSLOG_INTERNAL_PARTS != 0); + parts_data[0].str= 0; + parts_data[0].length= 0; + + /* count length of the record */ + if (rec_len == 0) + { + for(part= parts_data + TRANSLOG_INTERNAL_PARTS;\ + part < parts_data + part_no; + part++) + { + rec_len+= (translog_size_t) part->length; + } + } + parts.record_length= rec_len; + +#ifndef DBUG_OFF + { + uint i; + size_t len= 0; +#ifdef HAVE_valgrind + ha_checksum checksum= 0; +#endif + for (i= TRANSLOG_INTERNAL_PARTS; i < part_no; i++) + { +#ifdef HAVE_valgrind + /* Find unitialized bytes early */ + checksum+= my_checksum(checksum, parts_data[i].str, + parts_data[i].length); +#endif + len+= 
parts_data[i].length; + } + DBUG_ASSERT(len == rec_len); + } +#endif + /* + Start total_record_length from record_length then overhead will + be add + */ + parts.total_record_length= parts.record_length; + DBUG_PRINT("info", ("record length: %lu", (ulong) parts.record_length)); + + /* process this parts */ + if (!(rc= (log_record_type_descriptor[type].prewrite_hook && + (*log_record_type_descriptor[type].prewrite_hook)(type, trn, + tbl_info, + hook_arg)))) + { + switch (log_record_type_descriptor[type].rclass) { + case LOGRECTYPE_VARIABLE_LENGTH: + rc= translog_write_variable_record(lsn, type, tbl_info, + short_trid, &parts, trn, hook_arg); + break; + case LOGRECTYPE_PSEUDOFIXEDLENGTH: + case LOGRECTYPE_FIXEDLENGTH: + rc= translog_write_fixed_record(lsn, type, tbl_info, + short_trid, &parts, trn, hook_arg); + break; + case LOGRECTYPE_NOT_ALLOWED: + default: + DBUG_ASSERT(0); + rc= 1; + } + } + + DBUG_PRINT("info", ("LSN: " LSN_FMT, LSN_IN_PARTS(*lsn))); + DBUG_RETURN(rc); +} + + +/* + Decode compressed (relative) LSN(s) + + SYNOPSIS + translog_relative_lsn_decode() + base_lsn LSN for encoding + src Decode LSN(s) from here + dst Put decoded LSNs here + lsns number of LSN(s) + + RETURN + position in sources after decoded LSN(s) +*/ + +static uchar *translog_relative_LSN_decode(LSN base_lsn, + uchar *src, uchar *dst, uint lsns) +{ + uint i; + for (i= 0; i < lsns; i++, dst+= LSN_STORE_SIZE) + { + src= translog_get_LSN_from_diff(base_lsn, src, dst); + } + return src; +} + +/** + @brief Get header of fixed/pseudo length record and call hook for + it processing + + @param page Pointer to the buffer with page where LSN chunk is + placed + @param page_offset Offset of the first chunk in the page + @param buff Buffer to be filled with header data + + @return Length of header or operation status + @retval # number of bytes in TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +static int translog_fixed_length_header(uchar *page, + translog_size_t 
page_offset, + TRANSLOG_HEADER_BUFFER *buff) +{ + struct st_log_record_type_descriptor *desc= + log_record_type_descriptor + buff->type; + uchar *src= page + page_offset + 3; + uchar *dst= buff->header; + uchar *start= src; + int lsns= desc->compressed_LSN; + uint length= desc->fixed_length; + DBUG_ENTER("translog_fixed_length_header"); + + buff->record_length= length; + + if (desc->rclass == LOGRECTYPE_PSEUDOFIXEDLENGTH) + { + DBUG_ASSERT(lsns > 0); + src= translog_relative_LSN_decode(buff->lsn, src, dst, lsns); + lsns*= LSN_STORE_SIZE; + dst+= lsns; + length-= lsns; + buff->compressed_LSN_economy= (lsns - (int) (src - start)); + } + else + buff->compressed_LSN_economy= 0; + + memcpy(dst, src, length); + buff->non_header_data_start_offset= (uint16) (page_offset + + ((src + length) - + (page + page_offset))); + buff->non_header_data_len= 0; + DBUG_RETURN(buff->record_length); +} + + +/* + Free resources used by TRANSLOG_HEADER_BUFFER + + SYNOPSIS + translog_free_record_header(); +*/ + +void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff) +{ + DBUG_ENTER("translog_free_record_header"); + if (buff->groups_no != 0) + { + my_free(buff->groups); + buff->groups_no= 0; + } + DBUG_VOID_RETURN; +} + + +/** + @brief Returns the current horizon at the end of the current log + + @return Horizon + @retval LSN_ERROR error + @retvar # Horizon +*/ + +TRANSLOG_ADDRESS translog_get_horizon() +{ + TRANSLOG_ADDRESS res; + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + translog_lock(); + res= log_descriptor.horizon; + translog_unlock(); + return res; +} + + +/** + @brief Returns the current horizon at the end of the current log, caller is + assumed to already hold the lock + + @return Horizon + @retval LSN_ERROR error + @retvar # Horizon +*/ + +TRANSLOG_ADDRESS translog_get_horizon_no_lock() +{ + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + translog_lock_assert_owner(); + return 
log_descriptor.horizon; +} + + +/* + Set last page in the scanner data structure + + SYNOPSIS + translog_scanner_set_last_page() + scanner Information about current chunk during scanning + + RETURN + 0 OK + 1 Error +*/ + +static my_bool translog_scanner_set_last_page(TRANSLOG_SCANNER_DATA *scanner) +{ + my_bool page_ok; + if (LSN_FILE_NO(scanner->page_addr) == LSN_FILE_NO(scanner->horizon)) + { + /* It is last file => we can easy find last page address by horizon */ + uint pagegrest= LSN_OFFSET(scanner->horizon) % TRANSLOG_PAGE_SIZE; + scanner->last_file_page= (scanner->horizon - + (pagegrest ? pagegrest : TRANSLOG_PAGE_SIZE)); + return (0); + } + scanner->last_file_page= scanner->page_addr; + return (translog_get_last_page_addr(&scanner->last_file_page, &page_ok, 0)); +} + + +/** + @brief Get page from page cache according to requested method + + @param scanner The scanner data + + @return operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool +translog_scanner_get_page(TRANSLOG_SCANNER_DATA *scanner) +{ + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_scanner_get_page"); + data.addr= &scanner->page_addr; + data.was_recovered= 0; + DBUG_RETURN((scanner->page= + translog_get_page(&data, scanner->buffer, + (scanner->use_direct_link ? + &scanner->direct_link : + NULL))) == + NULL); +} + + +/** + @brief Initialize reader scanner. + + @param lsn LSN with which it have to be inited + @param fixed_horizon true if it is OK do not read records which was written + after scanning beginning + @param scanner scanner which have to be inited + @param use_direct prefer using direct lings from page handler + where it is possible. 
+ + @note If direct link was used translog_destroy_scanner should be + called after it using + + @return status of the operation + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_scanner_init(LSN lsn, + my_bool fixed_horizon, + TRANSLOG_SCANNER_DATA *scanner, + my_bool use_direct) +{ + DBUG_ENTER("translog_scanner_init"); + DBUG_PRINT("enter", ("Scanner: %p LSN: " LSN_FMT, + scanner, LSN_IN_PARTS(lsn))); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + scanner->page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE; + + scanner->fixed_horizon= fixed_horizon; + scanner->use_direct_link= use_direct; + scanner->direct_link= NULL; + + scanner->horizon= translog_get_horizon(); + DBUG_PRINT("info", ("horizon: " LSN_FMT, LSN_IN_PARTS(scanner->horizon))); + + /* lsn < horizon */ + DBUG_ASSERT(lsn <= scanner->horizon); + + scanner->page_addr= lsn; + scanner->page_addr-= scanner->page_offset; /*decrease offset */ + + if (translog_scanner_set_last_page(scanner)) + DBUG_RETURN(1); + + if (translog_scanner_get_page(scanner)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/** + @brief Destroy scanner object; + + @param scanner The scanner object to destroy +*/ + +void translog_destroy_scanner(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_destroy_scanner"); + DBUG_PRINT("enter", ("Scanner: %p", scanner)); + translog_free_link(scanner->direct_link); + DBUG_VOID_RETURN; +} + + +/* + Checks End of the Log + + SYNOPSIS + translog_scanner_eol() + scanner Information about current chunk during scanning + + RETURN + 1 End of the Log + 0 OK +*/ + +static my_bool translog_scanner_eol(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_scanner_eol"); + DBUG_PRINT("enter", + ("Horizon: " LSN_FMT " Current: (%u, 0x%x+0x%x=0x%x)", + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->page_addr), + (uint) scanner->page_offset, + (uint) (LSN_OFFSET(scanner->page_addr) + scanner->page_offset))); + if (scanner->horizon > 
(scanner->page_addr + + scanner->page_offset)) + { + DBUG_PRINT("info", ("Horizon is not reached")); + DBUG_RETURN(0); + } + if (scanner->fixed_horizon) + { + DBUG_PRINT("info", ("Horizon is fixed and reached")); + DBUG_RETURN(1); + } + scanner->horizon= translog_get_horizon(); + DBUG_PRINT("info", + ("Horizon is re-read, EOL: %d", + scanner->horizon <= (scanner->page_addr + + scanner->page_offset))); + DBUG_RETURN(scanner->horizon <= (scanner->page_addr + + scanner->page_offset)); +} + + +/** + @brief Cheks End of the Page + + @param scanner Information about current chunk during scanning + + @retval 1 End of the Page + @retval 0 OK +*/ + +static my_bool translog_scanner_eop(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_scanner_eop"); + DBUG_RETURN(scanner->page_offset >= TRANSLOG_PAGE_SIZE || + scanner->page[scanner->page_offset] == TRANSLOG_FILLER); +} + + +/** + @brief Checks End of the File (i.e. we are scanning last page, which do not + mean end of this page) + + @param scanner Information about current chunk during scanning + + @retval 1 End of the File + @retval 0 OK +*/ + +static my_bool translog_scanner_eof(TRANSLOG_SCANNER_DATA *scanner) +{ + DBUG_ENTER("translog_scanner_eof"); + DBUG_ASSERT(LSN_FILE_NO(scanner->page_addr) == + LSN_FILE_NO(scanner->last_file_page)); + DBUG_PRINT("enter", ("curr Page: 0x%lx last page: 0x%lx " + "normal EOF: %d", + (ulong) LSN_OFFSET(scanner->page_addr), + (ulong) LSN_OFFSET(scanner->last_file_page), + LSN_OFFSET(scanner->page_addr) == + LSN_OFFSET(scanner->last_file_page))); + /* + TODO: detect damaged file EOF, + TODO: issue warning if damaged file EOF detected + */ + DBUG_RETURN(scanner->page_addr == + scanner->last_file_page); +} + +/* + Move scanner to the next chunk + + SYNOPSIS + translog_get_next_chunk() + scanner Information about current chunk during scanning + + RETURN + 0 OK + 1 Error +*/ + +static my_bool +translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner) +{ + uint16 len; + 
DBUG_ENTER("translog_get_next_chunk"); + + if (translog_scanner_eop(scanner)) + len= TRANSLOG_PAGE_SIZE - scanner->page_offset; + else if ((len= translog_get_total_chunk_length(scanner->page, + scanner->page_offset)) == 0) + DBUG_RETURN(1); + scanner->page_offset+= len; + + if (translog_scanner_eol(scanner)) + { + scanner->page= END_OF_LOG; + scanner->page_offset= 0; + DBUG_RETURN(0); + } + if (translog_scanner_eop(scanner)) + { + /* before reading next page we should unpin current one if it was pinned */ + translog_free_link(scanner->direct_link); + if (translog_scanner_eof(scanner)) + { + DBUG_PRINT("info", ("horizon: " LSN_FMT " pageaddr: " LSN_FMT, + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->page_addr))); + /* if it is log end it have to be caught before */ + DBUG_ASSERT(LSN_FILE_NO(scanner->horizon) > + LSN_FILE_NO(scanner->page_addr)); + scanner->page_addr+= LSN_ONE_FILE; + scanner->page_addr= LSN_REPLACE_OFFSET(scanner->page_addr, + TRANSLOG_PAGE_SIZE); + if (translog_scanner_set_last_page(scanner)) + DBUG_RETURN(1); + } + else + { + scanner->page_addr+= TRANSLOG_PAGE_SIZE; /* offset increased */ + } + + if (translog_scanner_get_page(scanner)) + DBUG_RETURN(1); + + scanner->page_offset= translog_get_first_chunk_offset(scanner->page); + if (translog_scanner_eol(scanner)) + { + scanner->page= END_OF_LOG; + scanner->page_offset= 0; + DBUG_RETURN(0); + } + DBUG_ASSERT(scanner->page[scanner->page_offset] != TRANSLOG_FILLER); + } + DBUG_RETURN(0); +} + + +/** + @brief Get header of variable length record and call hook for it processing + + @param page Pointer to the buffer with page where LSN chunk is + placed + @param page_offset Offset of the first chunk in the page + @param buff Buffer to be filled with header data + @param scanner If present should be moved to the header page if + it differ from LSN page + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval RECHEADER_READ_EOF End of the log reached during 
the read + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +static int +translog_variable_length_header(uchar *page, translog_size_t page_offset, + TRANSLOG_HEADER_BUFFER *buff, + TRANSLOG_SCANNER_DATA *scanner) +{ + struct st_log_record_type_descriptor *desc= (log_record_type_descriptor + + buff->type); + uchar *src= page + page_offset + 1 + 2; + uchar *dst= buff->header; + LSN base_lsn; + uint lsns= desc->compressed_LSN; + uint16 chunk_len; + uint16 length= desc->read_header_len; + uint16 buffer_length= length; + uint16 body_len; + int rc; + TRANSLOG_SCANNER_DATA internal_scanner; + DBUG_ENTER("translog_variable_length_header"); + + buff->record_length= translog_variable_record_1group_decode_len(&src); + chunk_len= uint2korr(src); + DBUG_PRINT("info", ("rec len: %lu chunk len: %u length: %u bufflen: %u", + (ulong) buff->record_length, (uint) chunk_len, + (uint) length, (uint) buffer_length)); + if (chunk_len == 0) + { + uint16 page_rest; + DBUG_PRINT("info", ("1 group")); + src+= 2; + page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); + + base_lsn= buff->lsn; + body_len= MY_MIN(page_rest, buff->record_length); + } + else + { + uint grp_no, curr; + uint header_to_skip; + uint16 page_rest; + + DBUG_PRINT("info", ("multi-group")); + grp_no= buff->groups_no= uint2korr(src + 2); + if (!(buff->groups= + (TRANSLOG_GROUP*) my_malloc(PSI_INSTRUMENT_ME, sizeof(TRANSLOG_GROUP) * grp_no, + MYF(0)))) + DBUG_RETURN(RECHEADER_READ_ERROR); + DBUG_PRINT("info", ("Groups: %u", (uint) grp_no)); + src+= (2 + 2); + page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); + curr= 0; + header_to_skip= (uint) (src - (page + page_offset)); + buff->chunk0_pages= 0; + + for (;;) + { + uint i, read_length= grp_no; + + buff->chunk0_pages++; + if (page_rest < grp_no * (7 + 1)) + read_length= page_rest / (7 + 1); + DBUG_PRINT("info", ("Read chunk0 page#%u read: %u left: %u " + "start from: %u", + buff->chunk0_pages, 
read_length, grp_no, curr)); + for (i= 0; i < read_length; i++, curr++) + { + DBUG_ASSERT(curr < buff->groups_no); + buff->groups[curr].addr= lsn_korr(src + i * (7 + 1)); + buff->groups[curr].num= src[i * (7 + 1) + 7]; + DBUG_PRINT("info", ("group #%u " LSN_FMT " chunks: %u", + curr, + LSN_IN_PARTS(buff->groups[curr].addr), + (uint) buff->groups[curr].num)); + } + grp_no-= read_length; + if (grp_no == 0) + { + if (scanner) + { + buff->chunk0_data_addr= scanner->page_addr; + /* offset increased */ + buff->chunk0_data_addr+= (page_offset + header_to_skip + + read_length * (7 + 1)); + } + else + { + buff->chunk0_data_addr= buff->lsn; + /* offset increased */ + buff->chunk0_data_addr+= (header_to_skip + read_length * (7 + 1)); + } + buff->chunk0_data_len= chunk_len - 2 - read_length * (7 + 1); + DBUG_PRINT("info", ("Data address: " LSN_FMT " len: %u", + LSN_IN_PARTS(buff->chunk0_data_addr), + buff->chunk0_data_len)); + break; + } + if (scanner == NULL) + { + DBUG_PRINT("info", ("use internal scanner for header reading")); + scanner= &internal_scanner; + if (translog_scanner_init(buff->lsn, 1, scanner, 0)) + { + rc= RECHEADER_READ_ERROR; + goto exit_and_free; + } + } + if (translog_get_next_chunk(scanner)) + { + if (scanner == &internal_scanner) + translog_destroy_scanner(scanner); + rc= RECHEADER_READ_ERROR; + goto exit_and_free; + } + if (scanner->page == END_OF_LOG) + { + if (scanner == &internal_scanner) + translog_destroy_scanner(scanner); + rc= RECHEADER_READ_EOF; + goto exit_and_free; + } + page= scanner->page; + page_offset= scanner->page_offset; + src= page + page_offset + header_to_skip; + chunk_len= uint2korr(src - 2 - 2); + DBUG_PRINT("info", ("Chunk len: %u", (uint) chunk_len)); + page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); + } + + if (scanner == NULL) + { + DBUG_PRINT("info", ("use internal scanner")); + scanner= &internal_scanner; + } + else + { + translog_destroy_scanner(scanner); + } + base_lsn= buff->groups[0].addr; + 
translog_scanner_init(base_lsn, 1, scanner, scanner == &internal_scanner); + /* first group chunk is always chunk type 2 */ + page= scanner->page; + page_offset= scanner->page_offset; + src= page + page_offset + 1; + page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page)); + body_len= page_rest; + if (scanner == &internal_scanner) + translog_destroy_scanner(scanner); + } + if (lsns) + { + uchar *start= src; + src= translog_relative_LSN_decode(base_lsn, src, dst, lsns); + lsns*= LSN_STORE_SIZE; + dst+= lsns; + length-= lsns; + buff->record_length+= (buff->compressed_LSN_economy= + (int) (lsns - (src - start))); + DBUG_PRINT("info", ("lsns: %u length: %u economy: %d new length: %lu", + lsns / LSN_STORE_SIZE, (uint) length, + (int) buff->compressed_LSN_economy, + (ulong) buff->record_length)); + body_len-= (uint16) (src - start); + } + else + buff->compressed_LSN_economy= 0; + + DBUG_ASSERT(body_len >= length); + body_len-= length; + memcpy(dst, src, length); + buff->non_header_data_start_offset= (uint16) (src + length - page); + buff->non_header_data_len= body_len; + DBUG_PRINT("info", ("non_header_data_start_offset: %u len: %u buffer: %u", + buff->non_header_data_start_offset, + buff->non_header_data_len, buffer_length)); + DBUG_RETURN(buffer_length); + +exit_and_free: + my_free(buff->groups); + buff->groups_no= 0; /* prevent try to use of buff->groups */ + DBUG_RETURN(rc); +} + + +/** + @brief Read record header from the given buffer + + @param page page content buffer + @param page_offset offset of the chunk in the page + @param buff destination buffer + @param scanner If this is set the scanner will be moved to the + record header page (differ from LSN page in case of + multi-group records) + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +int translog_read_record_header_from_buffer(uchar *page, + uint16 page_offset, 
+ TRANSLOG_HEADER_BUFFER *buff, + TRANSLOG_SCANNER_DATA *scanner) +{ + translog_size_t res; + DBUG_ENTER("translog_read_record_header_from_buffer"); + DBUG_PRINT("info", ("page byte: 0x%x offset: %u", + (uint) page[page_offset], (uint) page_offset)); + DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset])); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + buff->type= (page[page_offset] & TRANSLOG_REC_TYPE); + buff->short_trid= uint2korr(page + page_offset + 1); + DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN " LSN_FMT, + (uint) buff->type, (uint)buff->short_trid, + LSN_IN_PARTS(buff->lsn))); + /* Read required bytes from the header and call hook */ + switch (log_record_type_descriptor[buff->type].rclass) { + case LOGRECTYPE_VARIABLE_LENGTH: + res= translog_variable_length_header(page, page_offset, buff, + scanner); + break; + case LOGRECTYPE_PSEUDOFIXEDLENGTH: + case LOGRECTYPE_FIXEDLENGTH: + res= translog_fixed_length_header(page, page_offset, buff); + break; + default: + DBUG_ASSERT(0); /* we read some junk (got no LSN) */ + res= RECHEADER_READ_ERROR; + } + DBUG_RETURN(res); +} + + +/** + @brief Read record header and some fixed part of a record (the part depend + on record type). 
+ + @param lsn log record serial number (address of the record) + @param buff log record header buffer + + @note Some type of record can be read completely by this call + @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative + LSN can be translated to absolute one), some fields can be added (like + actual header length in the record if the header has variable length) + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff) +{ + TRANSLOG_PAGE_SIZE_BUFF psize_buff; + uchar *page; + translog_size_t res, page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE; + PAGECACHE_BLOCK_LINK *direct_link; + TRANSLOG_ADDRESS addr; + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_read_record_header"); + DBUG_PRINT("enter", ("LSN: " LSN_FMT, LSN_IN_PARTS(lsn))); + DBUG_ASSERT(LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE != 0); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + buff->lsn= lsn; + buff->groups_no= 0; + data.addr= &addr; + data.was_recovered= 0; + addr= lsn; + addr-= page_offset; /* offset decreasing */ + res= (!(page= translog_get_page(&data, psize_buff.buffer, &direct_link))) ? + RECHEADER_READ_ERROR : + translog_read_record_header_from_buffer(page, page_offset, buff, 0); + translog_free_link(direct_link); + DBUG_RETURN(res); +} + + +/** + @brief Read record header and some fixed part of a record (the part depend + on record type). 
+ + @param scan scanner position to read + @param buff log record header buffer + @param move_scanner request to move scanner to the header position + + @note Some type of record can be read completely by this call + @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative + LSN can be translated to absolute one), some fields can be added (like + actual header length in the record if the header has variable length) + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where stored + decoded part of the header +*/ + +int translog_read_record_header_scan(TRANSLOG_SCANNER_DATA *scanner, + TRANSLOG_HEADER_BUFFER *buff, + my_bool move_scanner) +{ + translog_size_t res; + DBUG_ENTER("translog_read_record_header_scan"); + DBUG_PRINT("enter", ("Scanner: Cur: " LSN_FMT " Hrz: " LSN_FMT " " + "Lst: " LSN_FMT " Offset: %u(%x) fixed %d", + LSN_IN_PARTS(scanner->page_addr), + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->last_file_page), + (uint) scanner->page_offset, + (uint) scanner->page_offset, scanner->fixed_horizon)); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + buff->groups_no= 0; + buff->lsn= scanner->page_addr; + buff->lsn+= scanner->page_offset; /* offset increasing */ + res= translog_read_record_header_from_buffer(scanner->page, + scanner->page_offset, + buff, + (move_scanner ? + scanner : 0)); + DBUG_RETURN(res); +} + + +/** + @brief Read record header and some fixed part of the next record (the part + depend on record type). + + @param scanner data for scanning if lsn is NULL scanner data + will be used for continue scanning. + The scanner can be NULL. 
+ + @param buff log record header buffer + + @return Length of header or operation status + @retval RECHEADER_READ_ERROR error + @retval RECHEADER_READ_EOF EOF + @retval # number of bytes in + TRANSLOG_HEADER_BUFFER::header where + stored decoded part of the header +*/ + +int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner, + TRANSLOG_HEADER_BUFFER *buff) +{ + translog_size_t res; + + DBUG_ENTER("translog_read_next_record_header"); + buff->groups_no= 0; /* to be sure that we will free it right */ + DBUG_PRINT("enter", ("scanner: %p", scanner)); + DBUG_PRINT("info", ("Scanner: Cur: " LSN_FMT " Hrz: " LSN_FMT " " + "Lst: " LSN_FMT " Offset: %u(%x) fixed: %d", + LSN_IN_PARTS(scanner->page_addr), + LSN_IN_PARTS(scanner->horizon), + LSN_IN_PARTS(scanner->last_file_page), + (uint) scanner->page_offset, + (uint) scanner->page_offset, scanner->fixed_horizon)); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + do + { + if (translog_get_next_chunk(scanner)) + DBUG_RETURN(RECHEADER_READ_ERROR); + if (scanner->page == END_OF_LOG) + { + DBUG_PRINT("info", ("End of file from the scanner")); + /* Last record was read */ + buff->lsn= LSN_IMPOSSIBLE; + DBUG_RETURN(RECHEADER_READ_EOF); + } + DBUG_PRINT("info", ("Page: " LSN_FMT " offset: %lu byte: %x", + LSN_IN_PARTS(scanner->page_addr), + (ulong) scanner->page_offset, + (uint) scanner->page[scanner->page_offset])); + } while (!translog_is_LSN_chunk(scanner->page[scanner->page_offset]) && + scanner->page[scanner->page_offset] != TRANSLOG_FILLER); + + if (scanner->page[scanner->page_offset] == TRANSLOG_FILLER) + { + DBUG_PRINT("info", ("End of file")); + /* Last record was read */ + buff->lsn= LSN_IMPOSSIBLE; + /* Return 'end of log' marker */ + res= RECHEADER_READ_EOF; + } + else + res= translog_read_record_header_scan(scanner, buff, 0); + DBUG_RETURN(res); +} + + +/* + Moves record data reader to the next chunk and fill the data reader + information about that chunk. 
+
+  SYNOPSIS
+    translog_record_read_next_chunk()
+    data                 data cursor
+
+  NOTE
+    Sets data->eor when the chunk found is the last one of the record.
+    A failure to (re)initialize the scanner is reported as an error
+    instead of being ignored.
+
+  RETURN
+    0  OK
+    1  Error (or end of record)
+*/
+
+static my_bool translog_record_read_next_chunk(TRANSLOG_READER_DATA *data)
+{
+  translog_size_t new_current_offset= data->current_offset + data->chunk_size;
+  uint16 chunk_header_len, chunk_len;
+  uint8 type;
+  DBUG_ENTER("translog_record_read_next_chunk");
+
+  if (data->eor)
+  {
+    DBUG_PRINT("info", ("end of the record flag set"));
+    DBUG_RETURN(1);
+  }
+
+  if (data->header.groups_no &&
+      data->header.groups_no - 1 != data->current_group &&
+      data->header.groups[data->current_group].num == data->current_chunk)
+  {
+    /* Goto next group */
+    data->current_group++;
+    data->current_chunk= 0;
+    DBUG_PRINT("info", ("skip to group: #%u", data->current_group));
+    translog_destroy_scanner(&data->scanner);
+    /*
+      Scanner initialization can fail (translog_init_reader_data() checks
+      the same call), so do not go on to read the page of a scanner that
+      was not set up.
+    */
+    if (translog_scanner_init(data->header.groups[data->current_group].addr,
+                              1, &data->scanner, 1))
+      DBUG_RETURN(1);
+  }
+  else
+  {
+    data->current_chunk++;
+    if (translog_get_next_chunk(&data->scanner))
+      DBUG_RETURN(1);
+    if (data->scanner.page == END_OF_LOG)
+    {
+      /*
+        Actually it should not happen, but we want to quit nicely in case
+        of a truncated log
+      */
+      DBUG_RETURN(1);
+    }
+  }
+  type= data->scanner.page[data->scanner.page_offset] & TRANSLOG_CHUNK_TYPE;
+
+  if (type == TRANSLOG_CHUNK_LSN && data->header.groups_no)
+  {
+    /* All groups are read: what is left is the data kept in chunk 0 */
+    DBUG_PRINT("info",
+               ("Last chunk: data len: %u offset: %u group: %u of %u",
+                data->header.chunk0_data_len, data->scanner.page_offset,
+                data->current_group, data->header.groups_no - 1));
+    DBUG_ASSERT(data->header.groups_no - 1 == data->current_group);
+    DBUG_ASSERT(data->header.lsn ==
+                data->scanner.page_addr + data->scanner.page_offset);
+    translog_destroy_scanner(&data->scanner);
+    /* Same as above: a scanner that failed to initialize must not be read */
+    if (translog_scanner_init(data->header.chunk0_data_addr, 1,
+                              &data->scanner, 1))
+      DBUG_RETURN(1);
+    data->chunk_size= data->header.chunk0_data_len;
+    data->body_offset= data->scanner.page_offset;
+    data->current_offset= new_current_offset;
+    data->eor= 1;
+    DBUG_RETURN(0);
+  }
+
+  if (type == TRANSLOG_CHUNK_LSN || type == TRANSLOG_CHUNK_FIXED)
+  {
+    /* A chunk that starts another record: this record is over */
+    data->eor= 1;
+    DBUG_RETURN(1);                             /* End of record */
+  }
+
+  chunk_header_len=
+    translog_get_chunk_header_length(data->scanner.page +
+                                     data->scanner.page_offset);
+  chunk_len= translog_get_total_chunk_length(data->scanner.page,
+                                             data->scanner.page_offset);
+  data->chunk_size= chunk_len - chunk_header_len;
+  data->body_offset= data->scanner.page_offset + chunk_header_len;
+  data->current_offset= new_current_offset;
+  DBUG_PRINT("info", ("grp: %u chunk: %u body_offset: %u chunk_size: %u "
+                      "current_offset: %lu",
+                      (uint) data->current_group,
+                      (uint) data->current_chunk,
+                      (uint) data->body_offset,
+                      (uint) data->chunk_size, (ulong) data->current_offset));
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Initialize record reader data from LSN
+
+  SYNOPSIS
+    translog_init_reader_data()
+    lsn                  reference to LSN we should start from
+    data                 reader data to initialize
+
+  NOTE
+    Positions the scanner on the record, decodes its header and points the
+    reader at the data that follows the header in the first chunk.
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_init_reader_data(LSN lsn,
+                                         TRANSLOG_READER_DATA *data)
+{
+  int read_header;
+  DBUG_ENTER("translog_init_reader_data");
+  if (translog_scanner_init(lsn, 1, &data->scanner, 1) ||
+      ((read_header=
+        translog_read_record_header_scan(&data->scanner, &data->header, 1))
+       == RECHEADER_READ_ERROR))
+    DBUG_RETURN(1);
+  data->read_header= read_header;
+  data->body_offset= data->header.non_header_data_start_offset;
+  data->chunk_size= data->header.non_header_data_len;
+  /* Record offsets below read_header are served from the decoded header */
+  data->current_offset= data->read_header;
+  data->current_group= 0;
+  data->current_chunk= 0;
+  data->eor= 0;
+  DBUG_PRINT("info", ("read_header: %u "
+                      "body_offset: %u chunk_size: %u current_offset: %lu",
+                      (uint) data->read_header,
+                      (uint) data->body_offset,
+                      (uint) data->chunk_size, (ulong) data->current_offset));
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Destroy reader data object
+
+  Destroys the embedded scanner and frees the record header.
+*/
+
+static void translog_destroy_reader_data(TRANSLOG_READER_DATA *data)
+{
+  translog_destroy_scanner(&data->scanner);
+  translog_free_record_header(&data->header);
+}
+
+
+/*
+  Read a part of the record.
+
+  SYNOPSIS
+    translog_read_record()
+    lsn                  log record serial number (address of the record)
+    offset               Offset from the beginning of the record (as read
+                         by translog_read_record_header).
+    length               Length of the record part which has to be read.
+    buffer               Buffer where to read the record part (has to be at
+                         least 'length' bytes long)
+    data                 Reader state to use and update; if NULL a reader
+                         local to this call is used.
+
+  NOTE
+    The reader data is destroyed before every return that follows a
+    successful initialization.
+
+  RETURN
+    length of data actually read
+*/
+
+translog_size_t translog_read_record(LSN lsn,
+                                     translog_size_t offset,
+                                     translog_size_t length,
+                                     uchar *buffer,
+                                     TRANSLOG_READER_DATA *data)
+{
+  TRANSLOG_READER_DATA local_reader;
+  const translog_size_t wanted= length;
+  const translog_size_t read_end= offset + length;
+  DBUG_ENTER("translog_read_record");
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  if (data == NULL)
+  {
+    /* No caller-owned reader: a fresh one needs a real LSN to start from */
+    DBUG_ASSERT(lsn != LSN_IMPOSSIBLE);
+    data= &local_reader;
+  }
+
+  /*
+    (Re)position the reader when an LSN is given, or when the request starts
+    before the current chunk and is not wholly inside the decoded header.
+  */
+  if ((lsn ||
+       (offset < data->current_offset &&
+        (offset >= data->read_header ||
+         offset + length >= data->read_header))) &&
+      translog_init_reader_data(lsn, data))
+    DBUG_RETURN(0);
+
+  DBUG_PRINT("info", ("Offset: %lu length: %lu "
+                      "Scanner: Cur: " LSN_FMT " Hrz: " LSN_FMT " "
+                      "Lst: " LSN_FMT " Offset: %u(%x) fixed: %d",
+                      (ulong) offset, (ulong) length,
+                      LSN_IN_PARTS(data->scanner.page_addr),
+                      LSN_IN_PARTS(data->scanner.horizon),
+                      LSN_IN_PARTS(data->scanner.last_file_page),
+                      (uint) data->scanner.page_offset,
+                      (uint) data->scanner.page_offset,
+                      data->scanner.fixed_horizon));
+
+  if (offset < data->read_header)
+  {
+    /* The leading part of the request is served from the decoded header */
+    uint16 part= MY_MIN(data->read_header, read_end) - offset;
+    DBUG_PRINT("info",
+               ("enter header offset: %lu length: %lu",
+                (ulong) offset, (ulong) length));
+    memcpy(buffer, data->header.header + offset, part);
+    length-= part;
+    if (length == 0)
+    {
+      translog_destroy_reader_data(data);
+      DBUG_RETURN(wanted);
+    }
+    offset+= part;
+    buffer+= part;
+    DBUG_PRINT("info",
+               ("len: %u offset: %lu curr: %lu length: %lu",
+                part, (ulong) offset, (ulong) data->current_offset,
+                (ulong) length));
+  }
+  /* TODO: find first page which we should read by offset */
+
+  /* Copy from the current chunk, then advance, until done or out of chunks */
+  do
+  {
+    uint chunk_end= data->current_offset + data->chunk_size;
+    DBUG_PRINT("info",
+               ("enter body offset: %lu curr: %lu "
+                "length: %lu page_end: %lu",
+                (ulong) offset, (ulong) data->current_offset, (ulong) length,
+                (ulong) chunk_end));
+    if (offset < chunk_end)
+    {
+      uint part= chunk_end - offset;
+      /* The request may end before the chunk does */
+      set_if_smaller(part, length);
+      DBUG_ASSERT(offset >= data->current_offset);
+      memcpy(buffer,
+             data->scanner.page + data->body_offset +
+             (offset - data->current_offset), part);
+      length-= part;
+      if (length == 0)
+      {
+        translog_destroy_reader_data(data);
+        DBUG_RETURN(wanted);
+      }
+      offset+= part;
+      buffer+= part;
+      DBUG_PRINT("info",
+                 ("len: %u offset: %lu curr: %lu length: %lu",
+                  part, (ulong) offset, (ulong) data->current_offset,
+                  (ulong) length));
+    }
+  } while (!translog_record_read_next_chunk(data));
+
+  /* No further chunk: report how much was actually copied */
+  translog_destroy_reader_data(data);
+  DBUG_RETURN(wanted - length);
+}
+
+
+/*
+  @brief Force skipping to the next buffer
+
+  @todo Do not copy old page content if all page protections are switched off
+  (because we do not need calculate something or change old parts of the page)
+*/
+
+static void translog_force_current_buffer_to_finish()
+{
+  TRANSLOG_ADDRESS new_buff_beginning;
+  uint16 old_buffer_no= log_descriptor.bc.buffer_no;
+  uint16 new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+  struct st_translog_buffer *new_buffer= (log_descriptor.buffers +
+                                          new_buffer_no);
+  struct st_translog_buffer *old_buffer= log_descriptor.bc.buffer;
+  uchar *data= log_descriptor.bc.ptr - log_descriptor.bc.current_page_fill;
+  uint16 left= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill;
+  uint16 UNINIT_VAR(current_page_fill), write_counter, previous_offset;
+  DBUG_ENTER("translog_force_current_buffer_to_finish");
+
+
DBUG_PRINT("enter", ("Buffer #%u %p "
+                       "Buffer addr: " LSN_FMT " "
+                       "Page addr: " LSN_FMT " "
+                       "size: %lu (%lu) Pg: %u left: %u in progress %u",
+                       (uint) old_buffer_no,
+                       old_buffer,
+                       LSN_IN_PARTS(old_buffer->offset),
+                       LSN_FILE_NO(log_descriptor.horizon),
+                       (uint)(LSN_OFFSET(log_descriptor.horizon) -
+                              log_descriptor.bc.current_page_fill),
+                       (ulong) old_buffer->size,
+                       (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
+                                buffer->buffer),
+                       (uint) log_descriptor.bc.current_page_fill,
+                       (uint) left,
+                       (uint) old_buffer->
+                       copy_to_buffer_in_progress));
+  translog_lock_assert_owner(); /* the caller must hold the translog lock */
+  new_buff_beginning= old_buffer->offset;
+  new_buff_beginning+= old_buffer->size; /* increase offset: end of old buffer */
+
+  DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
+  DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
+              LSN_FILE_NO(old_buffer->offset) ||
+              translog_status == TRANSLOG_READONLY );
+  translog_check_cursor(&log_descriptor.bc);
+  DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
+  if (left) /* the current page is only partly filled */
+  {
+    /*
+      TODO: if 'left' is so small that it can't hold any other record
+      then do not move the page
+    */
+    DBUG_PRINT("info", ("left: %u", (uint) left));
+
+    old_buffer->pre_force_close_horizon=
+      old_buffer->offset + old_buffer->size; /* end of real data, before padding */
+    /* decrease offset: the new buffer starts at the beginning of this page */
+    new_buff_beginning-= log_descriptor.bc.current_page_fill;
+    current_page_fill= log_descriptor.bc.current_page_fill;
+
+    memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left); /* pad the page tail */
+    old_buffer->size+= left;
+    DBUG_PRINT("info", ("Finish Page buffer #%u: %p "
+                        "Size: %lu",
+                        (uint) old_buffer->buffer_no,
+                        old_buffer,
+                        (ulong) old_buffer->size));
+    DBUG_ASSERT(old_buffer->buffer_no ==
+                log_descriptor.bc.buffer_no);
+  }
+  else
+  {
+    log_descriptor.bc.current_page_fill= 0;
+  }
+
+  translog_buffer_lock(new_buffer);
+#ifndef DBUG_OFF
+  { /* debug builds only: snapshot fields to check them after the wait */
+    TRANSLOG_ADDRESS offset= new_buffer->offset;
+    TRANSLOG_FILE *file= new_buffer->file;
+    uint8 ver= new_buffer->ver;
+    translog_lock_assert_owner();
+#endif
+  translog_wait_for_buffer_free(new_buffer);
+#ifndef DBUG_OFF
+    /* We keep the handler locked so nobody can start this new buffer */
+    DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
+                (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
+  }
+#endif
+
+  /*
+    write_counter and previous_offset are saved here and restored right
+    after translog_start_buffer(), so they survive the buffer switch.
+  */
+  write_counter= log_descriptor.bc.write_counter;
+  previous_offset= log_descriptor.bc.previous_offset;
+  translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
+  /* Fix buffer offset (which was incorrectly set to horizon) */
+  log_descriptor.bc.buffer->offset= new_buff_beginning;
+  log_descriptor.bc.write_counter= write_counter;
+  log_descriptor.bc.previous_offset= previous_offset;
+  new_buffer->prev_last_lsn= BUFFER_MAX_LSN(old_buffer);
+  DBUG_PRINT("info", ("prev_last_lsn set to " LSN_FMT " buffer: %p",
+                      LSN_IN_PARTS(new_buffer->prev_last_lsn),
+                      new_buffer));
+
+  /*
+    Advances this log pointer, increases writers and lets other threads
+    write to the log while we process the old page content
+  */
+  if (left)
+  {
+    log_descriptor.bc.ptr+= current_page_fill;
+    log_descriptor.bc.buffer->size= log_descriptor.bc.current_page_fill=
+      current_page_fill;
+    new_buffer->overlay= 1; /* the new buffer starts with the old buffer's last page */
+  }
+  else
+    translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+  translog_buffer_increase_writers(new_buffer); /* undone at the end of this function */
+  translog_buffer_unlock(new_buffer);
+
+  /*
+    We have to wait until all writers finish before we start changing the
+    pages by applying protection and copying the page content into the
+    new buffer.
+  */
+#ifndef DBUG_OFF
+  { /* debug builds only: snapshot fields to check them after the wait */
+    TRANSLOG_ADDRESS offset= old_buffer->offset;
+    TRANSLOG_FILE *file= old_buffer->file;
+    uint8 ver= old_buffer->ver;
+#endif
+  /*
+    Only one thread at a time can flush the log (several threads may flush
+    buffers, but the log flush that uses this function is done by a single
+    thread), so no other thread can set is_closing_buffer.
+  */
+  DBUG_ASSERT(!old_buffer->is_closing_buffer);
+  old_buffer->is_closing_buffer= 1; /* Other flushes will wait */
+  DBUG_PRINT("enter", ("Buffer #%u %p is_closing_buffer set",
+                       (uint) old_buffer->buffer_no, old_buffer));
+  translog_wait_for_writers(old_buffer);
+#ifndef DBUG_OFF
+  /* We blocked flushing this buffer so the buffer should not have changed */
+  DBUG_ASSERT(offset == old_buffer->offset && file == old_buffer->file &&
+              ver == old_buffer->ver);
+  }
+#endif
+
+  if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
+  {
+    translog_put_sector_protection(data, &log_descriptor.bc);
+    if (left)
+    {
+      log_descriptor.bc.write_counter++;
+      log_descriptor.bc.previous_offset= current_page_fill;
+    }
+    else
+    {
+      DBUG_PRINT("info", ("drop write_counter"));
+      log_descriptor.bc.write_counter= 0;
+      log_descriptor.bc.previous_offset= 0;
+    }
+  }
+
+  if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
+  {
+    uint32 crc= translog_crc(data + log_descriptor.page_overhead,
+                             TRANSLOG_PAGE_SIZE -
+                             log_descriptor.page_overhead);
+    DBUG_PRINT("info", ("CRC: 0x%x", crc));
+    int4store(data + 3 + 3 + 1, crc); /* NOTE(review): byte 7 of the page; presumably the CRC field of the page header - confirm */
+  }
+  old_buffer->is_closing_buffer= 0;
+  DBUG_PRINT("enter", ("Buffer #%u %p is_closing_buffer cleared",
+                       (uint) old_buffer->buffer_no, old_buffer));
+  mysql_cond_broadcast(&old_buffer->waiting_filling_buffer);
+
+  if (left)
+  {
+    if (log_descriptor.flags &
+        (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+      memcpy(new_buffer->buffer, data, current_page_fill); /* carry the protected partial page over */
+    else
+    {
+      /*
+        This page header does not change if we add more data to the page, so
+        we do not need to copy it and it will not be overwritten later
+      */
+      new_buffer->skipped_data= current_page_fill;
+      TRASH_ALLOC(new_buffer->buffer, current_page_fill);
+      DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+    }
+  }
+  old_buffer->next_buffer_offset= new_buffer->offset;
+  translog_buffer_lock(new_buffer);
+  new_buffer->prev_buffer_offset= old_buffer->offset;
+  translog_buffer_decrease_writers(new_buffer);
+
translog_buffer_unlock(new_buffer); + + DBUG_VOID_RETURN; +} + + +/** + @brief Waits while given lsn will be flushed + + @param lsn log record serial number up to which (inclusive) + the log has to be flushed +*/ + +void translog_flush_wait_for_end(LSN lsn) +{ + DBUG_ENTER("translog_flush_wait_for_end"); + DBUG_PRINT("enter", ("LSN: " LSN_FMT, LSN_IN_PARTS(lsn))); + mysql_mutex_assert_owner(&log_descriptor.log_flush_lock); + while (cmp_translog_addr(log_descriptor.flushed, lsn) < 0) + mysql_cond_wait(&log_descriptor.log_flush_cond, + &log_descriptor.log_flush_lock); + DBUG_VOID_RETURN; +} + + +/** + @brief Sets goal for the next flush pass and waits for this pass end. + + @param lsn log record serial number up to which (inclusive) + the log has to be flushed +*/ + +void translog_flush_set_new_goal_and_wait(TRANSLOG_ADDRESS lsn) +{ + int flush_no= log_descriptor.flush_no; + DBUG_ENTER("translog_flush_set_new_goal_and_wait"); + DBUG_PRINT("enter", ("LSN: " LSN_FMT, LSN_IN_PARTS(lsn))); + mysql_mutex_assert_owner(&log_descriptor.log_flush_lock); + if (cmp_translog_addr(lsn, log_descriptor.next_pass_max_lsn) > 0) + { + log_descriptor.next_pass_max_lsn= lsn; + log_descriptor.max_lsn_requester= pthread_self(); + mysql_cond_broadcast(&log_descriptor.new_goal_cond); + } + while (flush_no == log_descriptor.flush_no) + { + mysql_cond_wait(&log_descriptor.log_flush_cond, + &log_descriptor.log_flush_lock); + } + DBUG_VOID_RETURN; +} + + +/** + @brief sync() range of files (inclusive) and directory (by request) + + @param min min internal file number to flush + @param max max internal file number to flush + @param sync_dir need sync directory + + return Operation status + @retval 0 OK + @retval 1 Error +*/ + +static my_bool translog_sync_files(uint32 min, uint32 max, + my_bool sync_dir) +{ + uint fn; + my_bool rc= 0; + ulonglong flush_interval; + DBUG_ENTER("translog_sync_files"); + DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d", + (ulong) min, (ulong) max, (int) 
sync_dir)); + DBUG_ASSERT(min <= max); + + flush_interval= group_commit_wait; + if (flush_interval) + flush_start= microsecond_interval_timer(); + for (fn= min; fn <= max; fn++) + { + TRANSLOG_FILE *file= get_logfile_by_number(fn); + DBUG_ASSERT(file != NULL); + if (!file->is_sync) + { + if (mysql_file_sync(file->handler.file, MYF(MY_WME))) + { + rc= 1; + translog_stop_writing(); + DBUG_RETURN(rc); + } + translog_syncs++; + file->is_sync= 1; + } + } + + if (sync_dir) + { + if (!(rc= sync_dir(log_descriptor.directory_fd, + MYF(MY_WME | MY_IGNORE_BADFD)))) + translog_syncs++; + } + + DBUG_RETURN(rc); +} + + +/** + check_skipped_lsn + + Check if lsn skipped in redo is ok +*/ + +void check_skipped_lsn(MARIA_HA *info, LSN lsn, my_bool index_file, + pgcache_page_no_t page) +{ + if (lsn <= log_descriptor.horizon) + { + DBUG_PRINT("info", ("Page is up to date, skipping redo")); + } + else + { + /* Give error, but don't flood the log */ + if (skipped_lsn_err_count++ < MAX_LSN_ERRORS && + ! info->s->redo_error_given++) + { + eprint(tracef, "Table %s has wrong LSN: " LSN_FMT " on page: %llu", + (index_file ? info->s->data_file_name.str : + info->s->index_file_name.str), + LSN_IN_PARTS(lsn), (ulonglong) page); + recovery_found_crashed_tables++; + } + } +} + + +/* + @brief Flushes buffers with LSNs in them less or equal address <lsn> + + @param lsn address up to which all LSNs should be flushed, + can be reset to real last LSN address + @parem sent_to_disk returns 'sent to disk' position + @param flush_horizon returns horizon of the flush + + @note About terminology see comment to translog_flush(). 
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+                            TRANSLOG_ADDRESS *sent_to_disk,
+                            TRANSLOG_ADDRESS *flush_horizon)
+{ /*
+     Every path below gives up a lock taken before the call: either
+     translog_unlock() is called, or the current buffer is force-closed and
+     then unlocked with translog_buffer_unlock().
+     NOTE(review): presumably the caller's translog_lock() is what locked
+     that buffer - confirm.
+   */
+  dirty_buffer_mask_t dirty_buffer_mask;
+  uint i;
+  uint8 UNINIT_VAR(last_buffer_no), start_buffer_no;
+  DBUG_ENTER("translog_flush_buffers");
+
+  /*
+    We will recheck the information when we lock the buffers one by
+    one, so we can use an unprotected read here (this is just to
+    speed up buffer processing)
+  */
+  dirty_buffer_mask= log_descriptor.dirty_buffer_mask;
+  DBUG_PRINT("info", ("Dirty buffer mask: %lx current buffer: %u",
+                      (ulong) dirty_buffer_mask,
+                      (uint) log_descriptor.bc.buffer_no));
+  for (i= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+       i != log_descriptor.bc.buffer_no && !(dirty_buffer_mask & (1 << i));
+       i= (i + 1) % TRANSLOG_BUFFERS_NO) {} /* find the first dirty buffer after the current one */
+  start_buffer_no= i;
+
+  DBUG_PRINT("info",
+             ("start from: %u current: %u prev last lsn: " LSN_FMT,
+              (uint) start_buffer_no, (uint) log_descriptor.bc.buffer_no,
+              LSN_IN_PARTS(log_descriptor.bc.buffer->prev_last_lsn)));
+
+  /*
+    If the LSN up to which we have to flush is bigger than the maximum LSN
+    of the previous buffer and at least one LSN was saved in the current
+    buffer (last_lsn != LSN_IMPOSSIBLE) then we have to close the current
+    buffer.
+  */
+  if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+      log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
+  {
+    struct st_translog_buffer *buffer= log_descriptor.bc.buffer; /* remembered: bc.buffer changes in the force-close below */
+    *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+    DBUG_PRINT("info", ("LSN to flush fixed to last lsn: " LSN_FMT,
+                        LSN_IN_PARTS(*lsn)));
+    last_buffer_no= log_descriptor.bc.buffer_no;
+    log_descriptor.is_everything_flushed= 1;
+    translog_force_current_buffer_to_finish();
+    translog_buffer_unlock(buffer);
+  }
+  else
+  {
+    if (log_descriptor.bc.buffer->last_lsn == LSN_IMPOSSIBLE &&
+        log_descriptor.bc.buffer->prev_last_lsn == LSN_IMPOSSIBLE)
+    {
+      DBUG_PRINT("info", ("There is no LSNs yet generated => do nothing"));
+      translog_unlock();
+      DBUG_VOID_RETURN; /* *sent_to_disk and *flush_horizon are left untouched */
+    }
+
+    /* fix lsn if it was horizon */
+    *lsn= log_descriptor.bc.buffer->prev_last_lsn;
+    DBUG_PRINT("info", ("LSN to flush fixed to prev last lsn: " LSN_FMT,
+                        LSN_IN_PARTS(*lsn)));
+    last_buffer_no= ((log_descriptor.bc.buffer_no + TRANSLOG_BUFFERS_NO -1) %
+                     TRANSLOG_BUFFERS_NO); /* the buffer before the current one */
+    translog_unlock();
+  }
+  /* flush buffers */
+  *sent_to_disk= translog_get_sent_to_disk();
+  if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
+  {
+
+    DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
+                        (uint) start_buffer_no, (uint) last_buffer_no));
+    last_buffer_no= (last_buffer_no + 1) % TRANSLOG_BUFFERS_NO; /* now one past the last buffer to visit */
+    i= start_buffer_no;
+    do
+    {
+      struct st_translog_buffer *buffer= log_descriptor.buffers + i;
+      translog_buffer_lock(buffer);
+      DBUG_PRINT("info", ("Check buffer: %p #: %u "
+                          "prev last LSN: " LSN_FMT " "
+                          "last LSN: " LSN_FMT " status: %s",
+                          buffer,
+                          (uint) i,
+                          LSN_IN_PARTS(buffer->prev_last_lsn),
+                          LSN_IN_PARTS(buffer->last_lsn),
+                          (buffer->file ?
+                           "dirty" : "closed")));
+      if (buffer->prev_last_lsn <= *lsn &&
+          buffer->file != NULL)
+      {
+        DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+        *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+                         buffer->pre_force_close_horizon :
+                         buffer->offset + buffer->size);
+        /* pre_force_close_horizon is reset during new buffer start */
+        DBUG_PRINT("info", ("flush_horizon: " LSN_FMT,
+                            LSN_IN_PARTS(*flush_horizon)));
+        DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
+        translog_buffer_flush(buffer);
+      }
+      translog_buffer_unlock(buffer);
+      i= (i + 1) % TRANSLOG_BUFFERS_NO;
+    } while (i != last_buffer_no);
+    *sent_to_disk= translog_get_sent_to_disk();
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+/**
+  @brief Flush the log up to given LSN (included)
+
+  @param  lsn            log record serial number up to which (inclusive)
+                         the log has to be flushed
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+
+  @note
+
+  - Non group commit logic: Commits are made in passes. The thread which
+  started the flush first performs the actual flush; other threads set a new
+  goal (LSN) for the next pass (if it is the maximum) and wait for the pass
+  end, or just wait for the pass end.
+
+  - If hard group commit enabled and rate set to zero:
+   The first thread sends all changed buffers to disk. This is repeated
+   as long as there are new LSNs added. The process can not loop
+   forever because we have limited number of threads and they will wait
+   for the data to be synced.
+   Pseudo code:
+
+    do
+      send changed buffers to disk
+    while new_goal
+    sync
+
+  - If hard group commit switched ON and less than rate microseconds has
+  passed from last sync, then after buffers have been sent to disk
+  wait until rate microseconds has passed since last sync, do sync and return.
+  This ensures that if we call sync infrequently we don't do any waits.
+
+  - If soft group commit enabled everything works as with 'non group commit'
+  but the thread doesn't do any real sync(). If rate is not zero the
+  sync() will be performed by a service thread with the given rate
+  when needed (new LSN appears).
+
+  @note Terminology:
+  'sent to disk' means written to disk but not sync()ed,
+  'flushed' means sent to disk and sync()ed.
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+  struct timespec abstime;
+  ulonglong UNINIT_VAR(flush_interval);
+  ulonglong time_spent;
+  LSN sent_to_disk= LSN_IMPOSSIBLE;
+  TRANSLOG_ADDRESS flush_horizon;
+  my_bool rc= 0;
+  my_bool hgroup_commit_at_start;
+  DBUG_ENTER("translog_flush");
+  DBUG_PRINT("enter", ("Flush up to LSN: " LSN_FMT, LSN_IN_PARTS(lsn)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  mysql_mutex_lock(&log_descriptor.log_flush_lock);
+  DBUG_PRINT("info", ("Everything is flushed up to " LSN_FMT,
+                      LSN_IN_PARTS(log_descriptor.flushed)));
+  if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+  {
+    /* Already flushed far enough: nothing to do */
+    mysql_mutex_unlock(&log_descriptor.log_flush_lock);
+    DBUG_RETURN(0);
+  }
+  if (log_descriptor.flush_in_progress)
+  {
+    /* Another thread is flushing: register our goal and wait for its pass */
+    translog_lock();
+    /* fix lsn if it was horizon */
+    if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+      lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+    translog_unlock();
+    translog_flush_set_new_goal_and_wait(lsn);
+    if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+    {
+      /*
+        translog_flush_wait_for_end() releases log_flush_lock while it is
+        waiting and then acquires it again
+      */
+      translog_flush_wait_for_end(lsn);
+      mysql_mutex_unlock(&log_descriptor.log_flush_lock);
+      DBUG_RETURN(0);
+    }
+    /* We are the requester of the maximum LSN: we run the next pass */
+    log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+  }
+  log_descriptor.flush_in_progress= 1;
+  flush_horizon= log_descriptor.previous_flush_horizon;
+  DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: " LSN_FMT,
+                      LSN_IN_PARTS(flush_horizon)));
+  mysql_mutex_unlock(&log_descriptor.log_flush_lock);
+
+  /* flush_interval is set, and later read, only with hard group commit */
+  hgroup_commit_at_start= hard_group_commit;
+  if (hgroup_commit_at_start)
+    flush_interval= group_commit_wait;
+
+  translog_lock();
+  if (log_descriptor.is_everything_flushed)
+  {
+    DBUG_PRINT("info", ("everything is flushed"));
+    translog_unlock();
+    mysql_mutex_lock(&log_descriptor.log_flush_lock);
+    goto out;
+  }
+
+  for (;;)
+  {
+    /* Following function flushes buffers and makes translog_unlock() */
+    translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+    if (!hgroup_commit_at_start)
+      break;  /* flush pass is ended */
+
+retest:
+    /*
+      We do not check time here because mysql_mutex_lock rarely takes
+      a lot of time so we can sacrifice a bit precision to performance
+      (taking into account that microsecond_interval_timer() might be
+      expensive call).
+    */
+    if (flush_interval == 0)
+      break;  /* flush pass is ended */
+
+    mysql_mutex_lock(&log_descriptor.log_flush_lock);
+    if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+    {
+      if (flush_interval == 0 ||
+          (time_spent= (microsecond_interval_timer() - flush_start)) >=
+          flush_interval)
+      {
+        mysql_mutex_unlock(&log_descriptor.log_flush_lock);
+        break;
+      }
+      DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+                          flush_interval - time_spent,
+                          flush_interval, time_spent));
+      /*
+        Wait for the rest of the interval or for the next goal.
+        flush_interval and time_spent are in microseconds while
+        set_timespec_nsec() takes nanoseconds, so the remainder is converted;
+        without the conversion the wait ends 1000 times too early and this
+        loop spins through 'retest'.
+      */
+      set_timespec_nsec(abstime, (flush_interval - time_spent) * 1000);
+      mysql_cond_timedwait(&log_descriptor.new_goal_cond,
+                           &log_descriptor.log_flush_lock,
+                           &abstime);
+      mysql_mutex_unlock(&log_descriptor.log_flush_lock);
+      DBUG_PRINT("info", ("retest conditions"));
+      goto retest;
+    }
+
+    /* take next goal */
+    lsn= log_descriptor.next_pass_max_lsn;
+    log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+    /* prevent other thread from continue */
+    log_descriptor.max_lsn_requester= pthread_self();
+    DBUG_PRINT("info", ("flush took next goal: " LSN_FMT,
+                        LSN_IN_PARTS(lsn)));
+    mysql_mutex_unlock(&log_descriptor.log_flush_lock);
+
+    /* next flush pass */
+    DBUG_PRINT("info", ("next flush pass"));
+    translog_lock();
+  }
+
+  /*
+    sync() files from previous flush till current one. The directory is
+    synced too when sync_log_dir asks for it and the flush horizon moved to
+    another file or another page since the previous flush.
+  */
+  if (!soft_sync || hgroup_commit_at_start)
+  {
+    if ((rc=
+         translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+                             LSN_FILE_NO(lsn),
+                             sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+                             (LSN_FILE_NO(log_descriptor.
+                                          previous_flush_horizon) !=
+                              LSN_FILE_NO(flush_horizon) ||
+                              (LSN_OFFSET(log_descriptor.
+                                          previous_flush_horizon) /
+                               TRANSLOG_PAGE_SIZE) !=
+                              (LSN_OFFSET(flush_horizon) /
+                               TRANSLOG_PAGE_SIZE)))))
+    {
+      /* Sync failed: do not advance log_descriptor.flushed */
+      sent_to_disk= LSN_IMPOSSIBLE;
+      mysql_mutex_lock(&log_descriptor.log_flush_lock);
+      goto out;
+    }
+    /* keep values for soft sync() and forced sync() actual */
+    {
+      uint32 fileno= LSN_FILE_NO(lsn);
+      soft_sync_min= fileno;
+      soft_sync_max= fileno;
+    }
+  }
+  else
+  {
+    /* Soft sync: only note that a sync is needed up to this file */
+    soft_sync_max= LSN_FILE_NO(lsn);
+    soft_need_sync= 1;
+  }
+
+  DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+  mysql_mutex_lock(&log_descriptor.log_flush_lock);
+  log_descriptor.previous_flush_horizon= flush_horizon;
+out:
+  /* log_flush_lock is held on every path that reaches this label */
+  if (sent_to_disk != LSN_IMPOSSIBLE)
+    log_descriptor.flushed= sent_to_disk;
+  log_descriptor.flush_in_progress= 0;
+  log_descriptor.flush_no++;
+  DBUG_PRINT("info", ("flush_in_progress is dropped"));
+  mysql_mutex_unlock(&log_descriptor.log_flush_lock);
+  mysql_cond_broadcast(&log_descriptor.log_flush_cond);
+  DBUG_RETURN(rc);
+}
+
+
+/**
+  @brief Gives a 2-byte-id to MARIA_SHARE and logs this fact
+
+  If a MARIA_SHARE does not yet have a 2-byte-id (unique over all currently
+  open MARIA_SHAREs), give it one and record this assignment in the log
+  (LOGREC_FILE_ID log record).
+
+  @param  tbl_info       table
+  @param  trn            calling transaction
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+
+  @note Can be called even if share already has an id (then will do nothing)
+*/
+
+int translog_assign_id_to_share(MARIA_HA *tbl_info, TRN *trn)
+{
+  uint16 id;
+  MARIA_SHARE *share= tbl_info->s;
+  /*
+    If you give an id to a non-BLOCK_RECORD table, you also need to release
+    this id somewhere. Then you can change the assertion.
+ */ + DBUG_ASSERT(share->data_file_type == BLOCK_RECORD); + /* re-check under mutex to avoid having 2 ids for the same share */ + mysql_mutex_lock(&share->intern_lock); + if (unlikely(share->id == 0)) + { + LSN lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uchar log_data[FILEID_STORE_SIZE]; + /* Inspired by set_short_trid() of trnman.c */ + uint i= share->kfile.file % SHARE_ID_MAX + 1; + id= 0; + do + { + for ( ; i <= SHARE_ID_MAX ; i++) /* the range is [1..SHARE_ID_MAX] */ + { + void *tmp= NULL; + if (id_to_share[i] == NULL && + my_atomic_casptr((void **)&id_to_share[i], &tmp, share)) + { + id= (uint16) i; + break; + } + } + i= 1; /* scan the whole array */ + } while (id == 0); + DBUG_PRINT("info", ("id_to_share: %p -> %u", share, id)); + fileid_store(log_data, id); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + /* + open_file_name is an unresolved name (symlinks are not resolved, datadir + is not realpath-ed, etc) which is good: the log can be moved to another + directory and continue working. + */ + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= + (uchar *)share->open_file_name.str; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= + share->open_file_name.length + 1; + /* + We can't unlock share->intern_lock before the log entry is written to + ensure no one uses the id before it's logged. + */ + if (unlikely(translog_write_record(&lsn, LOGREC_FILE_ID, trn, tbl_info, + (translog_size_t) + (sizeof(log_data) + + log_array[TRANSLOG_INTERNAL_PARTS + + 1].length), + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL))) + { + mysql_mutex_unlock(&share->intern_lock); + return 1; + } + /* + Now when translog record is done, we can set share->id. + If we set it before, then translog_write_record may pick up the id + before it's written to the log. 
+ */ + share->id= id; + share->state.logrec_file_id= lsn; + } + mysql_mutex_unlock(&share->intern_lock); + return 0; +} + + +/** + @brief Recycles a MARIA_SHARE's short id. + + @param share table + + @note Must be called only if share has an id (i.e. id != 0) +*/ + +void translog_deassign_id_from_share(MARIA_SHARE *share) +{ + DBUG_PRINT("info", ("id_to_share: %p id %u -> 0", + share, share->id)); + /* + We don't need any mutex as we are called only when closing the last + instance of the table or at the end of REPAIR: no writes can be + happening. But a Checkpoint may be reading share->id, so we require this + mutex: + */ + mysql_mutex_assert_owner(&share->intern_lock); + my_atomic_storeptr((void **)&id_to_share[share->id], 0); + share->id= 0; + /* useless but safety: */ + share->lsn_of_file_id= LSN_IMPOSSIBLE; +} + + +void translog_assign_id_to_share_from_recovery(MARIA_SHARE *share, + uint16 id) +{ + DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded); + DBUG_ASSERT(share->data_file_type == BLOCK_RECORD); + DBUG_ASSERT(share->id == 0); + DBUG_ASSERT(id_to_share[id] == NULL); + id_to_share[share->id= id]= share; +} + + +/** + @brief check if such log file exists + + @param file_no number of the file to test + + @retval 0 no such file + @retval 1 there is file with such number +*/ + +my_bool translog_is_file(uint file_no) +{ + MY_STAT stat_buff; + char path[FN_REFLEN]; + return (MY_TEST(mysql_file_stat(key_file_translog, + translog_filename_by_fileno(file_no, path), + &stat_buff, MYF(0)))); +} + + +/** + @brief returns minimum log file number + + @param horizon the end of the log + @param is_protected true if it is under purge_log protection + + @retval minimum file number + @retval 0 no files found +*/ + +static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected) +{ + uint min_file= 1, max_file; + DBUG_ENTER("translog_first_file"); + if (!is_protected) + mysql_mutex_lock(&log_descriptor.purger_lock); + if (log_descriptor.min_file_number) + 
{ + min_file= log_descriptor.min_file_number; + if (translog_is_file(log_descriptor.min_file_number)) + { + DBUG_PRINT("info", ("cached %lu", + (ulong) log_descriptor.min_file_number)); + if (!is_protected) + mysql_mutex_unlock(&log_descriptor.purger_lock); + DBUG_RETURN(log_descriptor.min_file_number); + } + } + + max_file= LSN_FILE_NO(horizon); + if (!translog_is_file(max_file)) + { + if (!is_protected) + mysql_mutex_unlock(&log_descriptor.purger_lock); + DBUG_RETURN(max_file); /* For compatibility */ + } + + /* binary search for last file */ + while (min_file < max_file) + { + uint test= (min_file + max_file) / 2; + DBUG_PRINT("info", ("min_file: %u test: %u max_file: %u", + min_file, test, max_file)); + if (translog_is_file(test)) + max_file= test; + else + min_file= test+1; + } + log_descriptor.min_file_number= max_file; + if (!is_protected) + mysql_mutex_unlock(&log_descriptor.purger_lock); + DBUG_PRINT("info", ("first file :%lu", (ulong) max_file)); + DBUG_ASSERT(max_file >= 1); + DBUG_RETURN(max_file); +} + + +/** + @brief returns the most close LSN higher the given chunk address + + @param addr the chunk address to start from + @param horizon the horizon if it is known or LSN_IMPOSSIBLE + + @retval LSN_ERROR Error + @retval LSN_IMPOSSIBLE no LSNs after the address + @retval # LSN of the most close LSN higher the given chunk address +*/ + +LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon) +{ + TRANSLOG_SCANNER_DATA scanner; + LSN result; + DBUG_ENTER("translog_next_LSN"); + + if (horizon == LSN_IMPOSSIBLE) + horizon= translog_get_horizon(); + + if (addr == horizon) + DBUG_RETURN(LSN_IMPOSSIBLE); + + translog_scanner_init(addr, 0, &scanner, 1); + /* + addr can point not to a chunk beginning but page end so next + page beginning. + */ + if (addr % TRANSLOG_PAGE_SIZE == 0) + { + /* + We are emulating the page end which cased such horizon value to + trigger translog_scanner_eop(). 
+ + We can't just increase addr on page header overhead because it + can be file end so we allow translog_get_next_chunk() to skip + to the next page in correct way + */ + scanner.page_addr-= TRANSLOG_PAGE_SIZE; + scanner.page_offset= TRANSLOG_PAGE_SIZE; +#ifndef DBUG_OFF + scanner.page= NULL; /* prevent using incorrect page content */ +#endif + } + /* addr can point not to a chunk beginning but to a page end */ + if (translog_scanner_eop(&scanner)) + { + if (translog_get_next_chunk(&scanner)) + { + result= LSN_ERROR; + goto out; + } + if (scanner.page == END_OF_LOG) + { + result= LSN_IMPOSSIBLE; + goto out; + } + } + + while (!translog_is_LSN_chunk(scanner.page[scanner.page_offset]) && + scanner.page[scanner.page_offset] != TRANSLOG_FILLER) + { + if (translog_get_next_chunk(&scanner)) + { + result= LSN_ERROR; + goto out; + } + if (scanner.page == END_OF_LOG) + { + result= LSN_IMPOSSIBLE; + goto out; + } + } + + if (scanner.page[scanner.page_offset] == TRANSLOG_FILLER) + result= LSN_IMPOSSIBLE; /* reached page filler */ + else + result= scanner.page_addr + scanner.page_offset; +out: + translog_destroy_scanner(&scanner); + DBUG_RETURN(result); +} + + +/** + @brief returns the LSN of the first record starting in this log + + @retval LSN_ERROR Error + @retval LSN_IMPOSSIBLE no log or the log is empty + @retval # LSN of the first record +*/ + +LSN translog_first_lsn_in_log() +{ + TRANSLOG_ADDRESS addr, horizon= translog_get_horizon(); + TRANSLOG_VALIDATOR_DATA data; + uint file; + uint16 chunk_offset; + uchar *page; + DBUG_ENTER("translog_first_lsn_in_log"); + DBUG_PRINT("info", ("Horizon: " LSN_FMT, LSN_IN_PARTS(horizon))); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + if (!(file= translog_first_file(horizon, 0))) + { + /* log has no records yet */ + DBUG_RETURN(LSN_IMPOSSIBLE); + } + + addr= MAKE_LSN(file, TRANSLOG_PAGE_SIZE); /* the first page of the file */ + data.addr= &addr; + { + TRANSLOG_PAGE_SIZE_BUFF psize_buff; 
+ if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL || + (chunk_offset= translog_get_first_chunk_offset(page)) == 0) + DBUG_RETURN(LSN_ERROR); + } + addr+= chunk_offset; + + DBUG_RETURN(translog_next_LSN(addr, horizon)); +} + + +/** + @brief Returns theoretical first LSN if first log is present + + @retval LSN_ERROR Error + @retval LSN_IMPOSSIBLE no log + @retval # LSN of the first record +*/ + +LSN translog_first_theoretical_lsn() +{ + TRANSLOG_ADDRESS addr= translog_get_horizon(); + TRANSLOG_PAGE_SIZE_BUFF psize_buff; + uchar *page; + TRANSLOG_VALIDATOR_DATA data; + DBUG_ENTER("translog_first_theoretical_lsn"); + DBUG_PRINT("info", ("Horizon: " LSN_FMT, LSN_IN_PARTS(addr))); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + if (!translog_is_file(1)) + DBUG_RETURN(LSN_IMPOSSIBLE); + if (addr == MAKE_LSN(1, TRANSLOG_PAGE_SIZE)) + { + /* log has no records yet */ + DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE + + log_descriptor.page_overhead)); + } + + addr= MAKE_LSN(1, TRANSLOG_PAGE_SIZE); /* the first page of the file */ + data.addr= &addr; + if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL) + DBUG_RETURN(LSN_ERROR); + + DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE + + page_overhead[page[TRANSLOG_PAGE_FLAGS]])); +} + + +/** + @brief Checks given low water mark and purge files if it is need + + @param low the last (minimum) address which is need + + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_purge(TRANSLOG_ADDRESS low) +{ + uint32 last_need_file= LSN_FILE_NO(low); + uint32 min_unsync; + int soft; + TRANSLOG_ADDRESS horizon= translog_get_horizon(); + int rc= 0; + DBUG_ENTER("translog_purge"); + DBUG_PRINT("enter", ("low: " LSN_FMT, LSN_IN_PARTS(low))); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + soft= soft_sync; + min_unsync= soft_sync_min; + DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync)); + if (soft && min_unsync 
< last_need_file) + { + last_need_file= min_unsync; + DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file)); + } + + mysql_mutex_lock(&log_descriptor.purger_lock); + DBUG_PRINT("info", ("last_lsn_checked file: %lu:", + (ulong) log_descriptor.last_lsn_checked)); + if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file) + { + uint32 i; + uint32 min_file= translog_first_file(horizon, 1); + DBUG_ASSERT(min_file != 0); /* log is already started */ + DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file)); + for(i= min_file; i < last_need_file && rc == 0; i++) + { + LSN lsn= translog_get_file_max_lsn_stored(i); + if (lsn == LSN_IMPOSSIBLE) + break; /* files are still in writing */ + if (lsn == LSN_ERROR) + { + rc= 1; + break; + } + if (cmp_translog_addr(lsn, low) >= 0) + break; + + DBUG_PRINT("info", ("purge file %lu", (ulong) i)); + + /* remove file descriptor from the cache */ + /* + log_descriptor.min_file can be changed only here during execution + and the function is serialized, so we can access it without problems + */ + if (i >= log_descriptor.min_file) + { + TRANSLOG_FILE *file; + mysql_rwlock_wrlock(&log_descriptor.open_files_lock); + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + DBUG_ASSERT(log_descriptor.min_file == i); + file= *((TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files)); + DBUG_PRINT("info", ("Files : %d", log_descriptor.open_files.elements)); + DBUG_ASSERT(i == file->number); + log_descriptor.min_file++; + DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 == + log_descriptor.open_files.elements); + mysql_rwlock_unlock(&log_descriptor.open_files_lock); + translog_close_log_file(file); + } + if (log_purge_type == TRANSLOG_PURGE_IMMIDIATE && ! 
log_purge_disabled) + { + char path[FN_REFLEN], *file_name; + file_name= translog_filename_by_fileno(i, path); + rc= MY_TEST(mysql_file_delete(key_file_translog, + file_name, MYF(MY_WME))); + } + } + if (unlikely(rc == 1)) + log_descriptor.min_need_file= 0; /* impossible value */ + else + log_descriptor.min_need_file= i; + } + + mysql_mutex_unlock(&log_descriptor.purger_lock); + DBUG_RETURN(rc); +} + + +/** + @brief Purges files by stored min need file in case of + "one demand" purge type + + @note This function do real work only if it is "one demand" purge type + and translog_purge() was called at least once and last time without + errors + + @retval 0 OK + @retval 1 Error +*/ + +my_bool translog_purge_at_flush() +{ + uint32 i, min_file; + int rc= 0; + DBUG_ENTER("translog_purge_at_flush"); + DBUG_ASSERT(translog_status == TRANSLOG_OK || + translog_status == TRANSLOG_READONLY); + + if (unlikely(translog_status == TRANSLOG_READONLY)) + { + DBUG_PRINT("info", ("The log is read only => exit")); + DBUG_RETURN(0); + } + + if (log_purge_type != TRANSLOG_PURGE_ONDEMAND) + { + DBUG_PRINT("info", ("It is not \"at_flush\" => exit")); + DBUG_RETURN(0); + } + + mysql_mutex_lock(&log_descriptor.purger_lock); + + if (unlikely(log_descriptor.min_need_file == 0 || log_purge_disabled)) + { + DBUG_PRINT("info", ("No info about min need file => exit")); + mysql_mutex_unlock(&log_descriptor.purger_lock); + DBUG_RETURN(0); + } + + min_file= translog_first_file(translog_get_horizon(), 1); + DBUG_ASSERT(min_file != 0); /* log is already started */ + for(i= min_file; i < log_descriptor.min_need_file ; i++) + { + char path[FN_REFLEN], *file_name; + DBUG_PRINT("info", ("purge file %lu\n", (ulong) i)); + file_name= translog_filename_by_fileno(i, path); + rc|= MY_TEST(mysql_file_delete(key_file_translog, + file_name, MYF(MY_WME))); + DBUG_ASSERT(rc == 0); + } + + mysql_mutex_unlock(&log_descriptor.purger_lock); + DBUG_RETURN(rc); +} + + +/** + @brief Gets min file number + + @param horizon 
the end of the log + + @retval minimum file number + @retval 0 no files found +*/ + +uint32 translog_get_first_file(TRANSLOG_ADDRESS horizon) +{ + return translog_first_file(horizon, 0); +} + + +/** + @brief Gets min file number which is needed + + @retval minimum file number + @retval 0 unknown +*/ + +uint32 translog_get_first_needed_file() +{ + uint32 file_no; + mysql_mutex_lock(&log_descriptor.purger_lock); + file_no= log_descriptor.min_need_file; + mysql_mutex_unlock(&log_descriptor.purger_lock); + return file_no; +} + + +/** + @brief Gets transaction log file size + + @return transaction log file size +*/ + +uint32 translog_get_file_size() +{ + uint32 res; + translog_lock(); + res= log_descriptor.log_file_max_size; + translog_unlock(); + return (res); +} + + +/** + @brief Sets transaction log file size + + @return Returns actually set transaction log size +*/ + +void translog_set_file_size(uint32 size) +{ + struct st_translog_buffer *old_buffer= NULL; + DBUG_ENTER("translog_set_file_size"); + translog_lock(); + DBUG_PRINT("enter", ("Size: %lu", (ulong) size)); + DBUG_ASSERT(size % TRANSLOG_PAGE_SIZE == 0); + DBUG_ASSERT(size >= TRANSLOG_MIN_FILE_SIZE); + log_descriptor.log_file_max_size= size; + /* if current file longer then finish it*/ + if (LSN_OFFSET(log_descriptor.horizon) >= log_descriptor.log_file_max_size) + { + old_buffer= log_descriptor.bc.buffer; + translog_buffer_next(&log_descriptor.horizon, &log_descriptor.bc, 1); + translog_buffer_unlock(old_buffer); + } + translog_unlock(); + if (old_buffer) + { + translog_buffer_lock(old_buffer); + translog_buffer_flush(old_buffer); + translog_buffer_unlock(old_buffer); + } + DBUG_VOID_RETURN; +} + + +/** + Write debug information to log if we EXTRA_DEBUG is enabled +*/ + +my_bool translog_log_debug_info(TRN *trn __attribute__((unused)), + enum translog_debug_info_type type + __attribute__((unused)), + uchar *info __attribute__((unused)), + size_t length __attribute__((unused))) +{ +#ifdef EXTRA_DEBUG + 
LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uchar debug_type; + LSN lsn; + + if (!trn) + { + /* + We can't log the current transaction because we don't have + an active transaction. Use a temporary transaction object instead + */ + trn= &dummy_transaction_object; + } + debug_type= (uchar) type; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= &debug_type; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= 1; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= info; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length; + return translog_write_record(&lsn, LOGREC_DEBUG_INFO, + trn, NULL, + (translog_size_t) (1+ length), + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL); +#else + return 0; +#endif +} + + + +/** + Sets soft sync mode + + @param mode TRUE if we need switch soft sync on else off +*/ + +void translog_soft_sync(my_bool mode) +{ + soft_sync= mode; +} + + +/** + Sets hard group commit + + @param mode TRUE if we need switch hard group commit on else off +*/ + +void translog_hard_group_commit(my_bool mode) +{ + hard_group_commit= mode; +} + + +/** + @brief forced log sync (used when we are switching modes) +*/ + +void translog_sync() +{ + DBUG_ENTER("ma_translog_sync"); + + /* The following is only true if initalization of translog succeded */ + if (log_descriptor.open_files.elements != 0) + { + uint32 max= get_current_logfile()->number; + uint32 min; + + min= soft_sync_min; + if (!min) + min= max; + + translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS); + } + DBUG_VOID_RETURN; +} + +/** + @brief set rate for group commit + + @param interval interval to set. 
+ + @note We use this function with additional variable because have to + restart service thread with new value which we can't make inside changing + variable routine (update_maria_group_commit_interval) +*/ + +void translog_set_group_commit_interval(uint32 interval) +{ + DBUG_ENTER("translog_set_group_commit_interval"); + group_commit_wait= interval; + DBUG_PRINT("info", ("wait: %llu", + (ulonglong)group_commit_wait)); + DBUG_VOID_RETURN; +} + + +/** + @brief syncing service thread +*/ + +static pthread_handler_t +ma_soft_sync_background( void *arg __attribute__((unused))) +{ + + my_thread_init(); + { + DBUG_ENTER("ma_soft_sync_background"); + for(;;) + { + ulonglong prev_loop= microsecond_interval_timer(); + ulonglong time, sleep; + uint32 min, max, sync_request; + min= soft_sync_min; + max= soft_sync_max; + sync_request= soft_need_sync; + soft_sync_min= max; + soft_need_sync= 0; + + sleep= group_commit_wait; + if (sync_request) + translog_sync_files(min, max, FALSE); + time= microsecond_interval_timer() - prev_loop; + if (time > sleep) + sleep= 0; + else + sleep-= time; + if (my_service_thread_sleep(&soft_sync_control, sleep)) + break; + } + my_thread_end(); + DBUG_RETURN(0); + } +} + + +/** + @brief Starts syncing thread +*/ + +int translog_soft_sync_start(void) +{ + int res= 0; + uint32 min, max; + DBUG_ENTER("translog_soft_sync_start"); + + /* check and init variables */ + min= soft_sync_min; + max= soft_sync_max; + if (!max) + soft_sync_max= max= get_current_logfile()->number; + if (!min) + soft_sync_min= max; + soft_need_sync= 1; + + if (!(res= ma_service_thread_control_init(&soft_sync_control))) + if ((res= mysql_thread_create(key_thread_soft_sync, + &soft_sync_control.thread, NULL, + ma_soft_sync_background, NULL))) + soft_sync_control.killed= TRUE; + DBUG_RETURN(res); +} + + +/** + @brief Stops syncing thread +*/ + +void translog_soft_sync_end(void) +{ + DBUG_ENTER("translog_soft_sync_end"); + if (soft_sync_control.inited) + { + 
ma_service_thread_control_end(&soft_sync_control); + } + DBUG_VOID_RETURN; +} + + +/** + @brief Dump information about file header page. +*/ + +static void dump_header_page(uchar *buff) +{ + LOGHANDLER_FILE_INFO desc; + char strbuff[21]; + struct tm tmp_tm; + time_t header_time; + + translog_interpret_file_header(&desc, buff); + header_time= desc.timestamp/1000000ULL; + localtime_r(&header_time, &tmp_tm); + + printf(" This can be header page:\n" + " Timestamp: %04d.%02d.%02d %02d.%02d.%02d (%s)\n" + " Aria log version: %lu\n" + " Server version: %lu\n" + " Server id %lu\n" + " Page size %lu\n", + tmp_tm.tm_year+1900, tmp_tm.tm_mon+1, tmp_tm.tm_mday, + tmp_tm.tm_hour, tmp_tm.tm_min, tmp_tm.tm_sec, + llstr(desc.timestamp, strbuff), + desc.maria_version, + desc.mysql_version, + desc.server_id, + desc.page_size); + if (desc.page_size != TRANSLOG_PAGE_SIZE) + printf(" WARNING: page size is not equal compiled in one %lu!!!\n", + (ulong) TRANSLOG_PAGE_SIZE); + printf(" File number %lu\n" + " Max lsn: " LSN_FMT "\n", + desc.file_number, + LSN_IN_PARTS(desc.max_lsn)); +} + +static const char *record_class_string[]= +{ + "LOGRECTYPE_NOT_ALLOWED", + "LOGRECTYPE_VARIABLE_LENGTH", + "LOGRECTYPE_PSEUDOFIXEDLENGTH", + "LOGRECTYPE_FIXEDLENGTH" +}; + + +/** + @brief dump information about transaction log chunk + + @param buffer reference to the whole page + @param ptr pointer to the chunk + + @reval # reference to the next chunk + @retval NULL can't interpret data +*/ + +static uchar *dump_chunk(uchar *buffer, uchar *ptr) +{ + uint length; + if (*ptr == TRANSLOG_FILLER) + { + printf(" Filler till the page end\n"); + for (; ptr < buffer + TRANSLOG_PAGE_SIZE; ptr++) + { + if (*ptr != TRANSLOG_FILLER) + { + printf(" WARNING: non filler character met before page end " + "(page + 0x%04x: 0x%02x) (stop interpretation)!!!", + (uint) (ptr - buffer), (uint) ptr[0]); + return NULL; + } + } + return ptr; + } + if (*ptr == 0 || *ptr == 0xFF) + { + printf(" WARNING: chunk can't start from 0x0 " 
+ "(stop interpretation)!!!\n"); + return NULL; + } + switch (ptr[0] & TRANSLOG_CHUNK_TYPE) { + case TRANSLOG_CHUNK_LSN: + printf(" LSN chunk type 0 (variable length)\n"); + if (likely((ptr[0] & TRANSLOG_REC_TYPE) != TRANSLOG_CHUNK_0_CONT)) + { + printf(" Record type %u: %s record class %s compressed LSNs: %u\n", + ptr[0] & TRANSLOG_REC_TYPE, + (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ? + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name : + "NULL"), + record_class_string[log_record_type_descriptor[ptr[0] & + TRANSLOG_REC_TYPE]. + rclass], + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE]. + compressed_LSN); + if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass != + LOGRECTYPE_VARIABLE_LENGTH) + { + printf(" WARNING: this record class here can't be used " + "(stop interpretation)!!!\n"); + break; + } + } + else + printf(" Continuation of previous chunk 0 header \n"); + printf(" Short transaction id: %u\n", (uint) uint2korr(ptr + 1)); + { + uchar *hdr_ptr= ptr + 1 + 2; /* chunk type and short trid */ + uint16 chunk_len; + printf (" Record length: %lu\n", + (ulong) translog_variable_record_1group_decode_len(&hdr_ptr)); + chunk_len= uint2korr(hdr_ptr); + if (chunk_len == 0) + printf (" It is 1 group record (chunk length == 0)\n"); + else + { + uint16 groups, i; + + printf (" Chunk length %u\n", (uint) chunk_len); + groups= uint2korr(hdr_ptr + 2); + hdr_ptr+= 4; + printf (" Number of groups left to the end %u:\n", (uint) groups); + for(i= 0; + i < groups && hdr_ptr < buffer + TRANSLOG_PAGE_SIZE; + i++, hdr_ptr+= LSN_STORE_SIZE + 1) + { + TRANSLOG_ADDRESS gpr_addr= lsn_korr(hdr_ptr); + uint pages= hdr_ptr[LSN_STORE_SIZE]; + printf (" Group +#%u: " LSN_FMT " pages: %u\n", + (uint) i, LSN_IN_PARTS(gpr_addr), pages); + } + } + } + break; + case TRANSLOG_CHUNK_FIXED: + printf(" LSN chunk type 1 (fixed size)\n"); + printf(" Record type %u: %s record class %s compressed LSNs: %u\n", + ptr[0] & TRANSLOG_REC_TYPE, + 
(log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ? + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name : + "NULL"), + record_class_string[log_record_type_descriptor[ptr[0] & + TRANSLOG_REC_TYPE]. + rclass], + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE]. + compressed_LSN); + if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass != + LOGRECTYPE_PSEUDOFIXEDLENGTH && + log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass != + LOGRECTYPE_FIXEDLENGTH) + { + printf(" WARNING: this record class here can't be used " + "(stop interpretation)!!!\n"); + } + printf(" Short transaction id: %u\n", (uint) uint2korr(ptr + 1)); + break; + case TRANSLOG_CHUNK_NOHDR: + printf(" No header chunk type 2(till the end of the page)\n"); + if (ptr[0] & TRANSLOG_REC_TYPE) + { + printf(" WARNING: chunk header content record type: 0x%02x " + "(dtop interpretation)!!!", + (uint) ptr[0]); + return NULL; + } + break; + case TRANSLOG_CHUNK_LNGTH: + printf(" Chunk with length type 3\n"); + if (ptr[0] & TRANSLOG_REC_TYPE) + { + printf(" WARNING: chunk header content record type: 0x%02x " + "(dtop interpretation)!!!", + (uint) ptr[0]); + return NULL; + } + break; + } + { + intptr offset= ptr - buffer; + DBUG_ASSERT(offset <= UINT_MAX16); + length= translog_get_total_chunk_length(buffer, (uint16)offset); + } + printf(" Length %u\n", length); + ptr+= length; + return ptr; +} + + +/** + @brief Dump information about page with data. 
+*/ + +static void dump_datapage(uchar *buffer, File handler) +{ + uchar *ptr; + ulong offset; + uint32 page, file; + uint header_len; + printf(" Page: %ld File number: %ld\n", + (ulong) (page= uint3korr(buffer)), + (ulong) (file= uint3korr(buffer + 3))); + if (page == 0) + printf(" WARNING: page == 0!!!\n"); + if (file == 0) + printf(" WARNING: file == 0!!!\n"); + offset= page * TRANSLOG_PAGE_SIZE; + printf(" Flags (0x%x):\n", (uint) buffer[TRANSLOG_PAGE_FLAGS]); + if (buffer[TRANSLOG_PAGE_FLAGS]) + { + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC) + printf(" Page CRC\n"); + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) + printf(" Sector protection\n"); + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_RECORD_CRC) + printf(" Record CRC (WARNING: not yet implemented!!!)\n"); + if (buffer[TRANSLOG_PAGE_FLAGS] & ~(TRANSLOG_PAGE_CRC | + TRANSLOG_SECTOR_PROTECTION | + TRANSLOG_RECORD_CRC)) + { + printf(" WARNING: unknown flags (stop interpretation)!!!\n"); + return; + } + } + else + printf(" No flags\n"); + printf(" Page header length: %u\n", + (header_len= page_overhead[buffer[TRANSLOG_PAGE_FLAGS]])); + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_RECORD_CRC) + { + uint32 crc= uint4korr(buffer + TRANSLOG_PAGE_FLAGS + 1); + uint32 ccrc; + printf (" Page CRC 0x%04lx\n", (ulong) crc); + ccrc= translog_crc(buffer + header_len, TRANSLOG_PAGE_SIZE - header_len); + if (crc != ccrc) + printf(" WARNING: calculated CRC: 0x%04lx!!!\n", (ulong) ccrc); + } + if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) + { + TRANSLOG_FILE tfile; + { + uchar *table= buffer + header_len - + TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; + uint i; + printf(" Sector protection current value: 0x%02x\n", (uint) table[0]); + for (i= 1; i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; i++) + { + printf(" Sector protection in sector: 0x%02x saved value 0x%02x\n", + (uint)buffer[i * DISK_DRIVE_SECTOR_SIZE], + (uint)table[i]); + } + } + tfile.number= file; + 
bzero(&tfile.handler, sizeof(tfile.handler)); + tfile.handler.file= handler; + tfile.was_recovered= 0; + tfile.is_sync= 1; + if (translog_check_sector_protection(buffer, &tfile)) + printf(" WARNING: sector protection found problems!!!\n"); + } + ptr= buffer + header_len; + while (ptr && ptr < buffer + TRANSLOG_PAGE_SIZE) + { + printf(" Chunk %d %lld:\n", + file,((longlong) (ptr - buffer)+ offset)); + ptr= dump_chunk(buffer, ptr); + } +} + + +/** + @brief Dump information about page. +*/ + +void dump_page(uchar *buffer, File handler) +{ + if (strncmp((char*)maria_trans_file_magic, (char*)buffer, + sizeof(maria_trans_file_magic)) == 0) + { + dump_header_page(buffer); + return; + } + dump_datapage(buffer, handler); +} + + +/* + Handle backup calls +*/ + +void translog_disable_purge() +{ + mysql_mutex_lock(&log_descriptor.purger_lock); + log_purge_disabled++; + mysql_mutex_unlock(&log_descriptor.purger_lock); +} + +void translog_enable_purge() +{ + mysql_mutex_lock(&log_descriptor.purger_lock); + log_purge_disabled--; + mysql_mutex_unlock(&log_descriptor.purger_lock); +} diff --git a/storage/maria/ma_loghandler.h b/storage/maria/ma_loghandler.h new file mode 100644 index 00000000..3e5c58a8 --- /dev/null +++ b/storage/maria/ma_loghandler.h @@ -0,0 +1,538 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef _ma_loghandler_h +#define _ma_loghandler_h + +#define MB (1024UL*1024) + +/* transaction log default cache size (TODO: make it global variable) */ +#define TRANSLOG_PAGECACHE_SIZE (2*MB) +/* transaction log default file size */ +#define TRANSLOG_FILE_SIZE (1024U*MB) +/* minimum possible transaction log size */ +#define TRANSLOG_MIN_FILE_SIZE (8*MB) +/* transaction log default flags (TODO: make it global variable) */ +#define TRANSLOG_DEFAULT_FLAGS 0 + +/* + Transaction log flags. + + We allow all kind protections to be switched on together for people who + really unsure in their hardware/OS. +*/ +#define TRANSLOG_PAGE_CRC 1U +#define TRANSLOG_SECTOR_PROTECTION (1U<<1) +#define TRANSLOG_RECORD_CRC (1U<<2) +#define TRANSLOG_FLAGS_NUM ((TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | \ + TRANSLOG_RECORD_CRC) + 1) + +#define RECHEADER_READ_ERROR -1 +#define RECHEADER_READ_EOF -2 + +/* + Page size in transaction log + It should be Power of 2 and multiple of DISK_DRIVE_SECTOR_SIZE + (DISK_DRIVE_SECTOR_SIZE * 2^N) +*/ +#define TRANSLOG_PAGE_SIZE (8U*1024) + +#include "ma_loghandler_lsn.h" +#include "trnman_public.h" + +/* short transaction ID type */ +typedef uint16 SHORT_TRANSACTION_ID; + +struct st_maria_handler; + +/* Changing one of the "SIZE" below will break backward-compatibility! 
*/ +/* Length of CRC at end of pages */ +#define ROW_EXTENT_PAGE_SIZE 5 +#define ROW_EXTENT_COUNT_SIZE 2 +/* Size of file id in logs */ +#define FILEID_STORE_SIZE 2 +/* Size of page reference in log */ +#define PAGE_STORE_SIZE ROW_EXTENT_PAGE_SIZE +/* Size of page ranges in log */ +#define PAGERANGE_STORE_SIZE ROW_EXTENT_COUNT_SIZE +#define DIRPOS_STORE_SIZE 1 +#define CLR_TYPE_STORE_SIZE 1 +/* If table has live checksum we store its changes in UNDOs */ +#define HA_CHECKSUM_STORE_SIZE 4 +#define KEY_NR_STORE_SIZE 1 +#define PAGE_LENGTH_STORE_SIZE 2 + +/* Store methods to match the above sizes */ +#define fileid_store(T,A) int2store(T,A) +#define page_store(T,A) int5store(T,((ulonglong)(A))) +#define dirpos_store(T,A) ((*(uchar*) (T)) = A) +#define pagerange_store(T,A) int2store(T,A) +#define clr_type_store(T,A) ((*(uchar*) (T)) = A) +#define key_nr_store(T, A) ((*(uchar*) (T)) = A) +#define ha_checksum_store(T,A) int4store(T,A) +#define fileid_korr(P) uint2korr(P) +#define page_korr(P) uint5korr(P) +#define dirpos_korr(P) (*(const uchar *) (P)) +#define pagerange_korr(P) uint2korr(P) +#define clr_type_korr(P) (*(const uchar *) (P)) +#define key_nr_korr(P) (*(const uchar *) (P)) +#define ha_checksum_korr(P) uint4korr(P) + +/* + Length of disk drive sector size (we assume that writing it + to disk is an atomic operation) +*/ +#define DISK_DRIVE_SECTOR_SIZE 512U + +/* position reserved in an array of parts of a log record */ +#define TRANSLOG_INTERNAL_PARTS 2 + +/* types of records in the transaction log */ +/* TODO: Set numbers for these when we have all entries figured out */ + +enum translog_record_type +{ + LOGREC_RESERVED_FOR_CHUNKS23= 0, + LOGREC_REDO_INSERT_ROW_HEAD, + LOGREC_REDO_INSERT_ROW_TAIL, + LOGREC_REDO_NEW_ROW_HEAD, + LOGREC_REDO_NEW_ROW_TAIL, + LOGREC_REDO_INSERT_ROW_BLOBS, + LOGREC_REDO_PURGE_ROW_HEAD, + LOGREC_REDO_PURGE_ROW_TAIL, + LOGREC_REDO_FREE_BLOCKS, + LOGREC_REDO_FREE_HEAD_OR_TAIL, + LOGREC_REDO_DELETE_ROW, /* unused */ + 
LOGREC_REDO_UPDATE_ROW_HEAD, /* unused */ + LOGREC_REDO_INDEX, + LOGREC_REDO_INDEX_NEW_PAGE, + LOGREC_REDO_INDEX_FREE_PAGE, + LOGREC_REDO_UNDELETE_ROW, + LOGREC_CLR_END, + LOGREC_PURGE_END, + LOGREC_UNDO_ROW_INSERT, + LOGREC_UNDO_ROW_DELETE, + LOGREC_UNDO_ROW_UPDATE, + LOGREC_UNDO_KEY_INSERT, + LOGREC_UNDO_KEY_INSERT_WITH_ROOT, + LOGREC_UNDO_KEY_DELETE, + LOGREC_UNDO_KEY_DELETE_WITH_ROOT, + LOGREC_PREPARE, + LOGREC_PREPARE_WITH_UNDO_PURGE, + LOGREC_COMMIT, + LOGREC_COMMIT_WITH_UNDO_PURGE, + LOGREC_CHECKPOINT, + LOGREC_REDO_CREATE_TABLE, + LOGREC_REDO_RENAME_TABLE, + LOGREC_REDO_DROP_TABLE, + LOGREC_REDO_DELETE_ALL, + LOGREC_REDO_REPAIR_TABLE, + LOGREC_FILE_ID, + LOGREC_LONG_TRANSACTION_ID, + LOGREC_INCOMPLETE_LOG, + LOGREC_INCOMPLETE_GROUP, + LOGREC_UNDO_BULK_INSERT, + LOGREC_REDO_BITMAP_NEW_PAGE, + LOGREC_IMPORTED_TABLE, + LOGREC_DEBUG_INFO, + LOGREC_FIRST_FREE, + LOGREC_RESERVED_FUTURE_EXTENSION= 63 +}; +#define LOGREC_NUMBER_OF_TYPES 64 /* Maximum, can't be extended */ + +/* Type of operations in LOGREC_REDO_INDEX */ + +enum en_key_op +{ + KEY_OP_NONE, /* Not used */ + KEY_OP_OFFSET, /* Set current position */ + KEY_OP_SHIFT, /* Shift up/or down at current position */ + KEY_OP_CHANGE, /* Change data at current position */ + KEY_OP_ADD_PREFIX, /* Insert data at start of page */ + KEY_OP_DEL_PREFIX, /* Delete data at start of page */ + KEY_OP_ADD_SUFFIX, /* Insert data at end of page */ + KEY_OP_DEL_SUFFIX, /* Delete data at end of page */ + KEY_OP_CHECK, /* For debugging; CRC of used part of page */ + KEY_OP_MULTI_COPY, /* List of memcpy()s with fixed-len sources in page */ + KEY_OP_SET_PAGEFLAG, /* Set pageflag from next byte */ + KEY_OP_COMPACT_PAGE, /* Compact key page */ + KEY_OP_MAX_PAGELENGTH, /* Set page to max page length */ + KEY_OP_DEBUG, /* Entry for storing what triggered redo_index */ + KEY_OP_DEBUG_2 /* Entry for pagelengths */ +}; + +enum en_key_debug +{ + KEY_OP_DEBUG_RTREE_COMBINE, /* 0 */ + KEY_OP_DEBUG_RTREE_SPLIT, /* 1 */ + 
KEY_OP_DEBUG_RTREE_SET_KEY, /* 2 */ + KEY_OP_DEBUG_FATHER_CHANGED_1, /* 3 */ + KEY_OP_DEBUG_FATHER_CHANGED_2, /* 4 */ + KEY_OP_DEBUG_LOG_SPLIT, /* 5 */ + KEY_OP_DEBUG_LOG_ADD_1, /* 6 */ + KEY_OP_DEBUG_LOG_ADD_2, /* 7 */ + KEY_OP_DEBUG_LOG_ADD_3, /* 8 */ + KEY_OP_DEBUG_LOG_ADD_4, /* 9 */ + KEY_OP_DEBUG_LOG_PREFIX_1, /* 10 */ + KEY_OP_DEBUG_LOG_PREFIX_2, /* 11 */ + KEY_OP_DEBUG_LOG_PREFIX_3, /* 12 */ + KEY_OP_DEBUG_LOG_PREFIX_4, /* 13 */ + KEY_OP_DEBUG_LOG_PREFIX_5, /* 14 */ + KEY_OP_DEBUG_LOG_DEL_CHANGE_1, /* 15 */ + KEY_OP_DEBUG_LOG_DEL_CHANGE_2, /* 16 */ + KEY_OP_DEBUG_LOG_DEL_CHANGE_3, /* 17 */ + KEY_OP_DEBUG_LOG_DEL_CHANGE_RT, /* 18 */ + KEY_OP_DEBUG_LOG_DEL_PREFIX, /* 19 */ + KEY_OP_DEBUG_LOG_MIDDLE /* 20 */ +}; + + +enum translog_debug_info_type +{ + LOGREC_DEBUG_INFO_QUERY +}; + +/* Size of log file; One log file is restricted to 4G */ +typedef uint32 translog_size_t; + +#define TRANSLOG_RECORD_HEADER_MAX_SIZE 1024U + +typedef struct st_translog_group_descriptor +{ + TRANSLOG_ADDRESS addr; + uint8 num; +} TRANSLOG_GROUP; + + +typedef struct st_translog_header_buffer +{ + /* LSN of the read record */ + LSN lsn; + /* array of groups descriptors, can be used only if groups_no > 0 */ + TRANSLOG_GROUP *groups; + /* short transaction ID or 0 if it has no sense for the record */ + SHORT_TRANSACTION_ID short_trid; + /* + The Record length in buffer (including read header, but excluding + hidden part of record (type, short TrID, length) + */ + translog_size_t record_length; + /* + Buffer for write decoded header of the record (depend on the record + type) + */ + uchar header[TRANSLOG_RECORD_HEADER_MAX_SIZE]; + /* number of groups listed in */ + uint groups_no; + /* in multi-group number of chunk0 pages (valid only if groups_no > 0) */ + uint chunk0_pages; + /* type of the read record */ + enum translog_record_type type; + /* chunk 0 data address (valid only if groups_no > 0) */ + TRANSLOG_ADDRESS chunk0_data_addr; + /* + Real compressed LSN(s) size economy (<number of 
LSN(s)>*7 - <real_size>) + */ + int16 compressed_LSN_economy; + /* short transaction ID or 0 if it has no sense for the record */ + uint16 non_header_data_start_offset; + /* non read body data length in this first chunk */ + uint16 non_header_data_len; + /* chunk 0 data size (valid only if groups_no > 0) */ + uint16 chunk0_data_len; +} TRANSLOG_HEADER_BUFFER; + + +typedef struct st_translog_scanner_data +{ + uchar buffer[TRANSLOG_PAGE_SIZE]; /* buffer for page content */ + TRANSLOG_ADDRESS page_addr; /* current page address */ + /* end of the log which we saw last time */ + TRANSLOG_ADDRESS horizon; + TRANSLOG_ADDRESS last_file_page; /* Last page on in this file */ + uchar *page; /* page content pointer */ + /* direct link on the current page or NULL if not supported/requested */ + PAGECACHE_BLOCK_LINK *direct_link; + /* offset of the chunk in the page */ + translog_size_t page_offset; + /* set horizon only once at init */ + my_bool fixed_horizon; + /* try to get direct link on the page if it is possible */ + my_bool use_direct_link; +} TRANSLOG_SCANNER_DATA; + + +typedef struct st_translog_reader_data +{ + TRANSLOG_HEADER_BUFFER header; /* Header */ + TRANSLOG_SCANNER_DATA scanner; /* chunks scanner */ + translog_size_t body_offset; /* current chunk body offset */ + /* data offset from the record beginning */ + translog_size_t current_offset; + /* number of bytes read in header */ + uint16 read_header; + uint16 chunk_size; /* current chunk size */ + uint current_group; /* current group */ + uint current_chunk; /* current chunk in the group */ + my_bool eor; /* end of the record */ +} TRANSLOG_READER_DATA; + +C_MODE_START + +/* Records types for unittests */ +#define LOGREC_FIXED_RECORD_0LSN_EXAMPLE 1 +#define LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE 2 +#define LOGREC_FIXED_RECORD_1LSN_EXAMPLE 3 +#define LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE 4 +#define LOGREC_FIXED_RECORD_2LSN_EXAMPLE 5 +#define LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE 6 + +extern void 
translog_example_table_init(); +extern void translog_table_init(); +#define translog_init(D,M,V,I,C,F,R) \ + translog_init_with_table(D,M,V,I,C,F,R,&translog_table_init,0) +extern my_bool translog_init_with_table(const char *directory, + uint32 log_file_max_size, + uint32 server_version, + uint32 server_id, + PAGECACHE *pagecache, + uint flags, + my_bool readonly, + void (*init_table_func)(), + my_bool no_error); +#ifndef DBUG_OFF +void check_translog_description_table(int num); +#endif + +extern my_bool +translog_write_record(LSN *lsn, enum translog_record_type type, TRN *trn, + MARIA_HA *tbl_info, + translog_size_t rec_len, uint part_no, + LEX_CUSTRING *parts_data, uchar *store_share_id, + void *hook_arg); + +extern void translog_destroy(); + +extern int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff); + +extern void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff); + +extern translog_size_t translog_read_record(LSN lsn, + translog_size_t offset, + translog_size_t length, + uchar *buffer, + struct st_translog_reader_data + *data); + +extern my_bool translog_flush(TRANSLOG_ADDRESS lsn); + +extern my_bool translog_scanner_init(LSN lsn, + my_bool fixed_horizon, + struct st_translog_scanner_data *scanner, + my_bool use_direct_link); +extern void translog_destroy_scanner(TRANSLOG_SCANNER_DATA *scanner); + +extern int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner, + TRANSLOG_HEADER_BUFFER *buff); +extern LSN translog_get_file_max_lsn_stored(uint32 file); +extern my_bool translog_purge(TRANSLOG_ADDRESS low); +extern my_bool translog_is_file(uint file_no); +extern void translog_lock(); +extern void translog_unlock(); +extern void translog_lock_handler_assert_owner(); +extern TRANSLOG_ADDRESS translog_get_horizon(); +extern TRANSLOG_ADDRESS translog_get_horizon_no_lock(); +extern int translog_assign_id_to_share(struct st_maria_handler *tbl_info, + TRN *trn); +extern void translog_deassign_id_from_share(struct st_maria_share 
*share); +extern void +translog_assign_id_to_share_from_recovery(struct st_maria_share *share, + uint16 id); +extern my_bool translog_walk_filenames(const char *directory, + my_bool (*callback)(const char *, + const char *)); +extern void dump_page(uchar *buffer, File handler); +extern my_bool translog_log_debug_info(TRN *trn, + enum translog_debug_info_type type, + uchar *info, size_t length); +extern void translog_disable_purge(void); +extern void translog_enable_purge(void); + +enum enum_translog_status +{ + TRANSLOG_UNINITED, /* no initialization done or error during initialization */ + TRANSLOG_OK, /* transaction log is functioning */ + TRANSLOG_READONLY, /* read only mode due to write errors */ + TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */ +}; +extern enum enum_translog_status translog_status; +extern ulonglong translog_syncs; /* Number of sync()s */ + +void translog_soft_sync(my_bool mode); +void translog_hard_group_commit(my_bool mode); +int translog_soft_sync_start(void); +void translog_soft_sync_end(void); +void translog_sync(); +void translog_set_group_commit_interval(uint32 interval); +extern void check_skipped_lsn(MARIA_HA *info, LSN lsn, my_bool index_file, + pgcache_page_no_t page); + +/* + all the rest added because of recovery; should we make + ma_loghandler_for_recovery.h ? +*/ + +/* + Information from transaction log file header +*/ + +typedef struct st_loghandler_file_info +{ + /* + LSN_IMPOSSIBLE for current file (not finished file). + Maximum LSN of the record which parts stored in the + file. 
+ */ + LSN max_lsn; + ulonglong timestamp; /* Time stamp */ + ulong maria_version; /* Version of maria loghandler */ + ulong mysql_version; /* Version of mysql server */ + ulong server_id; /* Server ID */ + ulong page_size; /* Loghandler page size */ + ulong file_number; /* Number of the file (from the file header) */ +} LOGHANDLER_FILE_INFO; + +#define SHARE_ID_MAX 65535 /* array's size */ + +extern void translog_fill_overhead_table(); +extern void translog_interpret_file_header(LOGHANDLER_FILE_INFO *desc, + uchar *page_buff); +extern LSN translog_first_lsn_in_log(); +extern LSN translog_first_theoretical_lsn(); +extern LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon); +extern my_bool translog_purge_at_flush(); +extern uint32 translog_get_first_file(TRANSLOG_ADDRESS horizon); +extern uint32 translog_get_first_needed_file(); +extern char *translog_filename_by_fileno(uint32 file_no, char *path); +extern void translog_set_file_size(uint32 size); + +/* record parts descriptor */ +struct st_translog_parts +{ + /* full record length */ + translog_size_t record_length; + /* full record length with chunk headers */ + translog_size_t total_record_length; + /* current part index */ + uint current; + /* total number of elements in parts */ + uint elements; + /* array of parts */ + LEX_CUSTRING *parts; +}; + +typedef my_bool(*prewrite_rec_hook) (enum translog_record_type type, + TRN *trn, + struct st_maria_handler *tbl_info, + void *hook_arg); + +typedef my_bool(*inwrite_rec_hook) (enum translog_record_type type, + TRN *trn, + struct st_maria_handler *tbl_info, + LSN *lsn, void *hook_arg); + +typedef uint16(*read_rec_hook) (enum translog_record_type type, + uint16 read_length, uchar *read_buff, + uchar *decoded_buff); + + +/* record classes */ +enum record_class +{ + LOGRECTYPE_NOT_ALLOWED, + LOGRECTYPE_VARIABLE_LENGTH, + LOGRECTYPE_PSEUDOFIXEDLENGTH, + LOGRECTYPE_FIXEDLENGTH +}; + +enum enum_record_in_group { + LOGREC_NOT_LAST_IN_GROUP= 0, 
LOGREC_LAST_IN_GROUP, LOGREC_IS_GROUP_ITSELF +}; + +/* + Descriptor of log record type +*/ +typedef struct st_log_record_type_descriptor +{ + /* internal class of the record */ + enum record_class rclass; + /* + length for fixed-size record, pseudo-fixed record + length with uncompressed LSNs + */ + uint16 fixed_length; + /* how much record body (belonged to headers too) read with headers */ + uint16 read_header_len; + /* HOOK for writing the record called before lock */ + prewrite_rec_hook prewrite_hook; + /* HOOK for writing the record called when LSN is known, inside lock */ + inwrite_rec_hook inwrite_hook; + /* HOOK for reading headers */ + read_rec_hook read_hook; + /* + For pseudo fixed records number of compressed LSNs followed by + system header + */ + int16 compressed_LSN; + /* the rest is for maria_read_log & Recovery */ + /** @brief for debug error messages or "maria_read_log" command-line tool */ + const char *name; + enum enum_record_in_group record_in_group; + /* a function to execute when we see the record during the REDO phase */ + int (*record_execute_in_redo_phase)(const TRANSLOG_HEADER_BUFFER *); + /* a function to execute when we see the record during the UNDO phase */ + int (*record_execute_in_undo_phase)(const TRANSLOG_HEADER_BUFFER *, TRN *); +} LOG_DESC; + +extern LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES]; + +typedef enum +{ + TRANSLOG_GCOMMIT_NONE, + TRANSLOG_GCOMMIT_HARD, + TRANSLOG_GCOMMIT_SOFT +} enum_maria_group_commit; +extern ulong maria_group_commit; +extern ulong maria_group_commit_interval; +typedef enum +{ + TRANSLOG_PURGE_IMMIDIATE, + TRANSLOG_PURGE_EXTERNAL, + TRANSLOG_PURGE_ONDEMAND +} enum_maria_translog_purge_type; +extern ulong log_purge_type; +extern ulong log_file_size; +extern uint log_purge_disabled; /* For backup */ + +typedef enum +{ + TRANSLOG_SYNC_DIR_NEVER, + TRANSLOG_SYNC_DIR_NEWFILE, + TRANSLOG_SYNC_DIR_ALWAYS +} enum_maria_sync_log_dir; +extern ulong sync_log_dir; + +C_MODE_END +#endif diff 
--git a/storage/maria/ma_loghandler_lsn.h b/storage/maria/ma_loghandler_lsn.h new file mode 100644 index 00000000..c5bd76bb --- /dev/null +++ b/storage/maria/ma_loghandler_lsn.h @@ -0,0 +1,115 @@ +/* Copyright (C) 2007 MySQL AB & Sanja Belkin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef _ma_loghandler_lsn_h +#define _ma_loghandler_lsn_h + +/* + Transaction log record address: + file_no << 32 | offset + file_no is only 3 bytes so we can use signed integer to make + comparison simpler. 
+*/ +typedef int64 TRANSLOG_ADDRESS; + +/* + Compare addresses + A1 > A2 -> result > 0 + A1 == A2 -> 0 + A1 < A2 -> result < 0 +*/ +#define cmp_translog_addr(A1,A2) ((A1) - (A2)) + +/* + TRANSLOG_ADDRESS is just address of some byte in the log (usually some + chunk) + LSN used where address of some record in the log needed (not just any + address) +*/ +typedef TRANSLOG_ADDRESS LSN; + +/* Gets file number part of a LSN/log address */ +#define LSN_FILE_NO(L) (uint32) ((L) >> 32) + +/* Gets raw file number part of a LSN/log address */ +#define LSN_FILE_NO_PART(L) ((L) & ((int64)0xFFFFFF00000000LL)) + +/* Parts of LSN for printing */ +#define LSN_IN_PARTS(L) (uint)LSN_FILE_NO(L),(uint)LSN_OFFSET(L) +#define LSN_FMT "(%u,0x%x)" + +/* Gets record offset of a LSN/log address */ +#define LSN_OFFSET(L) (ulong) ((L) & 0xFFFFFFFFL) + +/* Makes lsn/log address from file number and record offset */ +#define MAKE_LSN(F,S) ((LSN) ((((uint64)(F)) << 32) | (S))) + +/* checks LSN */ +#define LSN_VALID(L) \ + ((LSN_FILE_NO_PART(L) != FILENO_IMPOSSIBLE) && \ + (LSN_OFFSET(L) != LOG_OFFSET_IMPOSSIBLE)) + +/* size of stored LSN on a disk, don't change it! */ +#define LSN_STORE_SIZE 7 + +/* Puts LSN into buffer (dst) */ +#define lsn_store(dst, lsn) \ + do { \ + int3store((dst), LSN_FILE_NO(lsn)); \ + int4store((char*)(dst) + 3, LSN_OFFSET(lsn)); \ + } while (0) + +/* Unpacks LSN from the buffer (P) */ +#define lsn_korr(P) MAKE_LSN(uint3korr(P), uint4korr((const char*)(P) + 3)) + +/* what we need to add to LSN to increase it on one file */ +#define LSN_ONE_FILE ((int64)0x100000000LL) + +#define LSN_REPLACE_OFFSET(L, S) (LSN_FILE_NO_PART(L) | (S)) + +/* + an 8-byte type whose most significant uchar is used for "flags"; 7 + other bytes are a LSN. 
+*/ +typedef LSN LSN_WITH_FLAGS; +#define LSN_WITH_FLAGS_TO_LSN(x) (x & 0x00FFFFFFFFFFFFFFULL) +#define LSN_WITH_FLAGS_TO_FLAGS(x) (x & 0xFF00000000000000ULL) + +#define FILENO_IMPOSSIBLE 0 /**< log file's numbering starts at 1 */ +#define LOG_OFFSET_IMPOSSIBLE 0 /**< log always has a header */ +#define LSN_IMPOSSIBLE ((LSN)0) +/* following LSN also is impossible */ +#define LSN_ERROR ((LSN)1) + +/** @brief some impossible LSN serve as markers */ + +/** + When table is modified by maria_chk, or auto-zerofilled, old REDOs don't + apply, table is freshly born again somehow: its state's LSNs need to be + updated to the new instance which receives this table. +*/ +#define LSN_NEEDS_NEW_STATE_LSNS ((LSN)2) + +/** + @brief the maximum valid LSN. + Unlike ULONGLONG_MAX, it can be safely used in comparison with valid LSNs + (ULONGLONG_MAX is too big for correctness of cmp_translog_addr()). +*/ +#define LSN_MAX (LSN)0x00FFFFFFFFFFFFFFULL + +/* Max LSN error to print on check or recovery */ +#define MAX_LSN_ERRORS 10 + +#endif diff --git a/storage/maria/ma_norec.c b/storage/maria/ma_norec.c new file mode 100644 index 00000000..7bdde9fc --- /dev/null +++ b/storage/maria/ma_norec.c @@ -0,0 +1,66 @@ +/* Copyright (C) 2010 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Functions to handle tables with no row data (only index) + This is useful when you just want to do key reads or want to use + the index to check against duplicates. +*/ + +#include "maria_def.h" + +my_bool _ma_write_no_record(MARIA_HA *info __attribute__((unused)), + const uchar *record __attribute__((unused))) +{ + return 0; +} + +my_bool _ma_update_no_record(MARIA_HA *info __attribute__((unused)), + MARIA_RECORD_POS pos __attribute__((unused)), + const uchar *oldrec __attribute__((unused)), + const uchar *record __attribute__((unused))) +{ + return 1; +} + + +my_bool _ma_delete_no_record(MARIA_HA *info __attribute__((unused)), + const uchar *record __attribute__((unused))) +{ + return 1; +} + + +int _ma_read_no_record(MARIA_HA *info __attribute__((unused)), + uchar *record __attribute__((unused)), + MARIA_RECORD_POS pos __attribute__((unused))) +{ + return HA_ERR_WRONG_COMMAND; +} + + +int _ma_read_rnd_no_record(MARIA_HA *info __attribute__((unused)), + uchar *buf __attribute__((unused)), + MARIA_RECORD_POS filepos __attribute__((unused)), + my_bool skip_deleted_blocks __attribute__((unused))) +{ + return HA_ERR_WRONG_COMMAND; +} + +my_off_t _ma_no_keypos_to_recpos(MARIA_SHARE *share __attribute__ ((unused)), + my_off_t pos __attribute__ ((unused))) +{ + return 0; +} diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c new file mode 100644 index 00000000..ad98a534 --- /dev/null +++ b/storage/maria/ma_open.c @@ -0,0 +1,2200 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2009, 2022, MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the 
/* open an Aria table */

#include "ma_fulltext.h"
#include "ma_sp_defs.h"
#include "ma_rt_index.h"
#include "ma_blockrec.h"
#include "trnman.h"
#include "ma_trnman.h"
#include <m_ctype.h>
#include "ma_crypt.h"
#include "s3_func.h"

#ifdef _WIN32
#include <fcntl.h>
#endif

/* Forward declarations of helpers local to this file */
static void setup_key_functions(MARIA_KEYDEF *keyinfo);
static my_bool maria_scan_init_dummy(MARIA_HA *info);
static void maria_scan_end_dummy(MARIA_HA *info);
static my_bool maria_once_init_dummy(MARIA_SHARE *, File);
static my_bool maria_once_end_dummy(MARIA_SHARE *);
static uchar *_ma_state_info_read(uchar *, MARIA_STATE_INFO *, myf);

/*
  Copy 'size' bytes from 'pos' into 'to' and advance 'pos' by 'size'.
  Expands to a brace block and evaluates 'pos' and 'size' more than once,
  so the arguments must be free of side effects.
*/
#define get_next_element(to,pos,size) { memcpy((char*) to,pos,(size_t) size); \
                                        pos+=size;}


/*
  If 'pos' is beyond 'end_pos', report HA_ERR_CRASHED through
  _ma_set_fatal_error_with_share() and jump to the label 'err', which the
  function using the macro has to provide.  Expands to a bare 'if'
  statement, so it must not be followed by an 'else'.
*/
#define disk_pos_assert(share, pos, end_pos)     \
if (pos > end_pos)             \
{                              \
  _ma_set_fatal_error_with_share(share, HA_ERR_CRASHED);       \
  goto err;                    \
}
+******************************************************************************/ + +MARIA_HA *_ma_test_if_reopen(const char *filename) +{ + LIST *pos; + + for (pos=maria_open_list ; pos ; pos=pos->next) + { + MARIA_HA *info=(MARIA_HA*) pos->data; + MARIA_SHARE *share= info->s; + if (!strcmp(share->unique_file_name.str,filename) && share->last_version) + return info; + } + return 0; +} + + +/* + Open a new instance of an already opened Maria table + + SYNOPSIS + maria_clone_internal() + share Share of already open table + mode Mode of table (O_RDONLY | O_RDWR) + data_file Filedescriptor of data file to use < 0 if one should open + open it. + internal_table <> 0 if this is an internal temporary table + + RETURN + # Maria handler + 0 Error +*/ + +static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, + int mode, File data_file, + uint internal_table, + struct ms3_st *s3) +{ + int save_errno; + uint errpos; + MARIA_HA info,*m_info; + my_bitmap_map *changed_fields_bitmap; + myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0); + DBUG_ENTER("maria_clone_internal"); + + errpos= 0; + bzero((uchar*) &info,sizeof(info)); + + if (mode == O_RDWR && share->mode == O_RDONLY) + { + my_errno=EACCES; /* Can't open in write mode */ + goto err; + } + if (data_file >= 0) + info.dfile.file= data_file; + else if (_ma_open_datafile(&info, share)) + goto err; + errpos= 5; + + /* alloc and set up private structure parts */ + if (!my_multi_malloc(PSI_INSTRUMENT_ME, flag, + &m_info,sizeof(MARIA_HA), + &info.blobs,sizeof(MARIA_BLOB)*share->base.blobs, + &info.buff,(share->base.max_key_block_length*2+ + share->base.max_key_length), + &info.lastkey_buff,share->base.max_key_length*3, + &info.first_mbr_key, share->base.max_key_length, + &info.maria_rtree_recursion_state, + share->have_rtree ? 
1024 : 0, + &changed_fields_bitmap, + bitmap_buffer_size(share->base.fields), + NullS)) + goto err; + errpos= 6; + + info.s3= s3; + memcpy(info.blobs,share->blobs,sizeof(MARIA_BLOB)*share->base.blobs); + info.lastkey_buff2= info.lastkey_buff + share->base.max_key_length; + info.last_key.data= info.lastkey_buff; + + info.s=share; + info.cur_row.lastpos= HA_OFFSET_ERROR; + /* Impossible first index to force initialization in _ma_check_index() */ + info.lastinx= ~0; + info.update= (short) (HA_STATE_NEXT_FOUND+HA_STATE_PREV_FOUND); + info.opt_flag=READ_CHECK_USED; + info.this_unique= (ulong) info.dfile.file; /* Uniq number in process */ +#ifdef MARIA_EXTERNAL_LOCKING + if (share->data_file_type == COMPRESSED_RECORD) + info.this_unique= share->state.unique; + info.this_loop=0; /* Update counter */ + info.last_unique= share->state.unique; + info.last_loop= share->state.update_count; +#endif + info.errkey= -1; + info.page_changed= 1; + info.autocommit= 1; + info.keyread_buff= info.buff + share->base.max_key_block_length; + + info.lock_type= F_UNLCK; + if (share->options & HA_OPTION_TMP_TABLE) + info.lock_type= F_WRLCK; + + _ma_set_data_pagecache_callbacks(&info.dfile, share); + my_bitmap_init(&info.changed_fields, changed_fields_bitmap, share->base.fields); + if ((*share->init)(&info)) + goto err; + + /* The following should be big enough for all pinning purposes */ + if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &info.pinned_pages, + sizeof(MARIA_PINNED_PAGE), + MY_MAX(share->base.blobs*2 + 4, + MARIA_MAX_TREE_LEVELS*3), 16, flag)) + goto err; + + + mysql_mutex_lock(&share->intern_lock); + info.read_record= share->read_record; + share->reopen++; + share->write_flag=MYF(MY_NABP | MY_WAIT_IF_FULL); + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + info.lock_type=F_RDLCK; + share->r_locks++; + share->tot_locks++; + } + if ((share->options & HA_OPTION_DELAY_KEY_WRITE) && + maria_delay_key_write) + share->delay_key_write=1; + + if (!share->now_transactional) /* If not 
transctional table */ + { + /* Pagecache requires access to info->trn->rec_lsn */ + _ma_set_tmp_trn_for_table(&info, &dummy_transaction_object); + info.state= &share->state.state; /* Change global values by default */ + } + else + { + info.state= &share->state.common; + *info.state= share->state.state; /* Initial values */ + } + info.state_start= info.state; /* Initial values */ + + mysql_mutex_unlock(&share->intern_lock); + + /* Allocate buffer for one record */ + /* prerequisites: info->rec_buffer == 0 && info->rec_buff_size == 0 */ + if (_ma_alloc_buffer(&info.rec_buff, &info.rec_buff_size, + share->base.default_rec_buff_size, flag)) + goto err; + + bzero(info.rec_buff, share->base.default_rec_buff_size); + + *m_info=info; + thr_lock_data_init(&share->lock,&m_info->lock,(void*) m_info); + + if (share->options & HA_OPTION_TMP_TABLE) + m_info->lock.type= TL_WRITE; + + if (!internal_table) + { + m_info->open_list.data= m_info->share_list.data= (void*) m_info; + maria_open_list= list_add(maria_open_list, &m_info->open_list); + share->open_list= list_add(share->open_list, &m_info->share_list); + } + else + { + /* We don't need to mark internal temporary tables as changed on disk */ + share->internal_table= 1; + share->global_changed= 1; + } + DBUG_RETURN(m_info); + +err: + DBUG_PRINT("error", ("error: %d", my_errno)); + save_errno=my_errno ? 
my_errno : HA_ERR_END_OF_FILE; + if ((save_errno == HA_ERR_CRASHED) || + (save_errno == HA_ERR_CRASHED_ON_USAGE) || + (save_errno == HA_ERR_CRASHED_ON_REPAIR)) + _ma_report_error(save_errno, &share->open_file_name, + MYF(ME_ERROR_LOG)); + switch (errpos) { + case 6: + (*share->end)(&info); + delete_dynamic(&info.pinned_pages); + my_free(m_info->s3); + my_free(m_info); + /* fall through */ + case 5: + if (data_file < 0) + mysql_file_close(info.dfile.file, MYF(0)); + break; + } + my_errno=save_errno; + DBUG_RETURN (NULL); +} /* maria_clone_internal */ + + +/****************************************************************************** + open a MARIA table + + See my_base.h for the handle_locking argument + if handle_locking and HA_OPEN_ABORT_IF_CRASHED then abort if the table + is marked crashed or if we are not using locking and the table doesn't + have an open count of 0. +******************************************************************************/ + +MARIA_HA *maria_open(const char *name, int mode, uint open_flags, + S3_INFO *s3) +{ + int open_mode= 0,save_errno; + uint i,j,len,errpos,head_length,base_pos,keys, realpath_err, + key_parts,base_key_parts,unique_key_parts,fulltext_keys,uniques; + uint internal_table= MY_TEST(open_flags & HA_OPEN_INTERNAL_TABLE); + myf common_flag= open_flags & HA_OPEN_TMP_TABLE ? 
MY_THREAD_SPECIFIC : 0; + uint file_version; + size_t info_length; + char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN], + data_name[FN_REFLEN]; + uchar *UNINIT_VAR(disk_cache), *disk_pos, *end_pos; + MARIA_HA info, *UNINIT_VAR(m_info), *old_info= NULL; + MARIA_SHARE share_buff,*share; + double *rec_per_key_part; + ulong *nulls_per_key_part; + my_off_t key_root[HA_MAX_POSSIBLE_KEY]; + ulonglong max_key_file_length, max_data_file_length; + my_bool versioning= 1, born_transactional; + File data_file= -1, kfile= -1; + struct ms3_st *s3_client= 0; + S3_INFO *share_s3= 0; + S3_BLOCK index_header; + DBUG_ENTER("maria_open"); + + errpos= 0; + head_length=sizeof(share_buff.state.header); + bzero((uchar*) &info,sizeof(info)); + bzero((uchar*) &index_header, sizeof(index_header)); + +#ifndef WITH_S3_STORAGE_ENGINE + DBUG_ASSERT(!s3); +#else + if (!s3) +#endif /* WITH_S3_STORAGE_ENGINE */ + { + realpath_err= my_realpath(name_buff, fn_format(org_name, name, "", + MARIA_NAME_IEXT, + MY_UNPACK_FILENAME),MYF(0)); + if (realpath_err > 0) /* File not found, no point in looking further. 
*/ + { + DBUG_RETURN(NULL); + } + + if (my_is_symlink(org_name) && + (realpath_err || mysys_test_invalid_symlink(name_buff))) + { + my_errno= HA_WRONG_CREATE_OPTION; + DBUG_RETURN(0); + } + } +#ifdef WITH_S3_STORAGE_ENGINE + else + { + strmake(name_buff, name, sizeof(name_buff)-1); /* test_if_reopen() */ + if (!(s3_client= s3f.open_connection(s3))) + { + internal_table= 1; /* Avoid unlock on error */ + goto err; + } + } +#endif /* WITH_S3_STORAGE_ENGINE */ + + if (!internal_table) + mysql_mutex_lock(&THR_LOCK_maria); + if ((open_flags & HA_OPEN_COPY) || + (internal_table || !(old_info=_ma_test_if_reopen(name_buff)))) + { + share= &share_buff; + bzero((uchar*) &share_buff,sizeof(share_buff)); + share_buff.state.key_root=key_root; + share_buff.pagecache= multi_pagecache_search((uchar*) name_buff, + (uint) strlen(name_buff), + maria_pagecache); + + if (!s3) + { + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_open", + if (strstr(name, "/t1")) + { + my_errno= HA_ERR_CRASHED; + goto err; + }); + DEBUG_SYNC_C("mi_open_kfile"); + if ((kfile=mysql_file_open(key_file_kfile, name_buff, + (open_mode=O_RDWR) | O_SHARE | O_NOFOLLOW | O_CLOEXEC, + MYF(common_flag | MY_NOSYMLINKS))) < 0) + { + if ((errno != EROFS && errno != EACCES) || + mode != O_RDONLY || + (kfile=mysql_file_open(key_file_kfile, name_buff, + (open_mode=O_RDONLY) | O_SHARE | O_NOFOLLOW | O_CLOEXEC, + MYF(common_flag | MY_NOSYMLINKS))) < 0) + goto err; + } + errpos= 1; + if (mysql_file_pread(kfile,share->state.header.file_version, head_length, + 0, MYF(MY_NABP))) + { + my_errno= HA_ERR_NOT_A_TABLE; + goto err; + } + } +#ifdef WITH_S3_STORAGE_ENGINE + else + { + open_mode= mode; + errpos= 1; + if (s3f.set_database_and_table_from_path(s3, name_buff)) + { + my_printf_error(HA_ERR_NO_SUCH_TABLE, + "Can't find database and path from %s", MYF(0), + name_buff); + my_errno= HA_ERR_NO_SUCH_TABLE; + goto err; + } + if (!(share_s3= share->s3_path= s3f.info_copy(s3))) + goto err; /* EiOM */ + + /* Check if table has changed 
in S3 */ + if (s3f.check_frm_version(s3_client, share_s3) == 1) + { + my_errno= HA_ERR_TABLE_DEF_CHANGED; + goto err; + } + + if (s3f.read_index_header(s3_client, share_s3, &index_header)) + goto err; + if (index_header.length < head_length) + { + my_errno=HA_ERR_NOT_A_TABLE; + goto err; + } + memcpy(share->state.header.file_version, index_header.str, + head_length); + kfile= s3f.unique_file_number(); + } +#endif /* WITH_S3_STORAGE_ENGINE */ + + share->mode=open_mode; + if (memcmp(share->state.header.file_version, maria_file_magic, 4)) + { + DBUG_PRINT("error",("Wrong header in %s",name_buff)); + DBUG_DUMP("error_dump", share->state.header.file_version, + head_length); + my_errno=HA_ERR_NOT_A_TABLE; + goto err; + } + share->options= mi_uint2korr(share->state.header.options); + if (share->options & + ~(HA_OPTION_PACK_RECORD | HA_OPTION_PACK_KEYS | + HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA | + HA_OPTION_TEMP_COMPRESS_RECORD | HA_OPTION_CHECKSUM | + HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE | + HA_OPTION_RELIES_ON_SQL_LAYER | HA_OPTION_NULL_FIELDS | + HA_OPTION_PAGE_CHECKSUM)) + { + DBUG_PRINT("error",("wrong options: 0x%lx", share->options)); + my_errno=HA_ERR_NEW_FILE; + goto err; + } + if ((share->options & HA_OPTION_RELIES_ON_SQL_LAYER) && + ! 
(open_flags & HA_OPEN_FROM_SQL_LAYER)) + { + DBUG_PRINT("error", ("table cannot be opened from non-sql layer")); + my_errno= HA_ERR_UNSUPPORTED; + goto err; + } + if (!s3) + { + /* Don't call realpath() if the name can't be a link */ + if (!strcmp(name_buff, org_name) || + my_readlink(index_name, org_name, MYF(0)) == -1) + (void) strmov(index_name, org_name); + *strrchr(org_name, FN_EXTCHAR)= '\0'; + (void) fn_format(data_name,org_name,"",MARIA_NAME_DEXT, + MY_APPEND_EXT|MY_UNPACK_FILENAME); + if (my_is_symlink(data_name)) + { + if (my_realpath(data_name, data_name, MYF(0))) + goto err; + if (mysys_test_invalid_symlink(data_name)) + { + my_errno= HA_WRONG_CREATE_OPTION; + goto err; + } + share->mode|= O_NOFOLLOW; /* all symlinks are resolved by realpath() */ + } + } + else + { + /* Don't show DIRECTORY in show create table */ + index_name[0]= data_name[0]= 0; + } + + info_length=mi_uint2korr(share->state.header.header_length); + base_pos= mi_uint2korr(share->state.header.base_pos); + + /* + Allocate space for header information and for data that is too + big to keep on stack + */ + if (!(disk_cache= my_malloc(PSI_INSTRUMENT_ME, info_length+128, + MYF(MY_WME | common_flag)))) + { + my_errno=ENOMEM; + goto err; + } + + end_pos=disk_cache+info_length; + errpos= 3; + if (!s3) + { + if (mysql_file_pread(kfile, disk_cache, info_length, 0L, MYF(MY_NABP))) + { + _ma_set_fatal_error_with_share(share, HA_ERR_CRASHED); + goto err; + } + } +#ifdef WITH_S3_STORAGE_ENGINE + else + { + if (index_header.length < info_length) + { + my_errno=HA_ERR_NOT_A_TABLE; + goto err; + } + memcpy(disk_cache, index_header.str, info_length); + } +#endif /* WITH_S3_STORAGE_ENGINE */ + + len=mi_uint2korr(share->state.header.state_info_length); + keys= (uint) share->state.header.keys; + uniques= (uint) share->state.header.uniques; + fulltext_keys= (uint) share->state.header.fulltext_keys; + base_key_parts= key_parts= mi_uint2korr(share->state.header.key_parts); + unique_key_parts= 
mi_uint2korr(share->state.header.unique_key_parts); + if (len != MARIA_STATE_INFO_SIZE) + { + DBUG_PRINT("warning", + ("saved_state_info_length: %d state_info_length: %d", + len,MARIA_STATE_INFO_SIZE)); + } + share->state_diff_length=len-MARIA_STATE_INFO_SIZE; + + if (!_ma_state_info_read(disk_cache, &share->state, common_flag)) + goto err; + len= mi_uint2korr(share->state.header.base_info_length); + if (len != MARIA_BASE_INFO_SIZE) + { + DBUG_PRINT("warning",("saved_base_info_length: %d base_info_length: %d", + len,MARIA_BASE_INFO_SIZE)); + } + disk_pos= _ma_base_info_read(disk_cache + base_pos, &share->base); + /* + Check if old version of Aria file. Version 0 has language + stored in header.not_used + */ + file_version= (share->state.header.not_used == 0); + if (file_version == 0) + share->base.language= share->state.header.not_used; + born_transactional= share->base.born_transactional; + + share->state.state_length=base_pos; + /* For newly opened tables we reset the error-has-been-printed flag */ + share->state.changed&= ~STATE_CRASHED_PRINTED; + share->state.org_changed= share->state.changed; + + if (!(open_flags & HA_OPEN_FOR_REPAIR) && + ((share->state.changed & STATE_CRASHED_FLAGS) || + ((open_flags & HA_OPEN_ABORT_IF_CRASHED) && + (my_disable_locking && share->state.open_count)))) + { + DBUG_PRINT("error",("Table is marked as crashed. open_flags: %u " + "changed: %u open_count: %u !locking: %d", + open_flags, share->state.changed, + share->state.open_count, my_disable_locking)); + my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ? + HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE); + goto err; + } + if (share->state.open_count) + share->open_count_not_zero_on_open= 1; + + /* + A transactional table is not usable on this system if: + - share->state.create_trid > trnman_get_max_trid() + - Critical as trid as stored releative to create_trid. 
+ - uuid is different + + STATE_NOT_MOVABLE is reset when a table is zerofilled + (has no LSN's and no trids) + + We can ignore testing uuid if STATE_NOT_MOVABLE is not set, as in this + case the uuid will be set in _ma_mark_file_changed(). + */ + if (born_transactional && + ((share->state.create_trid > max_trid_in_system() && + !maria_in_recovery) || + ((share->state.changed & STATE_NOT_MOVABLE) && + ((!(open_flags & HA_OPEN_IGNORE_MOVED_STATE) && + memcmp(share->base.uuid, maria_uuid, MY_UUID_SIZE)))) || + ((share->state.changed & (STATE_MOVED | STATE_NOT_ZEROFILLED)) == + (STATE_MOVED | STATE_NOT_ZEROFILLED)))) + { + DBUG_PRINT("warning", ("table is moved from another system. uuid_diff: %d create_trid: %lu max_trid: %lu moved: %d", + memcmp(share->base.uuid, maria_uuid, + MY_UUID_SIZE) != 0, + (ulong) share->state.create_trid, + (ulong) trnman_get_max_trid(), + MY_TEST((share->state.changed & STATE_MOVED)))); + if (open_flags & HA_OPEN_FOR_REPAIR) + share->state.changed|= STATE_MOVED; + else + { + my_errno= HA_ERR_OLD_FILE; + goto err; + } + } + + /* sanity check */ + if (share->base.keystart > 65535 || share->base.rec_reflength > 8) + { + _ma_set_fatal_error_with_share(share, HA_ERR_CRASHED); + goto err; + } + + key_parts+=fulltext_keys*FT_SEGS; + if (share->base.max_key_length > _ma_max_key_length() || + keys > MARIA_MAX_KEY || key_parts > MARIA_MAX_KEY * HA_MAX_KEY_SEG) + { + DBUG_PRINT("error",("Wrong key info: Max_key_length: %d keys: %d key_parts: %d", share->base.max_key_length, keys, key_parts)); + my_errno=HA_ERR_UNSUPPORTED; + goto err; + } + + /* Ensure we have space in the key buffer for transaction id's */ + if (born_transactional) + share->base.max_key_length= ALIGN_SIZE(share->base.max_key_length + + MARIA_MAX_PACK_TRANSID_SIZE); + + /* + If page cache is not initialized, then assume we will create the + page_cache after the table is opened! + This is only used by maria_check to allow it to check/repair tables + with different block sizes. 
+ */ + if (share->base.block_size != maria_block_size && + share_buff.pagecache->inited != 0) + { + DBUG_PRINT("error", ("Wrong block size %u; Expected %u", + (uint) share->base.block_size, + (uint) maria_block_size)); + my_errno=HA_ERR_UNSUPPORTED; + my_printf_error(my_errno, "Wrong block size %u; Expected %u", + MYF(0), + (uint) share->base.block_size, + (uint) maria_block_size); + goto err; + } + + /* Correct max_file_length based on length of sizeof(off_t) */ + max_data_file_length= + (share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) ? + (((ulonglong) 1 << (share->base.rec_reflength*8))-1) : + (_ma_safe_mul(share->base.pack_reclength, + (ulonglong) 1 << (share->base.rec_reflength*8))-1); + + max_key_file_length= + _ma_safe_mul(share->base.block_size, + ((ulonglong) 1 << (share->base.key_reflength*8))-1); +#if SIZEOF_OFF_T == 4 + set_if_smaller(max_data_file_length, INT_MAX32); + set_if_smaller(max_key_file_length, INT_MAX32); +#endif + /* For internal temporary tables, max_data_file_length is already set */ + if (!internal_table || !share->base.max_data_file_length) + share->base.max_data_file_length=(my_off_t) max_data_file_length; + DBUG_ASSERT(share->base.max_data_file_length); + share->base.max_key_file_length=(my_off_t) max_key_file_length; + + if (share->options & HA_OPTION_COMPRESS_RECORD) + share->base.max_key_length+=2; /* For safety */ + /* Add space for node pointer */ + share->base.max_key_length+= share->base.key_reflength; + + share->unique_file_name.length= strlen(name_buff); + share->index_file_name.length= strlen(index_name); + share->data_file_name.length= strlen(data_name); + share->open_file_name.length= strlen(name); + if (!my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME | common_flag), + &share,sizeof(*share), + &rec_per_key_part, sizeof(double) * key_parts, + &nulls_per_key_part, sizeof(long)* key_parts, + &share->keyinfo,keys*sizeof(MARIA_KEYDEF), + &share->uniqueinfo,uniques*sizeof(MARIA_UNIQUEDEF), + 
&share->keyparts, + (key_parts+unique_key_parts+keys+uniques) * + sizeof(HA_KEYSEG), + &share->columndef, + (share->base.fields+1)*sizeof(MARIA_COLUMNDEF), + &share->column_nr, share->base.fields*sizeof(uint16), + &share->blobs,sizeof(MARIA_BLOB)*share->base.blobs, + &share->unique_file_name.str, + share->unique_file_name.length+1, + &share->index_file_name.str, + share->index_file_name.length+1, + &share->data_file_name.str, + share->data_file_name.length+1, + &share->open_file_name.str, + share->open_file_name.length+1, + &share->state.key_root,keys*sizeof(my_off_t), + &share->mmap_lock,sizeof(mysql_rwlock_t), + NullS)) + goto err; + errpos= 4; + + *share= share_buff; + share->state.rec_per_key_part= rec_per_key_part; + share->state.nulls_per_key_part= nulls_per_key_part; + + memcpy((char*) rec_per_key_part, + (char*) share_buff.state.rec_per_key_part, + sizeof(double)*base_key_parts); + memcpy((char*) nulls_per_key_part, + (char*) share_buff.state.nulls_per_key_part, + sizeof(long)*base_key_parts); + memcpy((char*) share->state.key_root, + (char*) key_root, sizeof(my_off_t)*keys); + strmov(share->unique_file_name.str, name_buff); + strmov(share->index_file_name.str, index_name); + strmov(share->data_file_name.str, data_name); + strmov(share->open_file_name.str, name); + + share->block_size= share->base.block_size; /* Convenience */ + share->max_index_block_size= share->block_size - KEYPAGE_CHECKSUM_SIZE; + share->keypage_header= ((born_transactional ? 
+ LSN_STORE_SIZE + TRANSID_SIZE : + 0) + KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE + + KEYPAGE_USED_SIZE); + + if (MY_TEST(share->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED)) + { + share->keypage_header+= ma_crypt_get_index_page_header_space(share); + } + + { + HA_KEYSEG *pos=share->keyparts; + uint32 ftkey_nr= 1; + for (i=0 ; i < keys ; i++) + { + MARIA_KEYDEF *keyinfo= &share->keyinfo[i]; + keyinfo->share= share; + disk_pos=_ma_keydef_read(disk_pos, keyinfo); + keyinfo->key_nr= i; + + /* Calculate length to store a key + nod flag and transaction info */ + keyinfo->max_store_length= (keyinfo->maxlength + + share->base.key_reflength); + if (born_transactional) + keyinfo->max_store_length+= MARIA_INDEX_OVERHEAD_SIZE; + + /* See ma_delete.cc::underflow() */ + if (!(keyinfo->flag & (HA_BINARY_PACK_KEY | HA_PACK_KEY))) + keyinfo->underflow_block_length= keyinfo->block_length/3; + else + { + /* Packed key, ensure we don't get overflow in underflow() */ + keyinfo->underflow_block_length= + MY_MAX((int) (share->max_index_block_size - keyinfo->maxlength * 3), + (int) (share->keypage_header + share->base.key_reflength)); + set_if_smaller(keyinfo->underflow_block_length, + keyinfo->block_length/3); + } + + disk_pos_assert(share, + disk_pos + keyinfo->keysegs * HA_KEYSEG_SIZE, + end_pos); + if (keyinfo->key_alg == HA_KEY_ALG_RTREE) + share->have_rtree= 1; + keyinfo->seg=pos; + for (j=0 ; j < keyinfo->keysegs; j++,pos++) + { + disk_pos=_ma_keyseg_read(disk_pos, pos); + if (pos->type == HA_KEYTYPE_TEXT || + pos->type == HA_KEYTYPE_VARTEXT1 || + pos->type == HA_KEYTYPE_VARTEXT2) + { + if (!pos->language) + pos->charset=default_charset_info; + else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME)))) + { + my_errno=HA_ERR_UNKNOWN_CHARSET; + goto err; + } + } + else if (pos->type == HA_KEYTYPE_BINARY) + pos->charset= &my_charset_bin; + } + if (keyinfo->flag & HA_SPATIAL) + { +#ifdef HAVE_SPATIAL + uint sp_segs=SPDIMS*2; + keyinfo->seg=pos-sp_segs; + keyinfo->keysegs--; 
+ versioning= 0; +#else + my_errno=HA_ERR_UNSUPPORTED; + goto err; +#endif + } + else if (keyinfo->flag & HA_FULLTEXT) + { + versioning= 0; + DBUG_ASSERT(fulltext_keys); + { + uint k; + keyinfo->seg=pos; + for (k=0; k < FT_SEGS; k++) + { + *pos= ft_keysegs[k]; + pos[0].language= pos[-1].language; + if (!(pos[0].charset= pos[-1].charset)) + { + _ma_set_fatal_error_with_share(share, HA_ERR_CRASHED); + goto err; + } + pos++; + } + } + if (!share->ft2_keyinfo.seg) + { + memcpy(&share->ft2_keyinfo, keyinfo, sizeof(MARIA_KEYDEF)); + share->ft2_keyinfo.keysegs=1; + share->ft2_keyinfo.flag=0; + share->ft2_keyinfo.keylength= + share->ft2_keyinfo.minlength= + share->ft2_keyinfo.maxlength=HA_FT_WLEN+share->base.rec_reflength; + share->ft2_keyinfo.seg=pos-1; + share->ft2_keyinfo.end=pos; + setup_key_functions(& share->ft2_keyinfo); + } + keyinfo->ftkey_nr= ftkey_nr++; + } + setup_key_functions(keyinfo); + keyinfo->end=pos; + pos->type=HA_KEYTYPE_END; /* End */ + pos->length=share->base.rec_reflength; + pos->null_bit=0; + pos->flag=0; /* For purify */ + pos++; + } + for (i=0 ; i < uniques ; i++) + { + disk_pos=_ma_uniquedef_read(disk_pos, &share->uniqueinfo[i]); + disk_pos_assert(share, + disk_pos + share->uniqueinfo[i].keysegs * + HA_KEYSEG_SIZE, end_pos); + share->uniqueinfo[i].seg=pos; + for (j=0 ; j < share->uniqueinfo[i].keysegs; j++,pos++) + { + disk_pos=_ma_keyseg_read(disk_pos, pos); + if (pos->type == HA_KEYTYPE_TEXT || + pos->type == HA_KEYTYPE_VARTEXT1 || + pos->type == HA_KEYTYPE_VARTEXT2) + { + if (!pos->language) + pos->charset=default_charset_info; + else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME)))) + { + my_errno=HA_ERR_UNKNOWN_CHARSET; + goto err; + } + } + } + share->uniqueinfo[i].end=pos; + pos->type=HA_KEYTYPE_END; /* End */ + pos->null_bit=0; + pos->flag=0; + pos++; + } + share->ftkeys= ftkey_nr; + } + share->data_file_type= share->state.header.data_file_type; + share->base_length= (BASE_ROW_HEADER_SIZE + + share->base.is_nulls_extended + + 
share->base.null_bytes + + share->base.pack_bytes + + MY_TEST(share->options & HA_OPTION_CHECKSUM)); + share->kfile.file= kfile; + + if (open_flags & HA_OPEN_COPY) + { + /* + this instance will be a temporary one used just to create a data + file for REPAIR. Don't do logging. This base information will not go + to disk. + */ + born_transactional= FALSE; + } + if (born_transactional) + { + share->page_type= PAGECACHE_LSN_PAGE; + if (share->state.create_rename_lsn == LSN_NEEDS_NEW_STATE_LSNS) + { + /* + Was repaired with maria_chk, maybe later maria_pack-ed. Some sort of + import into the server. It starts its existence (from the point of + view of the server, including server's recovery) now. + */ + if (((open_flags & HA_OPEN_FROM_SQL_LAYER) && + (share->state.changed & STATE_NOT_MOVABLE)) || maria_in_recovery) + _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE, + trnman_get_min_safe_trid(), TRUE, TRUE); + } + else if ((!LSN_VALID(share->state.create_rename_lsn) || + !LSN_VALID(share->state.is_of_horizon) || + (cmp_translog_addr(share->state.create_rename_lsn, + share->state.is_of_horizon) > 0) || + !LSN_VALID(share->state.skip_redo_lsn) || + (cmp_translog_addr(share->state.create_rename_lsn, + share->state.skip_redo_lsn) > 0))) + { + if (!(open_flags & HA_OPEN_FOR_REPAIR)) + { + /* + If in Recovery, it will not work. If LSN is invalid and not + LSN_NEEDS_NEW_STATE_LSNS, header must be corrupted. + In both cases, must repair. + */ + my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ? + HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE); + goto err; + } + else + { + /* + Open in repair mode. 
Ensure that we mark the table crashed, so + that we run auto_repair on it + */ + maria_mark_crashed_share(share); + } + } + else if (!(open_flags & HA_OPEN_FOR_REPAIR)) + { + /* create_rename_lsn != LSN_NEEDS_NEW_STATE_LSNS */ + share->state.changed|= STATE_NOT_MOVABLE; + } + } + else + share->page_type= PAGECACHE_PLAIN_PAGE; + share->now_transactional= born_transactional; + + /* Use pack_reclength as we don't want to modify base.pack_recklength */ + if (share->state.header.org_data_file_type == DYNAMIC_RECORD) + { + /* add bits used to pack data to pack_reclength for faster allocation */ + share->base.pack_reclength+= share->base.pack_bytes; + share->base.extra_rec_buff_size= + (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER) + MARIA_SPLIT_LENGTH + + MARIA_REC_BUFF_OFFSET); + } + if (share->data_file_type == COMPRESSED_RECORD) + { + /* Need some extra bytes for decode_bytes */ + share->base.extra_rec_buff_size+= 7; + } + share->base.default_rec_buff_size= MY_MAX(share->base.pack_reclength + + share->base.extra_rec_buff_size, + share->base.max_key_length); + + disk_pos_assert(share, + disk_pos + share->base.fields *MARIA_COLUMNDEF_SIZE, + end_pos); + for (i= j= 0 ; i < share->base.fields ; i++) + { + disk_pos=_ma_columndef_read(disk_pos,&share->columndef[i]); + share->columndef[i].pack_type=0; + share->columndef[i].huff_tree=0; + if (share->columndef[i].type == FIELD_BLOB) + { + share->blobs[j].pack_length= + share->columndef[i].length-portable_sizeof_char_ptr; + share->blobs[j].offset= share->columndef[i].offset; + j++; + } + if (share->columndef[i].type == FIELD_VARCHAR) + share->has_varchar_fields= 1; + if (share->columndef[i].null_bit) + share->has_null_fields= 1; + } + share->columndef[i].type= FIELD_LAST; /* End marker */ + disk_pos= _ma_column_nr_read(disk_pos, share->column_nr, + share->base.fields); + + if (MY_TEST(share->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED)) + { + if (!(disk_pos= ma_crypt_read(share, disk_pos, + MY_TEST(open_flags & 
HA_OPEN_FOR_DROP)))) + goto err; + } + + if ((share->data_file_type == BLOCK_RECORD || + share->data_file_type == COMPRESSED_RECORD)) + { + if (!s3) + { + if (_ma_open_datafile(&info, share)) + goto err; + data_file= info.dfile.file; + } +#ifdef WITH_S3_STORAGE_ENGINE + else + data_file= info.dfile.file= s3f.unique_file_number(); +#endif /* WITH_S3_STORAGE_ENGINE */ + } + errpos= 5; + + if (open_flags & HA_OPEN_DELAY_KEY_WRITE) + share->options|= HA_OPTION_DELAY_KEY_WRITE; + if (mode == O_RDONLY) + share->options|= HA_OPTION_READ_ONLY_DATA; + share->is_log_table= FALSE; + + if (open_flags & HA_OPEN_TMP_TABLE || share->options & HA_OPTION_TMP_TABLE) + { + common_flag|= MY_THREAD_SPECIFIC; + share->options|= HA_OPTION_TMP_TABLE; + share->temporary= share->delay_key_write= 1; + share->write_flag=MYF(MY_NABP); + share->w_locks++; /* We don't have to update status */ + share->tot_locks++; + } + + _ma_set_index_pagecache_callbacks(&share->kfile, share); + share->this_process=(ulong) getpid(); +#ifdef MARIA_EXTERNAL_LOCKING + share->last_process= share->state.process; +#endif + share->base.key_parts=key_parts; + share->base.all_key_parts=key_parts+unique_key_parts; + if (!(share->last_version=share->state.version)) + share->last_version=1; /* Safety */ + share->rec_reflength=share->base.rec_reflength; /* May be changed */ + share->base.margin_key_file_length=(share->base.max_key_file_length - + (keys ? 
MARIA_INDEX_BLOCK_MARGIN * + share->block_size * keys : 0)); + my_free(disk_cache); + my_free(share_buff.state.rec_per_key_part); + disk_cache= 0; + share_buff.state.rec_per_key_part= 0; + + _ma_setup_functions(share); + max_data_file_length= share->base.max_data_file_length; + if ((*share->once_init)(share, info.dfile.file)) + goto err; + errpos= 6; + if (internal_table) + set_if_smaller(share->base.max_data_file_length, + max_data_file_length); + if (share->now_transactional) + { + /* Setup initial state that is visible for all */ + MARIA_STATE_HISTORY_CLOSED *history; + if ((history= (MARIA_STATE_HISTORY_CLOSED *) + my_hash_search(&maria_stored_state, + (uchar*) &share->state.create_rename_lsn, + sizeof(share->state.create_rename_lsn)))) + { + /* + Move history from hash to share. This is safe to do as we + know we are the only one that is using the share. + */ + share->state_history= + _ma_remove_not_visible_states(history->state_history, 0, 0); + history->state_history= 0; + (void) my_hash_delete(&maria_stored_state, (uchar*) history); + DBUG_PRINT("info", ("Reading state history. 
trid: %lu records: %lld", + (ulong) share->state_history->trid, + share->state_history->state.records)); + } + else + { + /* Table is not part of any active transaction; Create new history */ + if (!(share->state_history= (MARIA_STATE_HISTORY *) + my_malloc(PSI_INSTRUMENT_ME, sizeof(*share->state_history), + MYF(MY_WME)))) + goto err; + share->state_history->trid= 0; /* Visible by all */ + share->state_history->state= share->state.state; + share->state_history->next= 0; + } + } + errpos= 7; + thr_lock_init(&share->lock); + mysql_mutex_init(key_SHARE_intern_lock, + &share->intern_lock, MY_MUTEX_INIT_FAST); + mysql_mutex_init(key_SHARE_key_del_lock, + &share->key_del_lock, MY_MUTEX_INIT_FAST); + mysql_cond_init(key_SHARE_key_del_cond, &share->key_del_cond, 0); + mysql_mutex_init(key_SHARE_close_lock, + &share->close_lock, MY_MUTEX_INIT_FAST); + for (i=0; i<keys; i++) + mysql_rwlock_init(key_KEYINFO_root_lock, + &share->keyinfo[i].root_lock); + mysql_rwlock_init(key_SHARE_mmap_lock, &share->mmap_lock); + + share->row_is_visible= _ma_row_visible_always; + share->lock.get_status= _ma_reset_update_flag; + share->lock.start_trans= _ma_start_trans; + + if (!thr_lock_inited) + { + /* Probably a single threaded program; Don't use concurrent inserts */ + maria_concurrent_insert=0; + } + else if (maria_concurrent_insert) + { + share->non_transactional_concurrent_insert= + ((share->options & (HA_OPTION_READ_ONLY_DATA | HA_OPTION_TMP_TABLE | + HA_OPTION_COMPRESS_RECORD | + HA_OPTION_TEMP_COMPRESS_RECORD)) || + (open_flags & HA_OPEN_TMP_TABLE) || + share->data_file_type == BLOCK_RECORD || + share->have_rtree) ? 
0 : 1; + if (share->non_transactional_concurrent_insert || + (!share->temporary && share->now_transactional && versioning)) + { + share->lock_key_trees= 1; + if (share->data_file_type == BLOCK_RECORD) + { + DBUG_ASSERT(share->now_transactional); + share->have_versioning= 1; + share->row_is_visible= _ma_row_visible_transactional_table; + share->lock.get_status= _ma_block_get_status; + share->lock.check_status= _ma_block_check_status; + share->lock.start_trans= _ma_block_start_trans; + /* + We can for the moment only allow multiple concurrent inserts + only if there is no auto-increment key. To lift this restriction + we have to: + - Extend statement base replication to support auto-increment + intervalls. + - Fix that we allocate auto-increment in intervals and that + it's properly reset if the interval was not used + */ + share->lock.allow_multiple_concurrent_insert= + share->base.auto_key == 0; + share->lock_restore_status= 0; + } + else + { + share->row_is_visible= _ma_row_visible_non_transactional_table; + share->lock.get_status= _ma_get_status; + share->lock.copy_status= _ma_copy_status; + share->lock.update_status= _ma_update_status; + share->lock.restore_status= _ma_restore_status; + share->lock.check_status= _ma_check_status; + share->lock_restore_status= _ma_restore_status; + } + } + else if (share->now_transactional) + { + DBUG_ASSERT(share->data_file_type == BLOCK_RECORD); + share->lock.start_trans= _ma_block_start_trans_no_versioning; + } + } +#ifdef SAFE_MUTEX + if (share->data_file_type == BLOCK_RECORD) + { + /* + We must have internal_lock before bitmap_lock because we call + _ma_flush_table_files() with internal_lock locked. + */ + mysql_mutex_lock(&share->intern_lock); + mysql_mutex_lock(&share->bitmap.bitmap_lock); + mysql_mutex_unlock(&share->bitmap.bitmap_lock); + mysql_mutex_unlock(&share->intern_lock); + } +#endif + /* + Memory mapping can only be requested after initializing intern_lock. 
+ */ + if (open_flags & HA_OPEN_MMAP) + { + info.s= share; + maria_extra(&info, HA_EXTRA_MMAP, 0); + } +#ifdef WITH_S3_STORAGE_ENGINE + if (s3_client) + { + size_t block_size= share->base.s3_block_size; + s3f.set_option(s3_client, MS3_OPT_BUFFER_CHUNK_SIZE, &block_size); + } +#endif /* WITH_S3_STORAGE_ENGINE */ + } + else + { + share= old_info->s; + if (share->data_file_type == BLOCK_RECORD) + data_file= share->bitmap.file.file; /* Only opened once */ + } + +#ifdef WITH_S3_STORAGE_ENGINE + if (index_header.alloc_ptr) + s3f.free(&index_header); +#endif /* WITH_S3_STORAGE_ENGINE */ + + if (!(m_info= maria_clone_internal(share, mode, data_file, + internal_table, s3_client))) + goto err; + + if (maria_is_crashed(m_info)) + DBUG_PRINT("warning", ("table is crashed: changed: %u", + share->state.changed)); + + if (!internal_table) + mysql_mutex_unlock(&THR_LOCK_maria); + + m_info->open_flags= open_flags; + m_info->stack_end_ptr= &my_thread_var->stack_ends_here; + DBUG_PRINT("exit", ("table: %p name: %s",m_info, name)); + DBUG_RETURN(m_info); + +err: + DBUG_PRINT("error", ("error: %d errpos: %d", my_errno, errpos)); + save_errno=my_errno ? 
my_errno : HA_ERR_END_OF_FILE;
+  if ((save_errno == HA_ERR_CRASHED) ||
+      (save_errno == HA_ERR_CRASHED_ON_USAGE) ||
+      (save_errno == HA_ERR_CRASHED_ON_REPAIR))
+  {
+    LEX_STRING tmp_name;
+    tmp_name.str= (char*) name;
+    tmp_name.length= strlen(name);
+    _ma_report_error(save_errno, &tmp_name, MYF(ME_ERROR_LOG));
+  }
+  switch (errpos) {     /* Free what was set up; errpos tells how far we got */
+  case 7:
+    thr_lock_delete(&share->lock);
+    /* fall through */
+  case 6:
+    /* Avoid mutex test in _ma_bitmap_end() */
+    share->internal_table= 1;
+    (*share->once_end)(share);
+    /* fall through */
+  case 5:
+    if (data_file >= 0 && !s3_client)
+      mysql_file_close(data_file, MYF(0));
+    if (old_info)
+      break;                                    /* Don't remove open table */
+    /* fall through */
+  case 4:
+    ma_crypt_free(share);
+    my_free(share);
+    /* fall through */
+  case 3:
+    my_free(disk_cache);
+    my_free(share_buff.state.rec_per_key_part);
+    /* fall through */
+  case 1:
+    if (!s3)
+      mysql_file_close(kfile,MYF(0));
+    my_free(share_s3);
+    /* fall through */
+  case 0:
+  default:
+    break;
+  }
+#ifdef WITH_S3_STORAGE_ENGINE
+  if (s3_client)
+    s3f.deinit(s3_client);
+  if (index_header.alloc_ptr)
+    s3f.free(&index_header);
+#endif /* WITH_S3_STORAGE_ENGINE */
+  if (!internal_table)
+    mysql_mutex_unlock(&THR_LOCK_maria);
+  my_errno= save_errno;
+  DBUG_RETURN (NULL);
+} /* maria_open */
+
+
+/*
+  Reallocate a buffer, if the current buffer is not large enough
+
+  SYNOPSIS
+    _ma_alloc_buffer()
+    old_addr    In/out: address of the buffer pointer. Updated if the buffer
+                is reallocated.
+    old_size    In/out: current size of the buffer. Set to new_size if the
+                buffer is reallocated.
+    new_size    Wanted size
+    flag        Flags for my_realloc(); MY_ALLOW_ZERO_PTR is always added
+
+  NOTES
+    Nothing is done if *old_size >= new_size; the buffer is never shrunk.
+    If my_realloc() fails, *old_addr and *old_size are left unchanged.
+
+  RETURN
+    0  ok
+    1  my_realloc() failed
+*/
+
+my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size,
+                         size_t new_size, myf flag)
+{
+  if (*old_size < new_size)
+  {
+    uchar *addr;
+    if (!(addr= (uchar*) my_realloc(PSI_INSTRUMENT_ME, *old_addr, new_size,
+                                    MYF(MY_ALLOW_ZERO_PTR | flag))))
+      return 1;
+    *old_addr= addr;
+    *old_size= new_size;
+  }
+  return 0;
+}
+
+/*
+  Multiply two ulonglong values without wrapping around
+
+  RETURN
+    a*b if the product fits in an ulonglong, otherwise the biggest
+    ulonglong value. The biggest value is also returned if a is 0.
+*/
+ulonglong _ma_safe_mul(ulonglong a, ulonglong b)
+{
+  ulonglong max_val= ~ (ulonglong) 0;		/* my_off_t is unsigned */
+
+  if (!a || max_val / a < b)                    /* !a: no division by 0 */
+    return max_val;
+  return a*b;
+}
+
+/*
+  Set up functions in structs
+
+  Defaults are first set for once_init/once_end, init/end, the scan_xxx
+  functions, write_record_init/write_record_abort and the
+  keypos <-> recpos converters. The switch on share->data_file_type then
+  sets the functions of the row format. Last file_read/file_write are set
+  to _ma_nommap_pread()/_ma_nommap_pwrite(), calc_check_checksum gets a
+  copy of calc_checksum, and calc_checksum and calc_write_checksum are
+  reset to 0 if the table doesn't have HA_OPTION_CHECKSUM and isn't a
+  COMPRESSED_RECORD table.
+*/
+
+void _ma_setup_functions(register MARIA_SHARE *share)
+{
+
share->once_init= maria_once_init_dummy;  /* Defaults; switch below overrides */
+  share->once_end= maria_once_end_dummy;
+  share->init= maria_scan_init_dummy;
+  share->end= maria_scan_end_dummy;
+  share->scan_init= maria_scan_init_dummy;/* Compat. dummy function */
+  share->scan_end= maria_scan_end_dummy;/* Compat. dummy function */
+  share->scan_remember_pos= _ma_def_scan_remember_pos;
+  share->scan_restore_pos= _ma_def_scan_restore_pos;
+
+  share->write_record_init= _ma_write_init_default;
+  share->write_record_abort= _ma_write_abort_default;
+  share->keypos_to_recpos= _ma_transparent_recpos;
+  share->recpos_to_keypos= _ma_transparent_recpos;
+
+  switch (share->data_file_type) {
+  case COMPRESSED_RECORD:       /* No write/update/delete functions are set */
+    share->read_record= _ma_read_pack_record;
+    share->scan= _ma_read_rnd_pack_record;
+    share->once_init= _ma_once_init_pack_row;
+    share->once_end= _ma_once_end_pack_row;
+    /*
+      Calculate checksum according to data in the original, not compressed,
+      row.
+    */
+    if (share->state.header.org_data_file_type == STATIC_RECORD &&
+        ! (share->options & HA_OPTION_NULL_FIELDS))
+      share->calc_checksum= _ma_static_checksum;
+    else
+      share->calc_checksum= _ma_checksum;
+    share->calc_write_checksum= share->calc_checksum;
+    break;
+  case DYNAMIC_RECORD:
+    share->read_record= _ma_read_dynamic_record;
+    share->scan= _ma_read_rnd_dynamic_record;
+    share->delete_record= _ma_delete_dynamic_record;
+    share->compare_record= _ma_cmp_dynamic_record;
+    share->compare_unique= _ma_cmp_dynamic_unique;
+    share->calc_checksum= share->calc_write_checksum= _ma_checksum;
+    if (share->base.blobs)      /* Tables with blobs have own write/update */
+    {
+      share->update_record= _ma_update_blob_record;
+      share->write_record= _ma_write_blob_record;
+    }
+    else
+    {
+      share->write_record= _ma_write_dynamic_record;
+      share->update_record= _ma_update_dynamic_record;
+    }
+    break;
+  case STATIC_RECORD:
+    share->read_record= _ma_read_static_record;
+    share->scan= _ma_read_rnd_static_record;
+    share->delete_record= _ma_delete_static_record;
+    share->compare_record= _ma_cmp_static_record;
+    share->update_record= _ma_update_static_record;
+    share->write_record= _ma_write_static_record;
+    share->compare_unique= _ma_cmp_static_unique;
+    share->keypos_to_recpos= _ma_static_keypos_to_recpos;
+    share->recpos_to_keypos= _ma_static_recpos_to_keypos;
+    if (share->state.header.org_data_file_type == STATIC_RECORD &&
+        ! (share->options & HA_OPTION_NULL_FIELDS))
+      share->calc_checksum= _ma_static_checksum;
+    else
+      share->calc_checksum= _ma_checksum;
+    break;      /* NOTE(review): calc_write_checksum is not set in this case,
+                   unlike for COMPRESSED_RECORD, DYNAMIC_RECORD and
+                   BLOCK_RECORD. Confirm that this is intended. */
+  case NO_RECORD:
+    share->read_record= _ma_read_no_record;
+    share->scan= _ma_read_rnd_no_record;
+    share->delete_record= _ma_delete_no_record;
+    share->update_record= _ma_update_no_record;
+    share->write_record= _ma_write_no_record;
+    share->recpos_to_keypos= _ma_no_keypos_to_recpos; /* Same function is */
+    share->keypos_to_recpos= _ma_no_keypos_to_recpos; /* used both ways */
+
+    /* Abort if following functions are called */
+    share->compare_record= 0;
+    share->compare_unique= 0;
+    share->calc_checksum= 0;
+    break;
+  case BLOCK_RECORD:            /* Replaces all of the defaults set above */
+    share->once_init= _ma_once_init_block_record;
+    share->once_end= _ma_once_end_block_record;
+    share->init= _ma_init_block_record;
+    share->end= _ma_end_block_record;
+    share->write_record_init= _ma_write_init_block_record;
+    share->write_record_abort= _ma_write_abort_block_record;
+    share->scan_init= _ma_scan_init_block_record;
+    share->scan_end= _ma_scan_end_block_record;
+    share->scan= _ma_scan_block_record;
+    share->scan_remember_pos= _ma_scan_remember_block_record;
+    share->scan_restore_pos= _ma_scan_restore_block_record;
+    share->read_record= _ma_read_block_record;
+    share->delete_record= _ma_delete_block_record;
+    share->compare_record= _ma_compare_block_record;
+    share->update_record= _ma_update_block_record;
+    share->write_record= _ma_write_block_record;
+    share->compare_unique= _ma_cmp_block_unique;
+    share->calc_checksum= _ma_checksum;
+    share->keypos_to_recpos= _ma_transaction_keypos_to_recpos;
+    share->recpos_to_keypos= _ma_transaction_recpos_to_keypos;
+
+    /*
+      write_block_record() will calculate the checksum; Tell maria_write()
+      that it doesn't have to do this.
+ */ + share->calc_write_checksum= 0; + break; + } + share->file_read= _ma_nommap_pread; + share->file_write= _ma_nommap_pwrite; + share->calc_check_checksum= share->calc_checksum; + + if (!(share->options & HA_OPTION_CHECKSUM) && + share->data_file_type != COMPRESSED_RECORD) + share->calc_checksum= share->calc_write_checksum= 0; + return; +} + + +static void setup_key_functions(register MARIA_KEYDEF *keyinfo) +{ + if (keyinfo->key_alg == HA_KEY_ALG_RTREE) + { +#ifdef HAVE_RTREE_KEYS + keyinfo->ck_insert = maria_rtree_insert; + keyinfo->ck_delete = maria_rtree_delete; +#else + DBUG_ASSERT(0); /* maria_open should check it never happens */ +#endif + } + else + { + keyinfo->ck_insert = _ma_ck_write; + keyinfo->ck_delete = _ma_ck_delete; + } + if (keyinfo->flag & HA_SPATIAL) + keyinfo->make_key= _ma_sp_make_key; + else + keyinfo->make_key= _ma_make_key; + + if (keyinfo->flag & HA_BINARY_PACK_KEY) + { /* Simple prefix compression */ + keyinfo->bin_search= _ma_seq_search; + keyinfo->get_key= _ma_get_binary_pack_key; + keyinfo->skip_key= _ma_skip_binary_pack_key; + keyinfo->pack_key= _ma_calc_bin_pack_key_length; + keyinfo->store_key= _ma_store_bin_pack_key; + } + else if (keyinfo->flag & HA_VAR_LENGTH_KEY) + { + keyinfo->get_key= _ma_get_pack_key; + keyinfo->skip_key= _ma_skip_pack_key; + if (keyinfo->seg[0].flag & HA_PACK_KEY) + { /* Prefix compression */ + /* + _ma_prefix_search() compares end-space against ASCII blank (' '). + It cannot be used for character sets, that do not encode the + blank character like ASCII does. UCS2 is an example. All + character sets with a fixed width > 1 or a mimimum width > 1 + cannot represent blank like ASCII does. In these cases we have + to use _ma_seq_search() for the search. 
+ */ + if (!keyinfo->seg->charset || use_strnxfrm(keyinfo->seg->charset) || + (keyinfo->seg->flag & HA_NULL_PART) || + keyinfo->seg->charset->mbminlen > 1) + keyinfo->bin_search= _ma_seq_search; + else + keyinfo->bin_search= _ma_prefix_search; + keyinfo->pack_key= _ma_calc_var_pack_key_length; + keyinfo->store_key= _ma_store_var_pack_key; + } + else + { + keyinfo->bin_search= _ma_seq_search; + keyinfo->pack_key= _ma_calc_var_key_length; /* Variable length key */ + keyinfo->store_key= _ma_store_static_key; + } + } + else + { + keyinfo->bin_search= _ma_bin_search; + keyinfo->get_key= _ma_get_static_key; + keyinfo->skip_key= _ma_skip_static_key; + keyinfo->pack_key= _ma_calc_static_key_length; + keyinfo->store_key= _ma_store_static_key; + } + + /* set keyinfo->write_comp_flag */ + if (keyinfo->flag & HA_SORT_ALLOWS_SAME) + keyinfo->write_comp_flag=SEARCH_BIGGER; /* Put after same key */ + else if (keyinfo->flag & ( HA_NOSAME | HA_FULLTEXT)) + { + keyinfo->write_comp_flag= SEARCH_FIND | SEARCH_UPDATE; /* No duplicates */ + if (keyinfo->flag & HA_NULL_ARE_EQUAL) + keyinfo->write_comp_flag|= SEARCH_NULL_ARE_EQUAL; + } + else + keyinfo->write_comp_flag= SEARCH_SAME; /* Keys in rec-pos order */ + keyinfo->write_comp_flag|= SEARCH_INSERT; + return; +} + + +/** + @brief Function to save and store the header in the index file (.MAI) + + Operates under MARIA_SHARE::intern_lock if requested. + Sets MARIA_SHARE::MARIA_STATE_INFO::is_of_horizon if transactional table. + Then calls _ma_state_info_write_sub(). + + @param share table + @param pWrite bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET) + is set my_pwrite() is used otherwise my_write(); + if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info + about keys is written (should only be needed + after ALTER TABLE ENABLE/DISABLE KEYS, and + REPAIR/OPTIMIZE); if 4 (MA_STATE_INFO_WRITE_LOCK) + is set, MARIA_SHARE::intern_lock is taken. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite) +{ + uint res; + if (share->options & HA_OPTION_READ_ONLY_DATA) + return 0; + + if (pWrite & MA_STATE_INFO_WRITE_LOCK) + mysql_mutex_lock(&share->intern_lock); + else if (maria_multi_threaded && !share->temporary) + mysql_mutex_assert_owner(&share->intern_lock); + if (share->base.born_transactional && translog_status == TRANSLOG_OK && + !maria_in_recovery) + { + /* + In a recovery, we want to set is_of_horizon to the LSN of the last + record executed by Recovery, not the current EOF of the log (which + is too new). Recovery does it by itself. + */ + share->state.is_of_horizon= translog_get_horizon(); + DBUG_PRINT("info", ("is_of_horizon set to LSN " LSN_FMT "", + LSN_IN_PARTS(share->state.is_of_horizon))); + } + res= _ma_state_info_write_sub(share->kfile.file, &share->state, pWrite); + if (pWrite & MA_STATE_INFO_WRITE_LOCK) + mysql_mutex_unlock(&share->intern_lock); + /* If open_count != 0 we have to write the state again at close */ + share->changed= share->state.open_count != 0; + return res; +} + + +/** + @brief Function to save and store the header in the index file (.MYI). + + Shortcut to use instead of _ma_state_info_write() when appropriate. + + @param file descriptor of the index file to write + @param state state information to write to the file + @param pWrite bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET) + is set my_pwrite() is used otherwise my_write(); + if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info + about keys is written (should only be needed + after ALTER TABLE ENABLE/DISABLE KEYS, and + REPAIR/OPTIMIZE). + + @notes + For transactional multiuser tables, this function is called + with intern_lock & translog_lock or when the last thread who + is using the table is closing it. + Because of the translog_lock we don't need to have a lock on + key_del_lock. 
+ + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite) +{ + uchar buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE]; + uchar *ptr=buff; + uint i, keys= (uint) state->header.keys; + size_t res; + DBUG_ENTER("_ma_state_info_write_sub"); + DBUG_PRINT("info", ("Records: %lld", state->state.records)); + + memcpy(ptr,&state->header,sizeof(state->header)); + ptr+=sizeof(state->header); + + /* open_count must be first because of _ma_mark_file_changed ! */ + mi_int2store(ptr,state->open_count); ptr+= 2; + /* changed must be second, because of _ma_mark_file_crashed */ + mi_int2store(ptr,state->changed); ptr+= 2; + + /* + If you change the offset of these LSNs, note that some functions do a + direct write of them without going through this function. + */ + lsn_store(ptr, state->create_rename_lsn); ptr+= LSN_STORE_SIZE; + lsn_store(ptr, state->is_of_horizon); ptr+= LSN_STORE_SIZE; + lsn_store(ptr, state->skip_redo_lsn); ptr+= LSN_STORE_SIZE; + mi_rowstore(ptr,state->state.records); ptr+= 8; + mi_rowstore(ptr,state->state.del); ptr+= 8; + mi_rowstore(ptr,state->split); ptr+= 8; + mi_sizestore(ptr,state->dellink); ptr+= 8; + mi_sizestore(ptr,state->first_bitmap_with_space); ptr+= 8; + mi_sizestore(ptr,state->state.key_file_length); ptr+= 8; + mi_sizestore(ptr,state->state.data_file_length); ptr+= 8; + mi_sizestore(ptr,state->state.empty); ptr+= 8; + mi_sizestore(ptr,state->state.key_empty); ptr+= 8; + mi_int8store(ptr,state->auto_increment); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->state.checksum); ptr+= 8; + mi_int8store(ptr,state->create_trid); ptr+= 8; + mi_int4store(ptr,state->status); ptr+= 4; + mi_int4store(ptr,state->update_count); ptr+= 4; + *ptr++= state->sortkey; + *ptr++= 0; /* Reserved */ + ptr+= state->state_diff_length; + + for (i=0; i < keys; i++) + { + mi_sizestore(ptr,state->key_root[i]); ptr+= 8; + } + mi_sizestore(ptr,state->key_del); ptr+= 8; + if (pWrite & 
MA_STATE_INFO_WRITE_FULL_INFO) /* From maria_chk */ + { + uint key_parts= mi_uint2korr(state->header.key_parts); + mi_int4store(ptr,state->sec_index_changed); ptr+= 4; + mi_int4store(ptr,state->sec_index_used); ptr+= 4; + mi_int4store(ptr,state->version); ptr+= 4; + mi_int8store(ptr,state->key_map); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->create_time); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->recover_time); ptr+= 8; + mi_int8store(ptr,(ulonglong) state->check_time); ptr+= 8; + mi_sizestore(ptr, state->records_at_analyze); ptr+= 8; + /* reserve place for some information per key */ + bzero(ptr, keys*4); ptr+= keys*4; + for (i=0 ; i < key_parts ; i++) + { + float8store(ptr, state->rec_per_key_part[i]); ptr+= 8; + mi_int4store(ptr, state->nulls_per_key_part[i]); ptr+= 4; + } + } + + res= (pWrite & MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET) ? + mysql_file_pwrite(file, buff, (size_t) (ptr-buff), 0L, + MYF(MY_NABP | MY_THREADSAFE)) : + mysql_file_write(file, buff, (size_t) (ptr-buff), + MYF(MY_NABP)); + DBUG_RETURN(res != 0); +} + + +static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state, myf flag) +{ + uint i,keys,key_parts; + DBUG_ENTER("_ma_state_info_read"); + + memcpy(&state->header,ptr, sizeof(state->header)); + ptr+= sizeof(state->header); + keys= (uint) state->header.keys; + key_parts= mi_uint2korr(state->header.key_parts); + + /* Allocate memory for key parts if not already done */ + if (!state->rec_per_key_part && + !my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME | flag), + &state->rec_per_key_part, + sizeof(*state->rec_per_key_part) * key_parts, + &state->nulls_per_key_part, + sizeof(*state->nulls_per_key_part) * key_parts, + NullS)) + DBUG_RETURN(0); + + state->open_count = mi_uint2korr(ptr); ptr+= 2; + state->changed= mi_uint2korr(ptr); ptr+= 2; + state->create_rename_lsn= lsn_korr(ptr); ptr+= LSN_STORE_SIZE; + state->is_of_horizon= lsn_korr(ptr); ptr+= LSN_STORE_SIZE; + state->skip_redo_lsn= lsn_korr(ptr); ptr+= LSN_STORE_SIZE; + 
state->state.records= mi_rowkorr(ptr); ptr+= 8; + state->state.del = mi_rowkorr(ptr); ptr+= 8; + state->split = mi_rowkorr(ptr); ptr+= 8; + state->dellink= mi_sizekorr(ptr); ptr+= 8; + state->first_bitmap_with_space= mi_sizekorr(ptr); ptr+= 8; + state->state.key_file_length = mi_sizekorr(ptr); ptr+= 8; + state->state.data_file_length= mi_sizekorr(ptr); ptr+= 8; + state->state.empty = mi_sizekorr(ptr); ptr+= 8; + state->state.key_empty= mi_sizekorr(ptr); ptr+= 8; + state->auto_increment=mi_uint8korr(ptr); ptr+= 8; + state->state.checksum=(ha_checksum) mi_uint8korr(ptr);ptr+= 8; + state->create_trid= mi_uint8korr(ptr); ptr+= 8; + state->status = mi_uint4korr(ptr); ptr+= 4; + state->update_count=mi_uint4korr(ptr); ptr+= 4; + state->sortkey= (uint) *ptr++; + ptr++; /* reserved */ + + ptr+= state->state_diff_length; + + for (i=0; i < keys; i++) + { + state->key_root[i]= mi_sizekorr(ptr); ptr+= 8; + } + state->key_del= mi_sizekorr(ptr); ptr+= 8; + state->sec_index_changed = mi_uint4korr(ptr); ptr+= 4; + state->sec_index_used = mi_uint4korr(ptr); ptr+= 4; + state->version = mi_uint4korr(ptr); ptr+= 4; + state->key_map = mi_uint8korr(ptr); ptr+= 8; + state->create_time = (time_t) mi_sizekorr(ptr); ptr+= 8; + state->recover_time =(time_t) mi_sizekorr(ptr); ptr+= 8; + state->check_time = (time_t) mi_sizekorr(ptr); ptr+= 8; + state->records_at_analyze= mi_sizekorr(ptr); ptr+= 8; + ptr+= keys * 4; /* Skip reserved bytes */ + for (i=0 ; i < key_parts ; i++) + { + float8get(state->rec_per_key_part[i], ptr); ptr+= 8; + state->nulls_per_key_part[i]= mi_uint4korr(ptr); ptr+= 4; + } + + DBUG_PRINT("info", ("Records: %lld", state->state.records)); + DBUG_RETURN(ptr); +} + + +/** + @brief Fills the state by reading its copy on disk. + + Should not be called for transactional tables, as their state on disk is + rarely current and so is often misleading for a reader. + Does nothing in single user mode. 
+ + @param file file to read from + @param state state which will be filled +*/ + +uint _ma_state_info_read_dsk(File file __attribute__((unused)), + MARIA_STATE_INFO *state __attribute__((unused))) +{ +#ifdef MARIA_EXTERNAL_LOCKING + uchar buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE]; + + /* trick to detect transactional tables */ + DBUG_ASSERT(state->create_rename_lsn == LSN_IMPOSSIBLE); + if (!maria_single_user) + { + if (mysql_file_pread(file, buff, state->state_length, 0L, MYF(MY_NABP))) + return 1; + _ma_state_info_read(buff, state); + } +#endif + return 0; +} + + +/**************************************************************************** +** store MARIA_BASE_INFO +****************************************************************************/ + +uint _ma_base_info_write(File file, MARIA_BASE_INFO *base) +{ + uchar buff[MARIA_BASE_INFO_SIZE], *ptr=buff; + + bmove(ptr, maria_uuid, MY_UUID_SIZE); + ptr+= MY_UUID_SIZE; + mi_sizestore(ptr,base->keystart); ptr+= 8; + mi_sizestore(ptr,base->max_data_file_length); ptr+= 8; + mi_sizestore(ptr,base->max_key_file_length); ptr+= 8; + mi_rowstore(ptr,base->records); ptr+= 8; + mi_rowstore(ptr,base->reloc); ptr+= 8; + mi_int4store(ptr,base->mean_row_length); ptr+= 4; + mi_int4store(ptr,base->reclength); ptr+= 4; + mi_int4store(ptr,base->pack_reclength); ptr+= 4; + mi_int4store(ptr,base->min_pack_length); ptr+= 4; + mi_int4store(ptr,base->max_pack_length); ptr+= 4; + mi_int4store(ptr,base->min_block_length); ptr+= 4; + mi_int2store(ptr,base->fields); ptr+= 2; + mi_int2store(ptr,base->fixed_not_null_fields); ptr+= 2; + mi_int2store(ptr,base->fixed_not_null_fields_length); ptr+= 2; + mi_int2store(ptr,base->max_field_lengths); ptr+= 2; + mi_int2store(ptr,base->pack_fields); ptr+= 2; + mi_int2store(ptr,base->extra_options) ptr+= 2; + mi_int2store(ptr,base->null_bytes); ptr+= 2; + mi_int2store(ptr,base->original_null_bytes); ptr+= 2; + mi_int2store(ptr,base->field_offsets); ptr+= 2; + mi_int2store(ptr,base->language); 
ptr+= 2; + mi_int2store(ptr,base->block_size); ptr+= 2; + *ptr++= base->rec_reflength; + *ptr++= base->key_reflength; + *ptr++= base->keys; + *ptr++= base->auto_key; + *ptr++= base->born_transactional; + *ptr++= base->compression_algorithm; + mi_int2store(ptr,base->pack_bytes); ptr+= 2; + mi_int2store(ptr,base->blobs); ptr+= 2; + mi_int2store(ptr,base->max_key_block_length); ptr+= 2; + mi_int2store(ptr,base->max_key_length); ptr+= 2; + mi_int2store(ptr,base->extra_alloc_bytes); ptr+= 2; + *ptr++= base->extra_alloc_procent; + mi_int3store(ptr, base->s3_block_size); ptr+= 3; + bzero(ptr,13); ptr+= 13; /* extra */ + DBUG_ASSERT((ptr - buff) == MARIA_BASE_INFO_SIZE); + return mysql_file_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + + +/*-------------------------------------------------------------------------- + maria_keydef +---------------------------------------------------------------------------*/ + +my_bool _ma_keydef_write(File file, MARIA_KEYDEF *keydef) +{ + uchar buff[MARIA_KEYDEF_SIZE]; + uchar *ptr=buff; + + *ptr++= (uchar) keydef->keysegs; + *ptr++= keydef->key_alg; /* Rtree or Btree */ + mi_int2store(ptr,keydef->flag); ptr+= 2; + mi_int2store(ptr,keydef->block_length); ptr+= 2; + mi_int2store(ptr,keydef->keylength); ptr+= 2; + mi_int2store(ptr,keydef->minlength); ptr+= 2; + mi_int2store(ptr,keydef->maxlength); ptr+= 2; + return mysql_file_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + +uchar *_ma_keydef_read(uchar *ptr, MARIA_KEYDEF *keydef) +{ + keydef->keysegs = (uint) *ptr++; + keydef->key_alg = *ptr++; /* Rtree or Btree */ + + keydef->flag = mi_uint2korr(ptr); ptr+= 2; + keydef->block_length = mi_uint2korr(ptr); ptr+= 2; + keydef->keylength = mi_uint2korr(ptr); ptr+= 2; + keydef->minlength = mi_uint2korr(ptr); ptr+= 2; + keydef->maxlength = mi_uint2korr(ptr); ptr+= 2; + keydef->version = 0; /* Not saved */ + keydef->parser = &ft_default_parser; + keydef->ftkey_nr = 0; + return ptr; +} + 
+/*************************************************************************** +** maria_keyseg +***************************************************************************/ + +my_bool _ma_keyseg_write(File file, const HA_KEYSEG *keyseg) +{ + uchar buff[HA_KEYSEG_SIZE]; + uchar *ptr=buff; + ulong pos; + + *ptr++= keyseg->type; + *ptr++= keyseg->language & 0xFF; /* Collation ID, low byte */ + *ptr++= keyseg->null_bit; + *ptr++= keyseg->bit_start; + *ptr++= keyseg->language >> 8; /* Collation ID, high byte */ + *ptr++= keyseg->bit_length; + mi_int2store(ptr,keyseg->flag); ptr+= 2; + mi_int2store(ptr,keyseg->length); ptr+= 2; + mi_int4store(ptr,keyseg->start); ptr+= 4; + pos= keyseg->null_bit ? keyseg->null_pos : keyseg->bit_pos; + mi_int4store(ptr, pos); + ptr+=4; + + return mysql_file_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + + +uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg) +{ + keyseg->type = *ptr++; + keyseg->language = *ptr++; + keyseg->null_bit = *ptr++; + keyseg->bit_start = *ptr++; + keyseg->language += ((uint16) (*ptr++)) << 8; + keyseg->bit_length = *ptr++; + keyseg->flag = mi_uint2korr(ptr); ptr+= 2; + keyseg->length = mi_uint2korr(ptr); ptr+= 2; + keyseg->start = mi_uint4korr(ptr); ptr+= 4; + keyseg->null_pos = mi_uint4korr(ptr); ptr+= 4; + keyseg->charset=0; /* Will be filled in later */ + if (keyseg->null_bit) + keyseg->bit_pos= (uint16)(keyseg->null_pos + (keyseg->null_bit == 7)); + else + { + keyseg->bit_pos= (uint16)keyseg->null_pos; + keyseg->null_pos= 0; + } + return ptr; +} + +/*-------------------------------------------------------------------------- + maria_uniquedef +---------------------------------------------------------------------------*/ + +my_bool _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *def) +{ + uchar buff[MARIA_UNIQUEDEF_SIZE]; + uchar *ptr=buff; + + mi_int2store(ptr,def->keysegs); ptr+=2; + *ptr++= (uchar) def->key; + *ptr++ = (uchar) def->null_are_equal; + + return mysql_file_write(file, buff, 
(size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + +uchar *_ma_uniquedef_read(uchar *ptr, MARIA_UNIQUEDEF *def) +{ + def->keysegs = mi_uint2korr(ptr); + def->key = ptr[2]; + def->null_are_equal=ptr[3]; + return ptr+4; /* 1 extra uchar */ +} + +/*************************************************************************** +** MARIA_COLUMNDEF +***************************************************************************/ + +my_bool _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef) +{ + uchar buff[MARIA_COLUMNDEF_SIZE]; + uchar *ptr=buff; + uint low_offset= (uint) (columndef->offset & 0xffff); + uint high_offset= (uint) (columndef->offset >> 16); + + mi_int2store(ptr,(ulong) columndef->column_nr); ptr+= 2; + mi_int2store(ptr, low_offset); ptr+= 2; + mi_int2store(ptr,columndef->type); ptr+= 2; + mi_int2store(ptr,columndef->length); ptr+= 2; + mi_int2store(ptr,columndef->fill_length); ptr+= 2; + mi_int2store(ptr,columndef->null_pos); ptr+= 2; + mi_int2store(ptr,columndef->empty_pos); ptr+= 2; + + (*ptr++)= columndef->null_bit; + (*ptr++)= columndef->empty_bit; + mi_int2store(ptr, high_offset); ptr+= 2; + ptr[0]= ptr[1]= 0; ptr+= 2; /* For future */ + return mysql_file_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0; +} + +uchar *_ma_columndef_read(uchar *ptr, MARIA_COLUMNDEF *columndef) +{ + uint high_offset; + columndef->column_nr= mi_uint2korr(ptr); ptr+= 2; + columndef->offset= mi_uint2korr(ptr); ptr+= 2; + columndef->type= mi_sint2korr(ptr); ptr+= 2; + columndef->length= mi_uint2korr(ptr); ptr+= 2; + columndef->fill_length= mi_uint2korr(ptr); ptr+= 2; + columndef->null_pos= mi_uint2korr(ptr); ptr+= 2; + columndef->empty_pos= mi_uint2korr(ptr); ptr+= 2; + columndef->null_bit= (uint8) *ptr++; + columndef->empty_bit= (uint8) *ptr++; + high_offset= mi_uint2korr(ptr); ptr+= 2; + columndef->offset|= ((ulong) high_offset << 16); + ptr+= 2; + return ptr; +} + +my_bool _ma_column_nr_write(File file, uint16 *offsets, uint columns) +{ + uchar *buff, *ptr, *end; + 
size_t size= columns*2; + my_bool res; + + if (!(buff= (uchar*) my_alloca(size))) + return 1; + for (ptr= buff, end= ptr + size; ptr < end ; ptr+= 2, offsets++) + int2store(ptr, *offsets); + res= mysql_file_write(file, buff, size, MYF(MY_NABP)) != 0; + my_afree(buff); + return res; +} + + +uchar *_ma_column_nr_read(uchar *ptr, uint16 *offsets, uint columns) +{ + uchar *end; + size_t size= columns*2; + for (end= ptr + size; ptr < end ; ptr+=2, offsets++) + *offsets= uint2korr(ptr); + return ptr; +} + +/** + @brief Set callbacks for data pages + + @note + We don't use pagecache_file_init here, as we want to keep the + code readable +*/ + +void _ma_set_data_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share) +{ + pagecache_file_set_null_hooks(file); + file->callback_data= (uchar*) share; + file->flush_log_callback= &maria_flush_log_for_page_none; /* Do nothing */ + file->post_write_hook= maria_page_write_failure; + + if (share->temporary) + { + file->post_read_hook= &maria_page_crc_check_none; + file->pre_write_hook= &maria_page_filler_set_none; + } + else + { + file->post_read_hook= &maria_page_crc_check_data; + if (share->options & HA_OPTION_PAGE_CHECKSUM) + file->pre_write_hook= &maria_page_crc_set_normal; + else + file->pre_write_hook= &maria_page_filler_set_normal; + if (share->now_transactional) + file->flush_log_callback= maria_flush_log_for_page; + } + + if (MY_TEST(share->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED)) + { + ma_crypt_set_data_pagecache_callbacks(file, share); + } +} + + +/** + @brief Set callbacks for index pages + + @note + We don't use pagecache_file_init here, as we want to keep the + code readable +*/ + +void _ma_set_index_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share) +{ + pagecache_file_set_null_hooks(file); + file->callback_data= (uchar*) share; + file->flush_log_callback= &maria_flush_log_for_page_none; /* Do nothing */ + file->post_write_hook= maria_page_write_failure; + + if (share->temporary) + { + 
file->post_read_hook= &maria_page_crc_check_none; + file->pre_write_hook= &maria_page_filler_set_none; + } + else + { + file->post_read_hook= &maria_page_crc_check_index; + if (share->options & HA_OPTION_PAGE_CHECKSUM) + file->pre_write_hook= &maria_page_crc_set_index; + else + file->pre_write_hook= &maria_page_filler_set_normal; + + if (share->now_transactional) + file->flush_log_callback= maria_flush_log_for_page; + } + + if (MY_TEST(share->base.extra_options & MA_EXTRA_OPTIONS_ENCRYPTED)) + { + ma_crypt_set_index_pagecache_callbacks(file, share); + } +} + + +/************************************************************************** + Open data file + We can't use dup() here as the data file descriptors need to have different + active seek-positions. +*************************************************************************/ + +int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share) +{ + myf flags= (share->mode & O_NOFOLLOW) ? MY_NOSYMLINKS | MY_WME : MY_WME; + if (share->temporary) + flags|= MY_THREAD_SPECIFIC; + DEBUG_SYNC_C("mi_open_datafile"); + info->dfile.file= share->bitmap.file.file= + mysql_file_open(key_file_dfile, share->data_file_name.str, + share->mode | O_SHARE | O_CLOEXEC, flags); + return info->dfile.file >= 0 ? 0 : 1; +} + + +int _ma_open_keyfile(MARIA_SHARE *share) +{ + /* + Modifications to share->kfile should be under intern_lock to protect + against a concurrent checkpoint. + */ + mysql_mutex_lock(&share->intern_lock); + share->kfile.file= mysql_file_open(key_file_kfile, + share->unique_file_name.str, + share->mode | O_SHARE | O_NOFOLLOW | O_CLOEXEC, + MYF(MY_WME | MY_NOSYMLINKS)); + mysql_mutex_unlock(&share->intern_lock); + return (share->kfile.file < 0); +} + + +/* + Disable all indexes. + + SYNOPSIS + maria_disable_indexes() + info A pointer to the MARIA storage engine MARIA_HA struct. + + DESCRIPTION + Disable all indexes. 
+ + RETURN + 0 ok +*/ + +int maria_disable_indexes(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + + maria_clear_all_keys_active(share->state.key_map); + return 0; +} + + +/* + Enable all indexes + + SYNOPSIS + maria_enable_indexes() + info A pointer to the MARIA storage engine MARIA_HA struct. + + DESCRIPTION + Enable all indexes. The indexes might have been disabled + by maria_disable_index() before. + The function works only if both data and indexes are empty, + otherwise a repair is required. + To be sure, call handler::delete_all_rows() before. + + RETURN + 0 ok + HA_ERR_CRASHED data or index is non-empty. +*/ + +int maria_enable_indexes(MARIA_HA *info) +{ + int error= 0; + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_enable_indexes"); + + if ((share->state.state.data_file_length != + (share->data_file_type == BLOCK_RECORD ? share->block_size : 0)) || + (share->state.state.key_file_length != share->base.keystart)) + { + DBUG_PRINT("error", ("data_file_length: %lu key_file_length: %lu", + (ulong) share->state.state.data_file_length, + (ulong) share->state.state.key_file_length)); + _ma_set_fatal_error(info, HA_ERR_CRASHED); + error= HA_ERR_CRASHED; + } + else + maria_set_all_keys_active(share->state.key_map, share->base.keys); + DBUG_RETURN(error); +} + + +/* + Test if indexes are disabled. + + SYNOPSIS + maria_indexes_are_disabled() + info A pointer to the MARIA storage engine MARIA_HA struct. + + DESCRIPTION + Test if indexes are disabled. + + RETURN + 0 indexes are not disabled + 1 all indexes are disabled + 2 non-unique indexes are disabled +*/ + +int maria_indexes_are_disabled(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + + /* + No keys or all are enabled. keys is the number of keys. Left shifted + gives us only one bit set. When decreased by one, gives us all all bits + up to this one set and it gets unset. 
+ */ + if (!share->base.keys || + (maria_is_all_keys_active(share->state.key_map, share->base.keys))) + return 0; + + /* All are disabled */ + if (maria_is_any_key_active(share->state.key_map)) + return 1; + + /* + We have keys. Some enabled, some disabled. + Don't check for any non-unique disabled but return directly 2 + */ + return 2; +} + + +static my_bool maria_scan_init_dummy(MARIA_HA *info __attribute__((unused))) +{ + return 0; +} + +static void maria_scan_end_dummy(MARIA_HA *info __attribute__((unused))) +{ +} + +static my_bool maria_once_init_dummy(MARIA_SHARE *share + __attribute__((unused)), + File dfile __attribute__((unused))) +{ + return 0; +} + +static my_bool maria_once_end_dummy(MARIA_SHARE *share __attribute__((unused))) +{ + return 0; +} diff --git a/storage/maria/ma_packrec.c b/storage/maria/ma_packrec.c new file mode 100644 index 00000000..19783423 --- /dev/null +++ b/storage/maria/ma_packrec.c @@ -0,0 +1,1733 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2020, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + + /* Functions to compressed records */ + +#include "maria_def.h" + +#define IS_CHAR ((uint) 32768) /* Bit if char (not offset) in tree */ + +/* Some definitions to keep in sync with maria_pack.c */ +#define HEAD_LENGTH 32 /* Length of fixed header */ + +#if INT_MAX > 32767 +#define BITS_SAVED 32 +#define MAX_QUICK_TABLE_BITS 9 /* Because we may shift in 24 bits */ +#else +#define BITS_SAVED 16 +#define MAX_QUICK_TABLE_BITS 6 +#endif + +#define get_bit(BU) ((BU)->bits ? \ + (BU)->current_byte & ((maria_bit_type) 1 << --(BU)->bits) :\ + (fill_buffer(BU), (BU)->bits= BITS_SAVED-1,\ + (BU)->current_byte & ((maria_bit_type) 1 << (BITS_SAVED-1)))) +#define skip_to_next_byte(BU) ((BU)->bits&=~7) +#define get_bits(BU,count) (((BU)->bits >= count) ? (((BU)->current_byte >> ((BU)->bits-=count)) & mask[count]) : fill_and_get_bits(BU,count)) + +#define decode_bytes_test_bit(bit) \ + if (low_byte & (1 << (7-bit))) \ + pos++; \ + if (*pos & IS_CHAR) \ + { bits-=(bit+1); break; } \ + pos+= *pos + +/* + Size in uint16 of a Huffman tree for uchar compression of 256 uchar values +*/ +#define OFFSET_TABLE_SIZE 512 + +static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file, + pbool fix_keys); +static uint read_huff_table(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree, + uint16 **decode_table,uchar **intervall_buff, + uint16 *tmp_buff); +static void make_quick_table(uint16 *to_table,uint16 *decode_table, + uint *next_free,uint value,uint bits, + uint max_bits); +static void fill_quick_table(uint16 *table,uint bits, uint max_bits, + uint value); +static uint copy_decode_table(uint16 *to_pos,uint offset, + uint16 *decode_table); +static uint find_longest_bitstream(uint16 *table, uint16 *end); +static void (*get_unpack_function(MARIA_COLUMNDEF 
*rec))(MARIA_COLUMNDEF *field, + MARIA_BIT_BUFF *buff, + uchar *to, + uchar *end); +static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_skip_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_endspace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_prespace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_space_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_zerofill_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_constant(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_intervall(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to,uchar *end); +static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end); +static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + 
uchar *to,uchar *end); +static uint decode_pos(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree); +static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff,uchar *buffer, + uint length); +static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff,uint count); +static void fill_buffer(MARIA_BIT_BUFF *bit_buff); +static uint max_bit(uint value); +static uint read_pack_length(uint version, const uchar *buf, ulong *length); +#ifdef HAVE_MMAP +static uchar *_ma_mempack_get_block_info(MARIA_HA *maria, + MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, + uchar **rec_buff_p, + size_t *rec_buff_size_p, + uchar *header); +#endif + +static maria_bit_type mask[]= +{ + 0x00000000, + 0x00000001, 0x00000003, 0x00000007, 0x0000000f, + 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, + 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, +#if BITS_SAVED > 16 + 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, + 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, + 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, + 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff, +#endif +}; + + +my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile) +{ + share->options|= HA_OPTION_READ_ONLY_DATA; + return (_ma_read_pack_info(share, dfile, + (pbool) + MY_TEST(!(share->options & + (HA_OPTION_PACK_RECORD | + HA_OPTION_TEMP_COMPRESS_RECORD))))); +} + + +my_bool _ma_once_end_pack_row(MARIA_SHARE *share) +{ + if (share->decode_trees) + { + my_free(share->decode_trees); + my_free(share->decode_tables); + } + return 0; +} + + +/* Read all packed info, allocate memory and fix field structs */ + +static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file, + pbool fix_keys) +{ + int diff_length; + uint i,trees,huff_tree_bits,rec_reflength,length; + uint16 *decode_table,*tmp_buff; + ulong elements,intervall_length; + uchar *disk_cache; + uchar *intervall_buff; + uchar header[HEAD_LENGTH]; + MARIA_BIT_BUFF bit_buff; + DBUG_ENTER("_ma_read_pack_info"); + + if 
(maria_quick_table_bits < 4) + maria_quick_table_bits=4; + else if (maria_quick_table_bits > MAX_QUICK_TABLE_BITS) + maria_quick_table_bits=MAX_QUICK_TABLE_BITS; + + my_errno=0; + if (mysql_file_read(file, header, sizeof(header), MYF(MY_NABP))) + { + if (!my_errno) + my_errno=HA_ERR_END_OF_FILE; + goto err0; + } + /* Only the first three bytes of magic number are independent of version. */ + if (memcmp(header, maria_pack_file_magic, 3)) + { + _ma_set_fatal_error_with_share(share, HA_ERR_WRONG_IN_RECORD); + goto err0; + } + share->pack.version= header[3]; /* fourth uchar of magic number */ + share->pack.header_length= uint4korr(header+4); + share->min_pack_length=(uint) uint4korr(header+8); + share->max_pack_length=(uint) uint4korr(header+12); + set_if_bigger(share->base.default_rec_buff_size, + share->max_pack_length + 7); + elements=uint4korr(header+16); + intervall_length=uint4korr(header+20); + trees=uint2korr(header+24); + share->pack.ref_length=header[26]; + rec_reflength=header[27]; + diff_length=(int) rec_reflength - (int) share->base.rec_reflength; + if (fix_keys) + share->rec_reflength=rec_reflength; + DBUG_PRINT("info", ("fixed header length: %u", HEAD_LENGTH)); + DBUG_PRINT("info", ("total header length: %lu", share->pack.header_length)); + DBUG_PRINT("info", ("pack file version: %u", share->pack.version)); + DBUG_PRINT("info", ("min pack length: %lu", share->min_pack_length)); + DBUG_PRINT("info", ("max pack length: %lu", share->max_pack_length)); + DBUG_PRINT("info", ("elements of all trees: %lu", elements)); + DBUG_PRINT("info", ("distinct values bytes: %lu", intervall_length)); + DBUG_PRINT("info", ("number of code trees: %u", trees)); + DBUG_PRINT("info", ("bytes for record lgt: %u", share->pack.ref_length)); + DBUG_PRINT("info", ("record pointer length: %u", rec_reflength)); + + + /* + Memory segment #1: + - Decode tree heads + - Distinct column values + */ + if (!(share->decode_trees=(MARIA_DECODE_TREE*) + my_malloc(PSI_INSTRUMENT_ME, (uint) 
(trees*sizeof(MARIA_DECODE_TREE)+ + intervall_length*sizeof(uchar)), + MYF(MY_WME)))) + goto err0; + intervall_buff=(uchar*) (share->decode_trees+trees); + + /* + Memory segment #2: + - Decode tables + - Quick decode tables + - Temporary decode table + - Compressed data file header cache + This segment will be reallocated after construction of the tables. + */ + length=(uint) (elements*2+trees*(1 << maria_quick_table_bits)); + if (!(share->decode_tables=(uint16*) + my_malloc(PSI_INSTRUMENT_ME, (length+OFFSET_TABLE_SIZE)*sizeof(uint16)+ + (uint) (share->pack.header_length - sizeof(header)) + + share->base.extra_rec_buff_size, + MYF(MY_WME | MY_ZEROFILL)))) + goto err1; + tmp_buff=share->decode_tables+length; + disk_cache=(uchar*) (tmp_buff+OFFSET_TABLE_SIZE); + + if (mysql_file_read(file,disk_cache, + (uint) (share->pack.header_length-sizeof(header)), + MYF(MY_NABP))) + goto err2; +#ifdef HAVE_valgrind + /* Zero bytes accessed by fill_buffer */ + bzero(disk_cache + (share->pack.header_length-sizeof(header)), + share->base.extra_rec_buff_size); +#endif + + huff_tree_bits=max_bit(trees ? trees-1 : 0); + init_bit_buffer(&bit_buff, disk_cache, + (uint) (share->pack.header_length-sizeof(header))); + /* Read new info for each field */ + for (i=0 ; i < share->base.fields ; i++) + { + share->columndef[i].base_type=(enum en_fieldtype) get_bits(&bit_buff,5); + share->columndef[i].pack_type=(uint) get_bits(&bit_buff,6); + share->columndef[i].space_length_bits=get_bits(&bit_buff,5); + share->columndef[i].huff_tree=share->decode_trees+(uint) get_bits(&bit_buff, + huff_tree_bits); + share->columndef[i].unpack= get_unpack_function(share->columndef + i); + DBUG_PRINT("info", ("col: %2u type: %2u pack: %u slbits: %2u", + i, share->columndef[i].base_type, + share->columndef[i].pack_type, + share->columndef[i].space_length_bits)); + } + skip_to_next_byte(&bit_buff); + /* + Construct the decoding tables from the file header. Keep track of + the used memory. 
+ */ + decode_table=share->decode_tables; + for (i=0 ; i < trees ; i++) + if (read_huff_table(&bit_buff,share->decode_trees+i,&decode_table, + &intervall_buff,tmp_buff)) + goto err3; + /* Reallocate the decoding tables to the used size. */ + decode_table=(uint16*) + my_realloc(PSI_INSTRUMENT_ME, (uchar*) share->decode_tables, + (uint) ((uchar*) decode_table - (uchar*) share->decode_tables), + MYF(0)); + /* Fix the table addresses in the tree heads. */ + { + my_ptrdiff_t diff= PTR_BYTE_DIFF(decode_table,share->decode_tables); + share->decode_tables=decode_table; + for (i=0 ; i < trees ; i++) + share->decode_trees[i].table=ADD_TO_PTR(share->decode_trees[i].table, + diff, uint16*); + } + + /* Fix record-ref-length for keys */ + if (fix_keys) + { + for (i=0 ; i < share->base.keys ; i++) + { + MARIA_KEYDEF *keyinfo= &share->keyinfo[i]; + keyinfo->keylength+= (uint16) diff_length; + keyinfo->minlength+= (uint16) diff_length; + keyinfo->maxlength+= (uint16) diff_length; + keyinfo->seg[keyinfo->flag & HA_FULLTEXT ? + FT_SEGS : keyinfo->keysegs].length= (uint16) rec_reflength; + } + if (share->ft2_keyinfo.seg) + { + MARIA_KEYDEF *ft2_keyinfo= &share->ft2_keyinfo; + ft2_keyinfo->keylength+= (uint16) diff_length; + ft2_keyinfo->minlength+= (uint16) diff_length; + ft2_keyinfo->maxlength+= (uint16) diff_length; + } + } + + if (bit_buff.error || bit_buff.pos < bit_buff.end) + goto err3; + + DBUG_RETURN(0); + +err3: + _ma_set_fatal_error_with_share(share, HA_ERR_WRONG_IN_RECORD); +err2: + my_free(share->decode_tables); +err1: + my_free(share->decode_trees); +err0: + DBUG_RETURN(1); +} + + +/* + Read a huff-code-table from datafile. + + SYNOPSIS + read_huff_table() + bit_buff Bit buffer pointing at start of the + decoding table in the file header cache. + decode_tree Pointer to the decode tree head. + decode_table IN/OUT Address of a pointer to the next free space. + intervall_buff IN/OUT Address of a pointer to the next unused values. 
+ tmp_buff Buffer for temporary extraction of a full + decoding table as read from bit_buff. + + RETURN + 0 OK. + 1 Error. +*/ +static uint read_huff_table(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree, + uint16 **decode_table, uchar **intervall_buff, + uint16 *tmp_buff) +{ + uint min_chr,elements,char_bits,offset_bits,size,intervall_length,table_bits, + next_free_offset; + uint16 *ptr,*end; + DBUG_ENTER("read_huff_table"); + + if (!get_bits(bit_buff,1)) + { + /* Byte value compression. */ + min_chr=get_bits(bit_buff,8); + elements=get_bits(bit_buff,9); + char_bits=get_bits(bit_buff,5); + offset_bits=get_bits(bit_buff,5); + intervall_length=0; + ptr=tmp_buff; + ptr=tmp_buff; + DBUG_PRINT("info", ("byte value compression")); + DBUG_PRINT("info", ("minimum uchar value: %u", min_chr)); + DBUG_PRINT("info", ("number of tree nodes: %u", elements)); + DBUG_PRINT("info", ("bits for values: %u", char_bits)); + DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits)); + if (elements > 256) + { + DBUG_PRINT("error", ("ERROR: illegal number of tree elements: %u", + elements)); + DBUG_RETURN(1); + } + } + else + { + /* Distinct column value compression. 
*/ + min_chr=0; + elements=get_bits(bit_buff,15); + intervall_length=get_bits(bit_buff,16); + char_bits=get_bits(bit_buff,5); + offset_bits=get_bits(bit_buff,5); + decode_tree->quick_table_bits=0; + ptr= *decode_table; + DBUG_PRINT("info", ("distinct column value compression")); + DBUG_PRINT("info", ("number of tree nodes: %u", elements)); + DBUG_PRINT("info", ("value buffer length: %u", intervall_length)); + DBUG_PRINT("info", ("bits for value index: %u", char_bits)); + DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits)); + } + size=elements*2-2; + DBUG_PRINT("info", ("tree size in uint16: %u", size)); + DBUG_PRINT("info", ("tree size in bytes: %u", + size * (uint) sizeof(uint16))); + + for (end=ptr+size ; ptr < end ; ptr++) + { + if (get_bit(bit_buff)) + { + *ptr= (uint16) get_bits(bit_buff,offset_bits); + if ((ptr + *ptr >= end) || !*ptr) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + DBUG_RETURN(1); + } + } + else + *ptr= (uint16) (IS_CHAR + (get_bits(bit_buff,char_bits) + min_chr)); + } + skip_to_next_byte(bit_buff); + + decode_tree->table= *decode_table; + decode_tree->intervalls= *intervall_buff; + if (! intervall_length) + { + /* Byte value compression. ptr started from tmp_buff. */ + /* Find longest Huffman code from begin to end of tree in bits. */ + table_bits= find_longest_bitstream(tmp_buff, ptr); + if (table_bits >= OFFSET_TABLE_SIZE) + DBUG_RETURN(1); + if (table_bits > maria_quick_table_bits) + table_bits=maria_quick_table_bits; + DBUG_PRINT("info", ("table bits: %u", table_bits)); + + next_free_offset= (1 << table_bits); + make_quick_table(*decode_table,tmp_buff,&next_free_offset,0,table_bits, + table_bits); + (*decode_table)+= next_free_offset; + decode_tree->quick_table_bits=table_bits; + } + else + { + /* Distinct column value compression. ptr started from *decode_table */ + (*decode_table)=end; + /* + get_bits() moves some bytes to a cache buffer in advance. May need + to step back. 
+ */ + bit_buff->pos-= bit_buff->bits/8; + /* Copy the distinct column values from the buffer. */ + memcpy(*intervall_buff,bit_buff->pos,(size_t) intervall_length); + (*intervall_buff)+=intervall_length; + bit_buff->pos+=intervall_length; + bit_buff->bits=0; + } + DBUG_RETURN(0); +} + + +/* + Make a quick_table for faster decoding. + + SYNOPSIS + make_quick_table() + to_table Target quick_table and remaining decode table. + decode_table Source Huffman (sub-)tree within tmp_buff. + next_free_offset IN/OUT Next free offset from to_table. + Starts behind quick_table on the top-level. + value Huffman bits found so far. + bits Remaining bits to be collected. + max_bits Total number of bits to collect (table_bits). + + DESCRIPTION + + The quick table is an array of 16-bit values. There exists one value + for each possible code representable by max_bits (table_bits) bits. + In most cases table_bits is 9. So there are 512 16-bit values. + + If the high-order bit (16) is set (IS_CHAR) then the array slot for + this value is a valid Huffman code for a resulting uchar value. + + The low-order 8 bits (1..8) are the resulting uchar value. + + Bits 9..14 are the length of the Huffman code for this uchar value. + This means so many bits from the input stream were needed to + represent this uchar value. The remaining bits belong to later + Huffman codes. This also means that for every Huffman code shorter + than table_bits there are multiple entires in the array, which + differ just in the unused bits. + + If the high-order bit (16) is clear (0) then the remaining bits are + the position of the remaining Huffman decode tree segment behind the + quick table. + + RETURN + void +*/ + +static void make_quick_table(uint16 *to_table, uint16 *decode_table, + uint *next_free_offset, uint value, uint bits, + uint max_bits) +{ + DBUG_ENTER("make_quick_table"); + + /* + When down the table to the requested maximum, copy the rest of the + Huffman table. 
+ */ + if (!bits--) + { + /* + Remaining left Huffman tree segment starts behind quick table. + Remaining right Huffman tree segment starts behind left segment. + */ + to_table[value]= (uint16) *next_free_offset; + /* + Re-construct the remaining Huffman tree segment at + next_free_offset in to_table. + */ + *next_free_offset=copy_decode_table(to_table, *next_free_offset, + decode_table); + DBUG_VOID_RETURN; + } + + /* Descent on the left side. Left side bits are clear (0). */ + if (!(*decode_table & IS_CHAR)) + { + /* Not a leaf. Follow the pointer. */ + make_quick_table(to_table,decode_table+ *decode_table, + next_free_offset,value,bits,max_bits); + } + else + { + /* + A leaf. A Huffman code is complete. Fill the quick_table + array for all possible bit strings starting with this Huffman + code. + */ + fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table); + } + + /* Descent on the right side. Right side bits are set (1). */ + decode_table++; + value|= (1 << bits); + if (!(*decode_table & IS_CHAR)) + { + /* Not a leaf. Follow the pointer. */ + make_quick_table(to_table,decode_table+ *decode_table, + next_free_offset,value,bits,max_bits); + } + else + { + /* + A leaf. A Huffman code is complete. Fill the quick_table + array for all possible bit strings starting with this Huffman + code. + */ + fill_quick_table(to_table+value,bits,max_bits,(uint) *decode_table); + } + + DBUG_VOID_RETURN; +} + + +/* + Fill quick_table for all possible values starting with this Huffman code. + + SYNOPSIS + fill_quick_table() + table Target quick_table position. + bits Unused bits from max_bits. + max_bits Total number of bits to collect (table_bits). + value The uchar encoded by the found Huffman code. + + DESCRIPTION + + Fill the segment (all slots) of the quick_table array with the + resulting value for the found Huffman code. There are as many slots + as there are combinations representable by the unused bits. + + In most cases we use 9 table bits. 
Assume a 3-bit Huffman code. Then + there are 6 unused bits. Hence we fill 2**6 = 64 slots with the + value. + + RETURN + void +*/ + +static void fill_quick_table(uint16 *table, uint bits, uint max_bits, + uint value) +{ + uint16 *end; + DBUG_ENTER("fill_quick_table"); + + /* + Bits 1..8 of value represent the decoded uchar value. + Bits 9..14 become the length of the Huffman code for this uchar value. + Bit 16 flags a valid code (IS_CHAR). + */ + value|= (max_bits - bits) << 8 | IS_CHAR; + + for (end= table + ((my_ptrdiff_t) 1 << bits); table < end; table++) + { + *table= (uint16) value; + } + DBUG_VOID_RETURN; +} + + +/* + Reconstruct a decode subtree at the target position. + + SYNOPSIS + copy_decode_table() + to_pos Target quick_table and remaining decode table. + offset Next free offset from to_pos. + decode_table Source Huffman subtree within tmp_buff. + + NOTE + Pointers in the decode tree are relative to the pointers position. + + RETURN + next free offset from to_pos. +*/ + +static uint copy_decode_table(uint16 *to_pos, uint offset, + uint16 *decode_table) +{ + uint prev_offset= offset; + DBUG_ENTER("copy_decode_table"); + + /* Descent on the left side. */ + if (!(*decode_table & IS_CHAR)) + { + /* Set a pointer to the next target node. */ + to_pos[offset]=2; + /* Copy the left hand subtree there. */ + offset=copy_decode_table(to_pos,offset+2,decode_table+ *decode_table); + } + else + { + /* Copy the uchar value. */ + to_pos[offset]= *decode_table; + /* Step behind this node. */ + offset+=2; + } + + /* Descent on the right side. */ + decode_table++; + if (!(*decode_table & IS_CHAR)) + { + /* Set a pointer to the next free target node. */ + to_pos[prev_offset+1]=(uint16) (offset-prev_offset-1); + /* Copy the right hand subtree to the entry of that node. */ + offset=copy_decode_table(to_pos,offset,decode_table+ *decode_table); + } + else + { + /* Copy the uchar value. 
*/ + to_pos[prev_offset+1]= *decode_table; + } + DBUG_RETURN(offset); +} + + +/* + Find the length of the longest Huffman code in this table in bits. + + SYNOPSIS + find_longest_bitstream() + table Code (sub-)table start. + end End of code table. + + IMPLEMENTATION + + Recursively follow the branch(es) of the code pair on every level of + the tree until two uchar values (and no branch) are found. Add one to + each level when returning back from each recursion stage. + + 'end' is used for error checking only. A clean tree terminates + before reaching 'end'. Hence the exact value of 'end' is not too + important. However having it higher than necessary could lead to + misbehaviour should 'next' jump into the dirty area. + + RETURN + length Length of longest Huffman code in bits. + >= OFFSET_TABLE_SIZE Error, broken tree. It does not end before 'end'. +*/ + +static uint find_longest_bitstream(uint16 *table, uint16 *end) +{ + uint length=1; + uint length2; + if (!(*table & IS_CHAR)) + { + uint16 *next= table + *table; + if (next > end || next == table) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + return OFFSET_TABLE_SIZE; + } + length=find_longest_bitstream(next, end)+1; + } + table++; + if (!(*table & IS_CHAR)) + { + uint16 *next= table + *table; + if (next > end || next == table) + { + DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree")); + return OFFSET_TABLE_SIZE; + } + length2= find_longest_bitstream(next, end) + 1; + length=MY_MAX(length,length2); + } + return length; +} + + +/* + Read record from datafile. + + SYNOPSIS + _ma_read_pack_record() + info A pointer to MARIA_HA. + filepos File offset of the record. + buf RETURN The buffer to receive the record. 
+ + RETURN + 0 On success + # Error number +*/ + +int _ma_read_pack_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos) +{ + MARIA_BLOCK_INFO block_info; + File file; + DBUG_ENTER("maria_read_pack_record"); + + if (filepos == HA_OFFSET_ERROR) + DBUG_RETURN(my_errno); /* _search() didn't find record */ + + file= info->dfile.file; + if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info, + &info->rec_buff, &info->rec_buff_size, file, + filepos)) + goto err; + if (mysql_file_read(file, info->rec_buff + block_info.offset , + block_info.rec_len - block_info.offset, MYF(MY_NABP))) + goto panic; + info->update|= HA_STATE_AKTIV; + + info->rec_buff[block_info.rec_len]= 0; /* Keep valgrind happy */ + DBUG_RETURN(_ma_pack_rec_unpack(info,&info->bit_buff, buf, + info->rec_buff, block_info.rec_len)); +panic: + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); +err: + DBUG_RETURN(my_errno); +} + + + +int _ma_pack_rec_unpack(register MARIA_HA *info, MARIA_BIT_BUFF *bit_buff, + register uchar *to, uchar *from, ulong reclength) +{ + uchar *end_field; + reg3 MARIA_COLUMNDEF *end; + MARIA_COLUMNDEF *current_field; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_pack_rec_unpack"); + + if (info->s->base.null_bytes) + { + memcpy(to, from, info->s->base.null_bytes); + to+= info->s->base.null_bytes; + from+= info->s->base.null_bytes; + reclength-= info->s->base.null_bytes; + } + init_bit_buffer(bit_buff, from, reclength); + for (current_field=share->columndef, end=current_field+share->base.fields ; + current_field < end ; + current_field++,to=end_field) + { + end_field=to+current_field->length; + (*current_field->unpack)(current_field, bit_buff, to, end_field); + } + if (!bit_buff->error && + bit_buff->pos - bit_buff->bits / 8 == bit_buff->end) + DBUG_RETURN(0); + info->update&= ~HA_STATE_AKTIV; + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + DBUG_RETURN(HA_ERR_WRONG_IN_RECORD); +} /* _ma_pack_rec_unpack */ + + + /* Return function to unpack field */ + +static 
/*
  Return function to unpack field.

  SYNOPSIS
    get_unpack_function()
    rec         Column definition; base_type, pack_type and length are
                used for the selection.

  RETURN
    Pointer to the unpack function for this column, or 0 if the base
    type is not one of the types handled below.
*/

static void (*get_unpack_function(MARIA_COLUMNDEF *rec))
  (MARIA_COLUMNDEF *, MARIA_BIT_BUFF *, uchar *, uchar *)
{
  const int space_fields= (rec->pack_type & PACK_TYPE_SPACE_FIELDS) != 0;
  const int zero_fill=    (rec->pack_type & PACK_TYPE_ZERO_FILL) != 0;
  const int selected=     (rec->pack_type & PACK_TYPE_SELECTED) != 0;

  switch (rec->base_type) {
  case FIELD_SKIP_ZERO:
    return zero_fill ? &uf_zerofill_skip_zero : &uf_skip_zero;
  case FIELD_NORMAL:
    if (space_fields)
      return &uf_space_normal;
    return zero_fill ? &uf_zerofill_normal : &decode_bytes;
  case FIELD_SKIP_ENDSPACE:
    if (space_fields)
      return selected ? &uf_space_endspace_selected : &uf_space_endspace;
    return selected ? &uf_endspace_selected : &uf_endspace;
  case FIELD_SKIP_PRESPACE:
    if (space_fields)
      return selected ? &uf_space_prespace_selected : &uf_space_prespace;
    return selected ? &uf_prespace_selected : &uf_prespace;
  case FIELD_CONSTANT:
    return &uf_constant;
  case FIELD_INTERVALL:
    return &uf_intervall;
  case FIELD_ZERO:
  case FIELD_CHECK:
    return &uf_zero;
  case FIELD_BLOB:
    return &uf_blob;
  case FIELD_VARCHAR:
    /* 255 + 1 uchar length */
    return rec->length <= 256 ? &uf_varchar1 : &uf_varchar2;
  case FIELD_LAST:
  default:
    return 0;                           /* This should never happen */
  }
}
decode_bytes(rec,bit_buff,to,end); +} + +static void uf_space_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + bfill(to, (end-to), ' '); + else + decode_bytes(rec,bit_buff,to,end); +} + +static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill(to, (end-to), ' '); + else + { + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill(end - spaces, spaces, ' '); + } + else + decode_bytes(rec,bit_buff,to,end); + } +} + +static void uf_endspace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill(end - spaces, spaces, ' '); + } + else + decode_bytes(rec,bit_buff,to,end); +} + +static void uf_space_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill(to, (end-to), ' '); + else + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill(end - spaces, spaces, ' '); + } +} + +static void uf_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + if (to+spaces != end) + decode_bytes(rec,bit_buff,to,end-spaces); + bfill(end - spaces, spaces, ' '); +} + +static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, 
uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill(to, (end-to), ' '); + else + { + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill(to, spaces, ' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); + } + else + decode_bytes(rec,bit_buff,to,end); + } +} + + +static void uf_prespace_selected(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill(to, spaces, ' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); + } + else + decode_bytes(rec,bit_buff,to,end); +} + + +static void uf_space_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if (get_bit(bit_buff)) + bfill(to, (end-to), ' '); + else + { + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill(to, spaces, ' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); + } +} + +static void uf_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + uint spaces; + if ((spaces=get_bits(bit_buff,rec->space_length_bits))+to > end) + { + bit_buff->error=1; + return; + } + bfill(to, spaces, ' '); + if (to+spaces != end) + decode_bytes(rec,bit_buff,to+spaces,end); +} + +static void uf_zerofill_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + end-=rec->space_length_bits; + decode_bytes(rec,bit_buff, to, end); + bzero((char*) end,rec->space_length_bits); +} + +static void uf_constant(MARIA_COLUMNDEF *rec, + MARIA_BIT_BUFF *bit_buff __attribute__((unused)), + uchar *to, uchar *end) +{ + memcpy(to,rec->huff_tree->intervalls,(size_t) (end-to)); +} + +static void uf_intervall(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF 
*bit_buff, + uchar *to, + uchar *end) +{ + reg1 uint field_length=(uint) (end-to); + memcpy(to,rec->huff_tree->intervalls+field_length*decode_pos(bit_buff, + rec->huff_tree), + (size_t) field_length); +} + + +/*ARGSUSED*/ +static void uf_zero(MARIA_COLUMNDEF *rec __attribute__((unused)), + MARIA_BIT_BUFF *bit_buff __attribute__((unused)), + uchar *to, uchar *end) +{ + bzero(to, (uint) (end-to)); +} + +static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + if (get_bit(bit_buff)) + bzero(to, (uint) (end-to)); + else + { + ulong length=get_bits(bit_buff,rec->space_length_bits); + uint pack_length=(uint) (end-to)-portable_sizeof_char_ptr; + if (bit_buff->blob_pos+length > bit_buff->blob_end) + { + bit_buff->error=1; + bzero(to, (end-to)); + return; + } + decode_bytes(rec, bit_buff, bit_buff->blob_pos, + bit_buff->blob_pos + length); + _ma_store_blob_length(to, pack_length, length); + memcpy(to+pack_length, &bit_buff->blob_pos, sizeof(uchar*)); + bit_buff->blob_pos+=length; + } +} + + +static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end __attribute__((unused))) +{ + if (get_bit(bit_buff)) + to[0]= 0; /* Zero lengths */ + else + { + ulong length=get_bits(bit_buff,rec->space_length_bits); + *to= (char) length; + decode_bytes(rec,bit_buff,to+1,to+1+length); + } +} + + +static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end __attribute__((unused))) +{ + if (get_bit(bit_buff)) + to[0]=to[1]=0; /* Zero lengths */ + else + { + ulong length=get_bits(bit_buff,rec->space_length_bits); + int2store(to,length); + decode_bytes(rec,bit_buff,to+2,to+2+length); + } +} + + /* Functions to decode of buffer of bits */ + +#if BITS_SAVED == 64 + +static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + reg1 uint bits,low_byte; + reg3 uint16 *pos; + reg4 uint table_bits,table_and; + MARIA_DECODE_TREE *decode_tree; + + 
decode_tree=rec->decode_tree; + bits=bit_buff->bits; /* Save in reg for quicker access */ + table_bits=decode_tree->quick_table_bits; + table_and= (1 << table_bits)-1; + + do + { + if (bits <= 32) + { + if (bit_buff->pos > bit_buff->end+4) + { + bit_buff->error=1; + return; /* Can't be right */ + } + bit_buff->current_byte= (bit_buff->current_byte << 32) | + ((((uint) bit_buff->pos[3])) | + (((uint) bit_buff->pos[2]) << 8) | + (((uint) bit_buff->pos[1]) << 16) | + (((uint) bit_buff->pos[0]) << 24)); + bit_buff->pos+=4; + bits+=32; + } + /* + First use info in quick_table. + + The quick table is an array of 16-bit values. There exists one + value for each possible code representable by table_bits bits. + In most cases table_bits is 9. So there are 512 16-bit values. + + If the high-order bit (16) is set (IS_CHAR) then the array slot + for this value is a valid Huffman code for a resulting uchar value. + + The low-order 8 bits (1..8) are the resulting uchar value. + + Bits 9..14 are the length of the Huffman code for this uchar value. + This means so many bits from the input stream were needed to + represent this uchar value. The remaining bits belong to later + Huffman codes. This also means that for every Huffman code shorter + than table_bits there are multiple entires in the array, which + differ just in the unused bits. + + If the high-order bit (16) is clear (0) then the remaining bits are + the position of the remaining Huffman decode tree segment behind the + quick table. + */ + low_byte=(uint) (bit_buff->current_byte >> (bits - table_bits)) & table_and; + low_byte=decode_tree->table[low_byte]; + if (low_byte & IS_CHAR) + { + /* + All Huffman codes of less or equal table_bits length are in the + quick table. This is one of them. 
+ */ + *to++ = (char) (low_byte & 255); /* Found char in quick table */ + bits-= ((low_byte >> 8) & 31); /* Remove bits used */ + } + else + { /* Map through rest of decode-table */ + /* This means that the Huffman code must be longer than table_bits. */ + pos=decode_tree->table+low_byte; + bits-=table_bits; + /* NOTE: decode_bytes_test_bit() is a macro which contains a break !!! */ + for (;;) + { + low_byte=(uint) (bit_buff->current_byte >> (bits-8)); + decode_bytes_test_bit(0); + decode_bytes_test_bit(1); + decode_bytes_test_bit(2); + decode_bytes_test_bit(3); + decode_bytes_test_bit(4); + decode_bytes_test_bit(5); + decode_bytes_test_bit(6); + decode_bytes_test_bit(7); + bits-=8; + } + *to++ = (char) *pos; + } + } while (to != end); + + bit_buff->bits=bits; + return; +} + +#else + +static void decode_bytes(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *end) +{ + reg1 uint bits,low_byte; + reg3 uint16 *pos; + reg4 uint table_bits,table_and; + MARIA_DECODE_TREE *decode_tree; + + decode_tree=rec->huff_tree; + bits=bit_buff->bits; /* Save in reg for quicker access */ + table_bits=decode_tree->quick_table_bits; + table_and= (1 << table_bits)-1; + + do + { + if (bits < table_bits) + { + if (bit_buff->pos > bit_buff->end+1) + { + bit_buff->error=1; + return; /* Can't be right */ + } +#if BITS_SAVED == 32 + bit_buff->current_byte= (bit_buff->current_byte << 24) | + (((uint) ((uchar) bit_buff->pos[2]))) | + (((uint) ((uchar) bit_buff->pos[1])) << 8) | + (((uint) ((uchar) bit_buff->pos[0])) << 16); + bit_buff->pos+=3; + bits+=24; +#else + if (bits) /* We must have at leasts 9 bits */ + { + bit_buff->current_byte= (bit_buff->current_byte << 8) | + (uint) ((uchar) bit_buff->pos[0]); + bit_buff->pos++; + bits+=8; + } + else + { + bit_buff->current_byte= ((uint) ((uchar) bit_buff->pos[0]) << 8) | + ((uint) ((uchar) bit_buff->pos[1])); + bit_buff->pos+=2; + bits+=16; + } +#endif + } + /* First use info in quick_table */ + low_byte=(bit_buff->current_byte 
>> (bits - table_bits)) & table_and; + low_byte=decode_tree->table[low_byte]; + if (low_byte & IS_CHAR) + { + *to++ = (low_byte & 255); /* Found char in quick table */ + bits-= ((low_byte >> 8) & 31); /* Remove bits used */ + } + else + { /* Map through rest of decode-table */ + pos=decode_tree->table+low_byte; + bits-=table_bits; + for (;;) + { + if (bits < 8) + { /* We don't need to check end */ +#if BITS_SAVED == 32 + bit_buff->current_byte= (bit_buff->current_byte << 24) | + (((uint) ((uchar) bit_buff->pos[2]))) | + (((uint) ((uchar) bit_buff->pos[1])) << 8) | + (((uint) ((uchar) bit_buff->pos[0])) << 16); + bit_buff->pos+=3; + bits+=24; +#else + bit_buff->current_byte= (bit_buff->current_byte << 8) | + (uint) ((uchar) bit_buff->pos[0]); + bit_buff->pos+=1; + bits+=8; +#endif + } + low_byte=(uint) (bit_buff->current_byte >> (bits-8)); + decode_bytes_test_bit(0); + decode_bytes_test_bit(1); + decode_bytes_test_bit(2); + decode_bytes_test_bit(3); + decode_bytes_test_bit(4); + decode_bytes_test_bit(5); + decode_bytes_test_bit(6); + decode_bytes_test_bit(7); + bits-=8; + } + *to++ = (char) *pos; + } + } while (to != end); + + bit_buff->bits=bits; + return; +} +#endif /* BIT_SAVED == 64 */ + + +static uint decode_pos(MARIA_BIT_BUFF *bit_buff, + MARIA_DECODE_TREE *decode_tree) +{ + uint16 *pos=decode_tree->table; + for (;;) + { + if (get_bit(bit_buff)) + pos++; + if (*pos & IS_CHAR) + return (uint) (*pos & ~IS_CHAR); + pos+= *pos; + } +} + + +int _ma_read_rnd_pack_record(MARIA_HA *info, + uchar *buf, + register MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks) +{ + File file; + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_read_rnd_pack_record"); + + if (filepos >= info->state->data_file_length) + { + my_errno= HA_ERR_END_OF_FILE; + goto err; + } + + file= info->dfile.file; + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(info, &info->rec_cache, block_info.header, + filepos, share->pack.ref_length, + 
skip_deleted_blocks ? READING_NEXT : 0)) + goto err; + file= -1; + } + if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info, + &info->rec_buff, &info->rec_buff_size, + file, filepos)) + goto err; /* Error code is already set */ +#ifndef DBUG_OFF + if (block_info.rec_len > share->max_pack_length) + { + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + goto err; + } +#endif + + if (info->opt_flag & READ_CACHE_USED) + { + if (_ma_read_cache(info, &info->rec_cache, info->rec_buff, + block_info.filepos, block_info.rec_len, + skip_deleted_blocks ? READING_NEXT : 0)) + goto err; + } + else + { + if (mysql_file_read(info->dfile.file, info->rec_buff + block_info.offset, + block_info.rec_len-block_info.offset, + MYF(MY_NABP))) + goto err; + } + info->packed_length= block_info.rec_len; + info->cur_row.lastpos= filepos; + info->cur_row.nextpos= block_info.filepos+block_info.rec_len; + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + + info->rec_buff[block_info.rec_len]= 0; /* Keep valgrind happy */ + DBUG_RETURN(_ma_pack_rec_unpack(info, &info->bit_buff, buf, + info->rec_buff, block_info.rec_len)); + err: + DBUG_RETURN(my_errno); +} + + + /* Read and process header from a huff-record-file */ + +uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, + uchar **rec_buff_p, size_t *rec_buff_size_p, + File file, my_off_t filepos) +{ + uchar *header= info->header; + uint head_length,UNINIT_VAR(ref_length); + MARIA_SHARE *share= maria->s; + myf flag= MY_WME | (share->temporary ? 
MY_THREAD_SPECIFIC : 0); + + if (file >= 0) + { + ref_length=share->pack.ref_length; + /* + We can't use my_pread() here because _ma_read_rnd_pack_record assumes + position is ok + */ + mysql_file_seek(file,filepos,MY_SEEK_SET,MYF(0)); + if (mysql_file_read(file, header,ref_length,MYF(MY_NABP))) + return BLOCK_FATAL_ERROR; + DBUG_DUMP("header", header, ref_length); + } + head_length= read_pack_length((uint) share->pack.version, header, + &info->rec_len); + if (share->base.blobs) + { + head_length+= read_pack_length((uint) share->pack.version, + header + head_length, &info->blob_len); + /* + Ensure that the record buffer is big enough for the compressed + record plus all expanded blobs. [We do not have an extra buffer + for the resulting blobs. Sigh.] + */ + if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p, + info->rec_len + info->blob_len + + share->base.extra_rec_buff_size, flag)) + return BLOCK_FATAL_ERROR; /* not enough memory */ + bit_buff->blob_pos= *rec_buff_p + info->rec_len; + bit_buff->blob_end= bit_buff->blob_pos + info->blob_len; + maria->blob_length=info->blob_len; + } + info->filepos=filepos+head_length; + if (file >= 0) + { + info->offset=MY_MIN(info->rec_len, ref_length - head_length); + memcpy(*rec_buff_p, header + head_length, info->offset); + } + return 0; +} + + + /* rutines for bit buffer */ + /* Note buffer must be 6 uchar bigger than longest row */ + +static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff, uchar *buffer, + uint length) +{ + bit_buff->pos=buffer; + bit_buff->end=buffer+length; + bit_buff->bits=bit_buff->error=0; + bit_buff->current_byte=0; /* Avoid purify errors */ +} + +static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff, uint count) +{ + uint tmp; + count-=bit_buff->bits; + tmp=(bit_buff->current_byte & mask[bit_buff->bits]) << count; + fill_buffer(bit_buff); + bit_buff->bits=BITS_SAVED - count; + return tmp+(bit_buff->current_byte >> (BITS_SAVED - count)); +} + + /* Fill in empty bit_buff->current_byte from buffer */ + /* 
Sets bit_buff->error if buffer is exhausted */ + +static void fill_buffer(MARIA_BIT_BUFF *bit_buff) +{ + if (bit_buff->pos >= bit_buff->end) + { + bit_buff->error= 1; + bit_buff->current_byte=0; + return; + } +#if BITS_SAVED == 64 + bit_buff->current_byte= ((((uint) ((uchar) bit_buff->pos[7]))) | + (((uint) ((uchar) bit_buff->pos[6])) << 8) | + (((uint) ((uchar) bit_buff->pos[5])) << 16) | + (((uint) ((uchar) bit_buff->pos[4])) << 24) | + ((ulonglong) + ((((uint) ((uchar) bit_buff->pos[3]))) | + (((uint) ((uchar) bit_buff->pos[2])) << 8) | + (((uint) ((uchar) bit_buff->pos[1])) << 16) | + (((uint) ((uchar) bit_buff->pos[0])) << 24)) << 32)); + bit_buff->pos+=8; +#else +#if BITS_SAVED == 32 + bit_buff->current_byte= (((uint) ((uchar) bit_buff->pos[3])) | + (((uint) ((uchar) bit_buff->pos[2])) << 8) | + (((uint) ((uchar) bit_buff->pos[1])) << 16) | + (((uint) ((uchar) bit_buff->pos[0])) << 24)); + bit_buff->pos+=4; +#else + bit_buff->current_byte= (uint) (((uint) ((uchar) bit_buff->pos[1])) | + (((uint) ((uchar) bit_buff->pos[0])) << 8)); + bit_buff->pos+=2; +#endif +#endif +} + + /* Get number of bits neaded to represent value */ + +static uint max_bit(register uint value) +{ + reg2 uint power=1; + + while ((value>>=1)) + power++; + return (power); +} + + +/***************************************************************************** + Some redefined functions to handle files when we are using memmap +*****************************************************************************/ + +#ifdef HAVE_MMAP + +static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos); +static int _ma_read_rnd_mempack_record(MARIA_HA*, uchar *, MARIA_RECORD_POS, + my_bool); + +my_bool _ma_memmap_file(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_memmap_file"); + + if (!info->s->file_map) + { + if (mysql_file_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) < + share->state.state.data_file_length+MEMMAP_EXTRA_MARGIN) + { + 
DBUG_PRINT("warning",("File isn't extended for memmap")); + DBUG_RETURN(0); + } + if (_ma_dynmap_file(info, share->state.state.data_file_length)) + DBUG_RETURN(0); + } + info->opt_flag|= MEMMAP_USED; + info->read_record= share->read_record= _ma_read_mempack_record; + share->scan= _ma_read_rnd_mempack_record; + DBUG_RETURN(1); +} + + +void _ma_unmap_file(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + my_munmap((char*) share->file_map, + (size_t) share->mmaped_length + MEMMAP_EXTRA_MARGIN); + share->file_map= 0; + share->file_read= _ma_nommap_pread; + share->file_write= _ma_nommap_pwrite; + info->opt_flag&= ~MEMMAP_USED; +} + + +static uchar * +_ma_mempack_get_block_info(MARIA_HA *maria, + MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, + uchar **rec_buff_p, + size_t *rec_buff_size_p, + uchar *header) +{ + MARIA_SHARE *share= maria->s; + myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0); + + header+= read_pack_length((uint) share->pack.version, header, + &info->rec_len); + if (share->base.blobs) + { + header+= read_pack_length((uint) share->pack.version, header, + &info->blob_len); + /* _ma_alloc_rec_buff sets my_errno on error */ + if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p, + info->blob_len + share->base.extra_rec_buff_size, flag)) + return 0; /* not enough memory */ + bit_buff->blob_pos= *rec_buff_p; + bit_buff->blob_end= *rec_buff_p + info->blob_len; + } + return header; +} + + +static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos) +{ + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + uchar *pos; + DBUG_ENTER("maria_read_mempack_record"); + + if (filepos == HA_OFFSET_ERROR) + DBUG_RETURN(my_errno); /* _search() didn't find record */ + + if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff, + &block_info, &info->rec_buff, + &info->rec_buff_size, + (uchar*) share->file_map+ + filepos))) + DBUG_RETURN(my_errno); + DBUG_RETURN(_ma_pack_rec_unpack(info, &info->bit_buff, buf, 
+ pos, block_info.rec_len)); +} + + +/*ARGSUSED*/ +static int _ma_read_rnd_mempack_record(MARIA_HA *info, + uchar *buf, + register MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks + __attribute__((unused))) +{ + MARIA_BLOCK_INFO block_info; + MARIA_SHARE *share= info->s; + uchar *pos,*start; + DBUG_ENTER("_ma_read_rnd_mempack_record"); + + if (filepos >= share->state.state.data_file_length) + { + my_errno=HA_ERR_END_OF_FILE; + goto err; + } + if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff, + &block_info, + &info->rec_buff, + &info->rec_buff_size, + (uchar*) + (start= share->file_map + + filepos)))) + goto err; +#ifndef DBUG_OFF + if (block_info.rec_len > info->s->max_pack_length) + { + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + goto err; + } +#endif + info->packed_length=block_info.rec_len; + info->cur_row.lastpos= filepos; + info->cur_row.nextpos= filepos+(uint) (pos-start)+block_info.rec_len; + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + + DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf, + pos, block_info.rec_len)); + err: + DBUG_RETURN(my_errno); +} + +#endif /* HAVE_MMAP */ + + /* Save length of row */ + +uint _ma_save_pack_length(uint version, uchar *block_buff, ulong length) +{ + if (length < 254) + { + *(uchar*) block_buff= (uchar) length; + return 1; + } + if (length <= 65535) + { + *(uchar*) block_buff=254; + int2store(block_buff+1,(uint) length); + return 3; + } + *(uchar*) block_buff=255; + if (version == 1) /* old format */ + { + DBUG_ASSERT(length <= 0xFFFFFF); + int3store(block_buff + 1, (ulong) length); + return 4; + } + else + { + int4store(block_buff + 1, (ulong) length); + return 5; + } +} + + +static uint read_pack_length(uint version, const uchar *buf, ulong *length) +{ + if (buf[0] < 254) + { + *length= buf[0]; + return 1; + } + else if (buf[0] == 254) + { + *length= uint2korr(buf + 1); + return 3; + } + if (version == 1) /* old format */ + { + *length= uint3korr(buf + 1); + return 
4; + } + else + { + *length= uint4korr(buf + 1); + return 5; + } +} + + +uint _ma_calc_pack_length(uint version, ulong length) +{ + return (length < 254) ? 1 : (length < 65536) ? 3 : (version == 1) ? 4 : 5; +} diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c new file mode 100644 index 00000000..5881456a --- /dev/null +++ b/storage/maria/ma_page.c @@ -0,0 +1,635 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2020, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Read and write key blocks + + The basic structure of a key block is as follows: + + LSN 7 (LSN_STORE_SIZE); Log number for last change; + Only for transactional pages + PACK_TRANSID 6 (TRANSID_SIZE); Relative transid to pack page transid's + Only for transactional pages + KEYNR 1 (KEYPAGE_KEYID_SIZE) Which index this page belongs to + FLAG 1 (KEYPAGE_FLAG_SIZE) Flags for page + PAGE_SIZE 2 (KEYPAGE_USED_SIZE) How much of the page is used. + high-byte-first + + The flag is a combination of the following values: + + KEYPAGE_FLAG_ISNOD Page is a node + KEYPAGE_FLAG_HAS_TRANSID There may be a transid on the page. + + After this we store key data, either packed or not packed, directly + after each other. If the page is a node flag, there is a pointer to + the next key page at page start and after each key. 
+ + At end of page the last KEYPAGE_CHECKSUM_SIZE bytes are reserved for a + page checksum. +*/ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_key_recover.h" + +/** + Fill MARIA_PAGE structure for usage with _ma_write_keypage +*/ + +void _ma_page_setup(MARIA_PAGE *page, MARIA_HA *info, + const MARIA_KEYDEF *keyinfo, my_off_t pos, + uchar *buff) +{ + MARIA_SHARE *share= info->s; + + page->info= info; + page->keyinfo= keyinfo; + page->buff= buff; + page->pos= pos; + page->size= _ma_get_page_used(share, buff); + page->org_size= page->size; + page->flag= _ma_get_keypage_flag(share, buff); + page->node= ((page->flag & KEYPAGE_FLAG_ISNOD) ? + share->base.key_reflength : 0); +} + +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY +void page_cleanup(MARIA_SHARE *share, MARIA_PAGE *page) +{ + uint length= page->size; + DBUG_ASSERT(length <= share->max_index_block_size); + bzero(page->buff + length, share->block_size - length); +} +#endif + + +/** + Fetch a key-page in memory + + @fn _ma_fetch_keypage() + @param page Fill this struct with information about read page + @param info Maria handler + @param keyinfo Key definition for used key + @param pos Position for page (in bytes) + @param lock Lock type for page + @param level Importance of page; Priority for page cache + @param buff Buffer to use for page + @param return_buffer Set to 1 if we want to force useage of buff + + @return + @retval 0 ok + @retval 1 error +*/ + +my_bool _ma_fetch_keypage(MARIA_PAGE *page, MARIA_HA *info, + const MARIA_KEYDEF *keyinfo, + my_off_t pos, enum pagecache_page_lock lock, + int level, uchar *buff, + my_bool return_buffer __attribute__ ((unused))) +{ + uchar *tmp; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; + DBUG_ENTER("_ma_fetch_keypage"); + DBUG_PRINT("enter",("page: %lu", (ulong) (pos / block_size))); + + tmp= pagecache_read(share->pagecache, &share->kfile, + (pgcache_page_no_t) (pos / block_size), level, buff, + 
share->page_type, lock, &page_link.link); + + if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED) + { + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE || lock == PAGECACHE_LOCK_READ); + page_link.unlock= (lock == PAGECACHE_LOCK_WRITE ? + PAGECACHE_LOCK_WRITE_UNLOCK : + PAGECACHE_LOCK_READ_UNLOCK); + page_link.changed= 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + page->link_offset= (uint)info->pinned_pages.elements-1; + } + + if (tmp == info->buff) + info->keyread_buff_used=1; + else if (!tmp) + { + DBUG_PRINT("error",("Got errno: %d from pagecache_read",my_errno)); + info->last_keypage=HA_OFFSET_ERROR; + _ma_set_fatal_error(info, my_errno); + DBUG_RETURN(1); + } + info->last_keypage= pos; + + /* + Setup page structure to make pages easy to use + This is same as page_fill_info, but here inlined as this si used + so often. + */ + page->info= info; + page->keyinfo= keyinfo; + page->buff= tmp; + page->pos= pos; + page->size= _ma_get_page_used(share, tmp); + page->org_size= page->size; /* For debugging */ + page->flag= _ma_get_keypage_flag(share, tmp); + page->node= ((page->flag & KEYPAGE_FLAG_ISNOD) ? 
+ share->base.key_reflength : 0); + +#ifdef EXTRA_DEBUG + { + uint page_size= page->size; + if (page_size < 4 || page_size > share->max_index_block_size || + _ma_get_keynr(share, tmp) != keyinfo->key_nr) + { + DBUG_PRINT("error",("page %lu had wrong page length: %u page_header: %u keynr: %u", + (ulong) (pos / block_size), page_size, + share->keypage_header, + _ma_get_keynr(share, tmp))); + DBUG_DUMP("page", tmp, page_size); + info->last_keypage = HA_OFFSET_ERROR; + _ma_set_fatal_error(info, HA_ERR_CRASHED); + DBUG_RETURN(1); + } + } +#endif + DBUG_RETURN(0); +} /* _ma_fetch_keypage */ + + +/* Write a key-page on disk */ + +my_bool _ma_write_keypage(MARIA_PAGE *page, enum pagecache_page_lock lock, + int level) +{ + MARIA_SHARE *share= page->info->s; + uint block_size= share->block_size; + uchar *buff= page->buff; + my_bool res; + MARIA_PINNED_PAGE page_link; + DBUG_ENTER("_ma_write_keypage"); + + /* + The following ensures that for transactional tables we have logged + all changes that changes the page size (as the logging code sets + page->org_size) + */ + DBUG_ASSERT(!share->now_transactional || page->size == page->org_size); + +#ifdef EXTRA_DEBUG /* Safety check */ + { + uint page_length, nod_flag; + page_length= _ma_get_page_used(share, buff); + nod_flag= _ma_test_if_nod(share, buff); + + DBUG_ASSERT(page->size == page_length); + DBUG_ASSERT(page->size <= share->max_index_block_size); + DBUG_ASSERT(page->flag == _ma_get_keypage_flag(share, buff)); + + if (page->pos < share->base.keystart || + page->pos+block_size > share->state.state.key_file_length || + (page->pos & (maria_block_size-1))) + { + DBUG_PRINT("error",("Trying to write inside key status region: " + "key_start: %lu length: %lu page_pos: %lu", + (long) share->base.keystart, + (long) share->state.state.key_file_length, + (long) page->pos)); + my_errno=EINVAL; + DBUG_ASSERT(0); + DBUG_RETURN(1); + } + DBUG_PRINT("page",("write page at: %lu",(ulong) (page->pos / block_size))); + DBUG_DUMP("buff", buff, 
page_length); + DBUG_ASSERT(page_length >= share->keypage_header + nod_flag + + page->keyinfo->minlength || maria_in_recovery); + } +#endif + + /* Verify that keynr is correct */ + DBUG_ASSERT(_ma_get_keynr(share, buff) == page->keyinfo->key_nr); + +#if defined(EXTRA_DEBUG) && defined(HAVE_valgrind) && defined(WHEN_DEBUGGING) + MEM_CHECK_DEFINED(buff, block_size); +#endif + + page_cleanup(share, page); + { + PAGECACHE_BLOCK_LINK **link; + enum pagecache_page_pin pin; + if (lock == PAGECACHE_LOCK_LEFT_WRITELOCKED) + { + pin= PAGECACHE_PIN_LEFT_PINNED; + link= &page_link.link; + } + else if (lock == PAGECACHE_LOCK_WRITE_UNLOCK) + { + pin= PAGECACHE_UNPIN; + /* + We unlock this page so link should be 0 to prevent it usage + even accidentally + */ + link= NULL; + } + else + { + pin= PAGECACHE_PIN; + link= &page_link.link; + } + res= pagecache_write(share->pagecache, + &share->kfile, + (pgcache_page_no_t) (page->pos / block_size), + level, buff, share->page_type, + lock, pin, PAGECACHE_WRITE_DELAY, link, + LSN_IMPOSSIBLE); + } + + if (lock == PAGECACHE_LOCK_WRITE) + { + /* It was not locked before, we have to unlock it when we unpin pages */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&page->info->pinned_pages, (void*) &page_link); + } + DBUG_RETURN(res); +} + + +/** + @brief Put page in free list + + @fn _ma_dispose() + @param info Maria handle + @param pos Address to page + @param page_not_read 1 if page has not yet been read + + @note + The page at 'pos' must have been read with a write lock. + This function does logging (unlike _ma_new()). 
+ + @return + @retval 0 ok + @retval 1 error + +*/ + +int _ma_dispose(register MARIA_HA *info, my_off_t pos, my_bool page_not_read) +{ + my_off_t old_link; + uchar buff[MAX_KEYPAGE_HEADER_SIZE+ 8 + 2]; + ulonglong page_no; + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE page_link; + uint block_size= share->block_size; + int result= 0; + enum pagecache_page_lock lock_method; + enum pagecache_page_pin pin_method; + DBUG_ENTER("_ma_dispose"); + DBUG_PRINT("enter",("page: %lu", (ulong) (pos / block_size))); + DBUG_ASSERT(pos % block_size == 0); + + (void) _ma_lock_key_del(info, 0); + + old_link= share->key_del_current; + share->key_del_current= pos; + page_no= pos / block_size; + bzero(buff, share->keypage_header); + _ma_store_keynr(share, buff, (uchar) MARIA_DELETE_KEY_NR); + _ma_store_page_used(share, buff, share->keypage_header + 8); + mi_sizestore(buff + share->keypage_header, old_link); + share->state.changed|= STATE_NOT_SORTED_PAGES; + + if (share->now_transactional) + { + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + my_off_t page; + + /* Store address of deleted page */ + page_store(log_data + FILEID_STORE_SIZE, page_no); + + /* Store link to next unused page (the link that is written to page) */ + page= (old_link == HA_OFFSET_ERROR ? 
IMPOSSIBLE_PAGE_NO : + old_link / block_size); + page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX_FREE_PAGE, + info->trn, info, + (translog_size_t) sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL)) + result= 1; + } + + if (page_not_read) + { + lock_method= PAGECACHE_LOCK_WRITE; + pin_method= PAGECACHE_PIN; + } + else + { + lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED; + pin_method= PAGECACHE_PIN_LEFT_PINNED; + } + + if (pagecache_write_part(share->pagecache, + &share->kfile, (pgcache_page_no_t) page_no, + PAGECACHE_PRIORITY_LOW, buff, + share->page_type, + lock_method, pin_method, + PAGECACHE_WRITE_DELAY, &page_link.link, + LSN_IMPOSSIBLE, + 0, share->keypage_header + 8)) + result= 1; + +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY + { + uchar *page_buff= pagecache_block_link_to_buffer(page_link.link); + bzero(page_buff + share->keypage_header + 8, + block_size - share->keypage_header - 8 - KEYPAGE_CHECKSUM_SIZE); + } +#endif + + if (page_not_read) + { + /* It was not locked before, we have to unlock it when we unpin pages */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + } + + DBUG_RETURN(result); +} /* _ma_dispose */ + + +/** + @brief Get address for free page to use + + @fn _ma_new() + @param info Maria handle + @param level Type of key block (caching priority for pagecache) + @param page_link Pointer to page in page cache if read. One can + check if this is used by checking if + page_link->changed != 0 + + @note Logging of this is left to the caller (so that the "new"ing and the + first changes done to this new page can be logged as one single entry - one + single _ma_log_new()) call). 
+ + @return + HA_OFFSET_ERROR File is full or page read error + # Page address to use +*/ + +my_off_t _ma_new(register MARIA_HA *info, int level, + MARIA_PINNED_PAGE **page_link) + +{ + my_off_t pos; + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; + DBUG_ENTER("_ma_new"); + + if (_ma_lock_key_del(info, 1)) + { + mysql_mutex_lock(&share->intern_lock); + pos= share->state.state.key_file_length; + if (pos >= share->base.max_key_file_length - block_size) + { + my_errno=HA_ERR_INDEX_FILE_FULL; + mysql_mutex_unlock(&share->intern_lock); + DBUG_RETURN(HA_OFFSET_ERROR); + } + share->state.state.key_file_length+= block_size; + /* Following is for not transactional tables */ + info->state->key_file_length= share->state.state.key_file_length; + mysql_mutex_unlock(&share->intern_lock); + (*page_link)->changed= 0; + (*page_link)->write_lock= PAGECACHE_LOCK_WRITE; + } + else + { + uchar *buff; + pos= share->key_del_current; /* Protected */ + DBUG_ASSERT(share->pagecache->block_size == block_size); + if (!(buff= pagecache_read(share->pagecache, + &share->kfile, + (pgcache_page_no_t) (pos / block_size), level, + 0, share->page_type, + PAGECACHE_LOCK_WRITE, &(*page_link)->link))) + { + pos= HA_OFFSET_ERROR; + _ma_set_fatal_error(info, my_errno); + } + else + { + /* + Next deleted page's number is in the header of the present page + (single linked list): + */ +#ifdef DBUG_ASSERT_EXISTS + my_off_t key_del_current; +#endif + share->key_del_current= mi_sizekorr(buff+share->keypage_header); +#ifdef DBUG_ASSERT_EXISTS + key_del_current= share->key_del_current; + DBUG_ASSERT((key_del_current != 0) && + ((key_del_current == HA_OFFSET_ERROR) || + (key_del_current <= + (share->state.state.key_file_length - block_size)))); +#endif + } + + (*page_link)->unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + (*page_link)->write_lock= PAGECACHE_LOCK_WRITE; + /* + We have to mark it changed as _ma_flush_pending_blocks() uses + 'changed' to know if we used the page cache or not + */ + 
(*page_link)->changed= 1; + push_dynamic(&info->pinned_pages, (void*) *page_link); + *page_link= dynamic_element(&info->pinned_pages, + info->pinned_pages.elements-1, + MARIA_PINNED_PAGE *); + } + share->state.changed|= STATE_NOT_SORTED_PAGES; + DBUG_PRINT("exit",("Pos: %ld",(long) pos)); + DBUG_RETURN(pos); +} /* _ma_new */ + + +/** + Log compactation of a index page +*/ + +static my_bool _ma_log_compact_keypage(MARIA_PAGE *ma_page, + TrID min_read_from) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 1 + 7 + TRANSID_SIZE]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + MARIA_HA *info= ma_page->info; + MARIA_SHARE *share= info->s; + uint translog_parts, extra_length; + my_off_t page= ma_page->pos; + DBUG_ENTER("_ma_log_compact_keypage"); + DBUG_PRINT("enter", ("page: %lu", (ulong) (page / share->block_size))); + + /* Store address of new root page */ + page/= share->block_size; + page_store(log_data + FILEID_STORE_SIZE, page); + + log_pos= log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE; + + log_pos[0]= KEY_OP_COMPACT_PAGE; + transid_store(log_pos + 1, min_read_from); + log_pos+= 1 + TRANSID_SIZE; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + translog_parts= 1; + extra_length= 0; + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= ma_page->size; + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t)(log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length), + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/** + Remove all transaction id's less than given one from a key page + + @fn _ma_compact_keypage() + @param keyinfo Key handler + @param 
page_pos Page position on disk + @param page Buffer for page + @param min_read_from Remove all trids from page less than this + + @retval 0 Ok + ®retval 1 Error; my_errno contains the error +*/ + +my_bool _ma_compact_keypage(MARIA_PAGE *ma_page, TrID min_read_from) +{ + MARIA_HA *info= ma_page->info; + MARIA_SHARE *share= info->s; + MARIA_KEY key; + uchar *page, *endpos, *start_of_empty_space; + uint page_flag, nod_flag, saved_space; + my_bool page_has_transid; + DBUG_ENTER("_ma_compact_keypage"); + + page_flag= ma_page->flag; + if (!(page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + DBUG_RETURN(0); /* No transaction id on page */ + + nod_flag= ma_page->node; + page= ma_page->buff; + endpos= page + ma_page->size; + key.data= info->lastkey_buff; + key.keyinfo= (MARIA_KEYDEF*) ma_page->keyinfo; + + page_has_transid= 0; + page+= share->keypage_header + nod_flag; + key.data[0]= 0; /* safety */ + start_of_empty_space= 0; + saved_space= 0; + do + { + if (!(page= (*ma_page->keyinfo->skip_key)(&key, 0, 0, page))) + { + DBUG_PRINT("error",("Couldn't find last key: page_pos: %p", + page)); + _ma_set_fatal_error(info, HA_ERR_CRASHED); + DBUG_RETURN(1); + } + if (key_has_transid(page-1)) + { + uint transid_length; + transid_length= transid_packed_length(page); + + if (min_read_from == ~(TrID) 0 || + min_read_from < transid_get_packed(share, page)) + { + page[-1]&= 254; /* Remove transid marker */ + transid_length= transid_packed_length(page); + if (start_of_empty_space) + { + /* Move block before the transid up in page */ + uint copy_length= (uint) (page - start_of_empty_space) - saved_space; + memmove(start_of_empty_space, start_of_empty_space + saved_space, + copy_length); + start_of_empty_space+= copy_length; + } + else + start_of_empty_space= page; + saved_space+= transid_length; + } + else + page_has_transid= 1; /* At least one id left */ + page+= transid_length; + } + page+= nod_flag; + } while (page < endpos); + + DBUG_ASSERT(page == endpos); + + if (start_of_empty_space) + { + 
/* + Move last block down + This is always true if any transid was removed + */ + uint copy_length= (uint) (endpos - start_of_empty_space) - saved_space; + + if (copy_length) + memmove(start_of_empty_space, start_of_empty_space + saved_space, + copy_length); + ma_page->size= (uint) (start_of_empty_space + copy_length - ma_page->buff); + page_store_size(share, ma_page); + } + + if (!page_has_transid) + { + ma_page->flag&= ~KEYPAGE_FLAG_HAS_TRANSID; + _ma_store_keypage_flag(share, ma_page->buff, ma_page->flag); + /* Clear packed transid (in case of zerofill) */ + bzero(ma_page->buff + LSN_STORE_SIZE, TRANSID_SIZE); + } + + if (share->now_transactional) + { + if (_ma_log_compact_keypage(ma_page, min_read_from)) + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c new file mode 100644 index 00000000..144b10a8 --- /dev/null +++ b/storage/maria/ma_pagecache.c @@ -0,0 +1,5719 @@ +/* Copyright (C) 2000-2008 MySQL AB, 2008-2011 Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + These functions handle page caching for Maria tables. + + One cache can handle many files. + It must contain buffers of the same blocksize. + init_pagecache() should be used to init cache handler. + + The free list (free_block_list) is a stack like structure. 
+ When a block is freed by free_block(), it is pushed onto the stack. + When a new block is required it is first tried to pop one from the stack. + If the stack is empty, it is tried to get a never-used block from the pool. + If this is empty too, then a block is taken from the LRU ring, flushing it + to disk, if necessary. This is handled in find_block(). + With the new free list, the blocks can have three temperatures: + hot, warm and cold (which is free). This is remembered in the block header + by the enum PCBLOCK_TEMPERATURE temperature variable. Remembering the + temperature is necessary to correctly count the number of warm blocks, + which is required to decide when blocks are allowed to become hot. Whenever + a block is inserted to another (sub-)chain, we take the old and new + temperature into account to decide if we got one more or less warm block. + blocks_unused is the sum of never used blocks in the pool and of currently + free blocks. blocks_used is the number of blocks fetched from the pool and + as such gives the maximum number of in-use blocks at any time. + + TODO: Write operation locks whole cache till the end of the operation. + Should be fixed. 
+*/ + +#include "maria_def.h" +#include <m_string.h> +#include "ma_pagecache.h" +#include "ma_blockrec.h" +#include <my_bit.h> +#include <errno.h> + +/* + Some compilation flags have been added specifically for this module + to control the following: + - not to let a thread to yield the control when reading directly + from page cache, which might improve performance in many cases; + to enable this add: + #define SERIALIZED_READ_FROM_CACHE + - to set an upper bound for number of threads simultaneously + using the page cache; this setting helps to determine an optimal + size for hash table and improve performance when the number of + blocks in the page cache much less than the number of threads + accessing it; + to set this number equal to <N> add + #define MAX_THREADS <N> + - to substitute calls of mysql_cond_wait for calls of + mysql_cond_timedwait (wait with timeout set up); + this setting should be used only when you want to trap a deadlock + situation, which theoretically should not happen; + to set timeout equal to <T> seconds add + #define PAGECACHE_TIMEOUT <T> + - to enable the module traps and to send debug information from + page cache module to a special debug log add: + #define PAGECACHE_DEBUG + the name of this debug log file <LOG NAME> can be set through: + #define PAGECACHE_DEBUG_LOG <LOG NAME> + if the name is not defined, it's set by default; + if the PAGECACHE_DEBUG flag is not set up and we are in a debug + mode, i.e. when ! defined(DBUG_OFF), the debug information from the + module is sent to the regular debug log. 
+ + Example of the settings: + #define SERIALIZED_READ_FROM_CACHE + #define MAX_THREADS 100 + #define PAGECACHE_TIMEOUT 1 + #define PAGECACHE_DEBUG + #define PAGECACHE_DEBUG_LOG "my_pagecache_debug.log" +*/ +#undef PAGECACHE_DEBUG +#define PAGECACHE_DEBUG_LOG "my_pagecache_debug.log" +#define _VARARGS(X) X + +/* + In key cache we have external raw locking here we use + SERIALIZED_READ_FROM_CACHE to avoid problem of reading + not consistent data from the page. + (keycache functions (key_cache_read(), key_cache_insert() and + key_cache_write()) rely on external MyISAM lock, we don't) +*/ +#define SERIALIZED_READ_FROM_CACHE yes + +#define PCBLOCK_INFO(B) \ + DBUG_PRINT("info", \ + ("block: %p fd: %lu page: %lu status: 0x%x " \ + "hshL: %p requests: %u/%u wrlocks: %u rdlocks: %u " \ + "rdlocks_q: %u pins: %u type: %s", \ + (B), \ + (ulong)((B)->hash_link ? \ + (B)->hash_link->file.file : \ + 0), \ + (ulong)((B)->hash_link ? \ + (B)->hash_link->pageno : \ + 0), \ + (uint) (B)->status, \ + (B)->hash_link, \ + (uint) (B)->requests, \ + (uint)((B)->hash_link ? 
\ + (B)->hash_link->requests : \ + 0), \ + (B)->wlocks, (B)->rlocks, (B)->rlocks_queue, \ + (uint)(B)->pins, \ + page_cache_page_type_str[(B)->type])) + +/* TODO: put it to my_static.c */ +my_bool my_disable_flush_pagecache_blocks= 0; + +#define STRUCT_PTR(TYPE, MEMBER, a) \ + (TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER)) + +/* types of condition variables */ +#define COND_FOR_REQUESTED 0 /* queue of thread waiting for read operation */ +#define COND_FOR_SAVED 1 /* queue of thread waiting for flush */ +#define COND_FOR_WRLOCK 2 /* queue of write lock */ +#define COND_SIZE 3 /* number of COND_* queues */ + +typedef mysql_cond_t KEYCACHE_CONDVAR; + +/* descriptor of the page in the page cache block buffer */ +struct st_pagecache_page +{ + PAGECACHE_FILE file; /* file to which the page belongs to */ + pgcache_page_no_t pageno; /* number of the page in the file */ +}; + +/* element in the chain of a hash table bucket */ +struct st_pagecache_hash_link +{ + struct st_pagecache_hash_link + *next, **prev; /* to connect links in the same bucket */ + struct st_pagecache_block_link + *block; /* reference to the block for the page: */ + PAGECACHE_FILE file; /* from such a file */ + pgcache_page_no_t pageno; /* this page */ + uint requests; /* number of requests for the page */ +}; + +/* simple states of a block */ +#define PCBLOCK_ERROR 1 /* an error occurred when performing disk i/o */ +#define PCBLOCK_READ 2 /* there is an active page in the block buffer */ + +/* + A tread is reading the data to the page. + If the page contained old changed data, it will be written out with + this state set on the block. + The page is not yet ready to be used for reading. +*/ +#define PCBLOCK_IN_SWITCH 4 +/* + Block does not accept new requests for old page that would cause + the page to be pinned or written to. + (Reads that copies the block can still continue). 
+ This state happens when another thread is waiting for readers to finish + to read data to the block (after the block, if it was changed, has been + flushed out to disk). +*/ +#define PCBLOCK_REASSIGNED 8 +#define PCBLOCK_IN_FLUSH 16 /* block is in flush operation */ +#define PCBLOCK_CHANGED 32 /* block buffer contains a dirty page */ +#define PCBLOCK_DIRECT_W 64 /* possible direct write to the block */ +#define PCBLOCK_DEL_WRITE 128 /* should be written on delete */ +#define PCBLOCK_BIG_READ 256 /* the first block of the big read in progress + or not first block which other thread wait + to be read in big read operation */ + +/* page status, returned by find_block */ +#define PAGE_READ 0 +#define PAGE_TO_BE_READ 1 +#define PAGE_WAIT_TO_BE_READ 2 + +/* block temperature determines in which (sub-)chain the block currently is */ +enum PCBLOCK_TEMPERATURE { PCBLOCK_COLD /*free*/ , PCBLOCK_WARM , PCBLOCK_HOT }; + +/* debug info */ +#ifdef DBUG_TRACE +static const char *page_cache_page_type_str[]= +{ + /* used only for control page type changing during debugging */ + "EMPTY", + "PLAIN", + "LSN", + "READ_UNKNOWN" +}; + +static const char *page_cache_page_write_mode_str[]= +{ + "DELAY", + "DONE" +}; + +static const char *page_cache_page_lock_str[]= +{ + "free -> free", + "read -> read", + "write -> write", + "free -> read", + "free -> write", + "read -> free", + "write -> free", + "write -> read" +}; + +static const char *page_cache_page_pin_str[]= +{ + "pinned -> pinned", + "unpinned -> unpinned", + "unpinned -> pinned", + "pinned -> unpinned" +}; +#endif /* DBUG_TRACE */ + +#ifndef DBUG_OFF +typedef struct st_pagecache_pin_info +{ + struct st_pagecache_pin_info *next, **prev; + struct st_my_thread_var *thread; +} PAGECACHE_PIN_INFO; + +/* + st_pagecache_lock_info structure should be kept in next, prev, thread part + compatible with st_pagecache_pin_info to be compatible in functions. 
+*/ + +typedef struct st_pagecache_lock_info +{ + struct st_pagecache_lock_info *next, **prev; + struct st_my_thread_var *thread; + my_bool write_lock; +} PAGECACHE_LOCK_INFO; + + +/* service functions maintain debugging info about pin & lock */ + + +/* + Links information about thread pinned/locked the block to the list + + SYNOPSIS + info_link() + list the list to link in + node the node which should be linked +*/ + +static void info_link(PAGECACHE_PIN_INFO **list, PAGECACHE_PIN_INFO *node) +{ + if ((node->next= *list)) + node->next->prev= &(node->next); + *list= node; + node->prev= list; +} + + +/* + Unlinks information about thread pinned/locked the block from the list + + SYNOPSIS + info_unlink() + node the node which should be unlinked +*/ + +static void info_unlink(PAGECACHE_PIN_INFO *node) +{ + if ((*node->prev= node->next)) + node->next->prev= node->prev; +} + + +/* + Finds information about given thread in the list of threads which + pinned/locked this block. + + SYNOPSIS + info_find() + list the list where to find the thread + thread thread ID (reference to the st_my_thread_var + of the thread) + any return any thread of the list + + RETURN + 0 - the thread was not found + pointer to the information node of the thread in the list, or, if 'any', + to any thread of the list. 
+*/ + +static PAGECACHE_PIN_INFO *info_find(PAGECACHE_PIN_INFO *list, + struct st_my_thread_var *thread, + my_bool any) +{ + register PAGECACHE_PIN_INFO *i= list; + if (any) + return i; + for(; i != 0; i= i->next) + if (i->thread == thread) + return i; + return 0; +} + +#endif /* !DBUG_OFF */ + +/* page cache block */ +struct st_pagecache_block_link +{ + struct st_pagecache_block_link + *next_used, **prev_used; /* to connect links in the LRU chain (ring) */ + struct st_pagecache_block_link + *next_changed, **prev_changed; /* for lists of file dirty/clean blocks */ + struct st_pagecache_hash_link + *hash_link; /* backward ptr to referring hash_link */ +#ifndef DBUG_OFF + PAGECACHE_PIN_INFO *pin_list; + PAGECACHE_LOCK_INFO *lock_list; +#endif + KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */ + uchar *buffer; /* buffer for the block page */ + pthread_t write_locker; + + ulonglong last_hit_time; /* timestamp of the last hit */ + WQUEUE + wqueue[COND_SIZE]; /* queues on waiting requests for new/old pages */ + uint32 requests; /* number of requests for the block */ + uint32 pins; /* pin counter */ + uint32 wlocks; /* write locks counter */ + uint32 rlocks; /* read locks counter */ + uint32 rlocks_queue; /* rd. locks waiting wr. 
lock of this thread */ + uint16 status; /* state of the block */ + int16 error; /* error code for block in case of error */ + enum PCBLOCK_TEMPERATURE temperature; /* block temperature: cold, warm, hot*/ + enum pagecache_page_type type; /* type of the block */ + uint hits_left; /* number of hits left until promotion */ + /** @brief LSN when first became dirty; LSN_MAX means "not yet set" */ + LSN rec_lsn; +}; + +/** @brief information describing a run of flush_pagecache_blocks_int() */ +struct st_file_in_flush +{ + File file; + /** + @brief threads waiting for the thread currently flushing this file to be + done + */ + WQUEUE flush_queue; + /** + @brief if the thread currently flushing the file has a non-empty + first_in_switch list. + */ + my_bool first_in_switch; +}; + +#ifndef DBUG_OFF +/* debug checks */ + +#ifdef NOT_USED +static my_bool info_check_pin(PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_pin mode + __attribute__((unused))) +{ + struct st_my_thread_var *thread= my_thread_var; + PAGECACHE_PIN_INFO *info= info_find(block->pin_list, thread); + DBUG_ENTER("info_check_pin"); + DBUG_PRINT("enter", ("thread: 0x%lx pin: %s", + (ulong) thread, page_cache_page_pin_str[mode])); + if (info) + { + if (mode == PAGECACHE_PIN_LEFT_UNPINNED) + { + DBUG_PRINT("info", + ("info_check_pin: thread: 0x%lx block: 0x%lx ; LEFT_UNPINNED!!!", + (ulong)thread, (ulong)block)); + DBUG_RETURN(1); + } + else if (mode == PAGECACHE_PIN) + { + DBUG_PRINT("info", + ("info_check_pin: thread: 0x%lx block: 0x%lx ; PIN!!!", + (ulong)thread, (ulong)block)); + DBUG_RETURN(1); + } + } + else + { + if (mode == PAGECACHE_PIN_LEFT_PINNED) + { + DBUG_PRINT("info", + ("info_check_pin: thread: 0x%lx block: 0x%lx ; LEFT_PINNED!!!", + (ulong)thread, (ulong)block)); + DBUG_RETURN(1); + } + else if (mode == PAGECACHE_UNPIN) + { + DBUG_PRINT("info", + ("info_check_pin: thread: 0x%lx block: 0x%lx ; UNPIN!!!", + (ulong)thread, (ulong)block)); + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +/* + 
Debug function which checks current lock/pin state and requested changes + + SYNOPSIS + info_check_lock() + lock requested lock changes + pin requested pin changes + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool info_check_lock(PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin) +{ + struct st_my_thread_var *thread= my_thread_var; + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *) info_find((PAGECACHE_PIN_INFO *) block->lock_list, + thread); + DBUG_ENTER("info_check_lock"); + switch(lock) { + case PAGECACHE_LOCK_LEFT_UNLOCKED: + if (pin != PAGECACHE_PIN_LEFT_UNPINNED || + info) + goto error; + break; + case PAGECACHE_LOCK_LEFT_READLOCKED: + if ((pin != PAGECACHE_PIN_LEFT_UNPINNED && + pin != PAGECACHE_PIN_LEFT_PINNED) || + info == 0 || info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_LEFT_WRITELOCKED: + if (pin != PAGECACHE_PIN_LEFT_PINNED || + info == 0 || !info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_READ: + if ((pin != PAGECACHE_PIN_LEFT_UNPINNED && + pin != PAGECACHE_PIN) || + info != 0) + goto error; + break; + case PAGECACHE_LOCK_WRITE: + if (pin != PAGECACHE_PIN || + info != 0) + goto error; + break; + case PAGECACHE_LOCK_READ_UNLOCK: + if ((pin != PAGECACHE_PIN_LEFT_UNPINNED && + pin != PAGECACHE_UNPIN) || + info == 0 || info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_WRITE_UNLOCK: + if (pin != PAGECACHE_UNPIN || + info == 0 || !info->write_lock) + goto error; + break; + case PAGECACHE_LOCK_WRITE_TO_READ: + if ((pin != PAGECACHE_PIN_LEFT_PINNED && + pin != PAGECACHE_UNPIN) || + info == 0 || !info->write_lock) + goto error; + break; + } + DBUG_RETURN(0); +error: + DBUG_PRINT("info", + ("info_check_lock: thread: 0x%lx block 0x%lx: info: %d wrt: %d," + "to lock: %s, to pin: %s", + (ulong) thread, (ulong) block, MY_TEST(info), + (info ? 
info->write_lock : 0), + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + DBUG_RETURN(1); +} +#endif /* NOT_USED */ +#endif /* !DBUG_OFF */ + +#define FLUSH_CACHE 2000 /* sort this many blocks at once */ + +static my_bool free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, + my_bool abort_if_pinned); +static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link); +#ifndef DBUG_OFF +static void test_key_cache(PAGECACHE *pagecache, + const char *where, my_bool lock); +#endif + +#define PAGECACHE_HASH(p, f, pos) (((size_t) (pos) + \ + (size_t) (f).file) & (p->hash_entries-1)) +#define FILE_HASH(f,cache) ((uint) (f).file & (cache->changed_blocks_hash_size-1)) + +#define DEFAULT_PAGECACHE_DEBUG_LOG "pagecache_debug.log" + +#if defined(PAGECACHE_DEBUG) +static FILE *pagecache_debug_log= NULL; +static void pagecache_debug_print _VARARGS((const char *fmt, ...)); +#define PAGECACHE_DEBUG_OPEN \ + if (!pagecache_debug_log) \ + { \ + if ((pagecache_debug_log= fopen(PAGECACHE_DEBUG_LOG, "w"))) \ + (void) setvbuf(pagecache_debug_log, NULL, _IOLBF, BUFSIZ); \ + } + +#define PAGECACHE_DEBUG_CLOSE \ + if (pagecache_debug_log) \ + { \ + fclose(pagecache_debug_log); \ + pagecache_debug_log= 0; \ + } +#else +#define PAGECACHE_DEBUG_OPEN +#define PAGECACHE_DEBUG_CLOSE +#endif /* defined(PAGECACHE_DEBUG_LOG) */ + +#if defined(PAGECACHE_DEBUG) +#define KEYCACHE_PRINT(l, m) KEYCACHE_DBUG_PRINT(l,m) + +#ifdef PAGECACHE_DEBUG_DLOG +#define KEYCACHE_DBUG_PRINT(l, m) \ + { if (pagecache_debug_log) \ + { \ + fprintf(pagecache_debug_log, "%s: ", l); \ + DBUG_PRINT("PCDEBUG", ("%s: ", l)); \ + } \ + pagecache_debug_print m; } +#else +#define KEYCACHE_DBUG_PRINT(l, m) \ + { if (pagecache_debug_log) \ + fprintf(pagecache_debug_log, "%s: ", l); \ + pagecache_debug_print m; } +#endif + + +#define KEYCACHE_DBUG_ASSERT(a) \ + { if (! 
(a) && pagecache_debug_log) \ + fclose(pagecache_debug_log); \ + DBUG_ASSERT(a); } +#else +#define KEYCACHE_PRINT(l, m) +#define KEYCACHE_DBUG_PRINT(l, m) DBUG_PRINT(l, m) +#define KEYCACHE_DBUG_ASSERT(a) DBUG_ASSERT(a) +#endif /* defined(PAGECACHE_DEBUG) */ + +#if defined(PAGECACHE_DEBUG) || defined(DBUG_TRACE) +static my_thread_id pagecache_thread_id; +#define KEYCACHE_THREAD_TRACE(l) \ + KEYCACHE_DBUG_PRINT(l,("|thread %lld",pagecache_thread_id)) + +#define KEYCACHE_THREAD_TRACE_BEGIN(l) \ + { struct st_my_thread_var *thread_var= my_thread_var; \ + pagecache_thread_id= thread_var->id; \ + KEYCACHE_DBUG_PRINT(l,("[thread %lld",pagecache_thread_id)); \ + } + +#define KEYCACHE_THREAD_TRACE_END(l) \ + KEYCACHE_DBUG_PRINT(l,("]thread %lld",pagecache_thread_id)) +#else +#define KEYCACHE_PRINT(l,m) +#define KEYCACHE_THREAD_TRACE_BEGIN(l) +#define KEYCACHE_THREAD_TRACE_END(l) +#define KEYCACHE_THREAD_TRACE(l) +#endif /* defined(PAGECACHE_DEBUG) || defined(DBUG_TRACE) */ + +#define PCBLOCK_NUMBER(p, b) \ + ((uint) (((char*)(b)-(char *) p->block_root)/sizeof(PAGECACHE_BLOCK_LINK))) +#define PAGECACHE_HASH_LINK_NUMBER(p, h) \ + ((uint) (((char*)(h)-(char *) p->hash_link_root)/ \ + sizeof(PAGECACHE_HASH_LINK))) + +#if (defined(PAGECACHE_TIMEOUT) && !defined(_WIN32)) || defined(PAGECACHE_DEBUG) +static int pagecache_pthread_cond_wait(mysql_cond_t *cond, + mysql_mutex_t *mutex); +#else +#define pagecache_pthread_cond_wait mysql_cond_wait +#endif + +#if defined(PAGECACHE_DEBUG) +static int ___pagecache_pthread_mutex_lock(mysql_mutex_t *mutex); +static void ___pagecache_pthread_mutex_unlock(mysql_mutex_t *mutex); +static int ___pagecache_pthread_cond_signal(mysql_cond_t *cond); +#define pagecache_pthread_mutex_lock(M) \ +{ DBUG_PRINT("lock", ("mutex lock %p %u", (M), __LINE__)); \ + ___pagecache_pthread_mutex_lock(M);} +#define pagecache_pthread_mutex_unlock(M) \ +{ DBUG_PRINT("lock", ("mutex unlock %p %u", (M), __LINE__)); \ + ___pagecache_pthread_mutex_unlock(M);} +#define 
pagecache_pthread_cond_signal(M) \ +{ DBUG_PRINT("lock", ("signal %p %u", (M), __LINE__)); \ + ___pagecache_pthread_cond_signal(M);} +#else +#define pagecache_pthread_mutex_lock mysql_mutex_lock +#define pagecache_pthread_mutex_unlock mysql_mutex_unlock +#define pagecache_pthread_cond_signal mysql_cond_signal +#endif /* defined(PAGECACHE_DEBUG) */ + +extern my_bool translog_flush(TRANSLOG_ADDRESS lsn); + +/* + Write page to the disk + + SYNOPSIS + pagecache_fwrite() + pagecache - page cache pointer + filedesc - pagecache file descriptor structure + buffer - buffer which we will write + type - page type (plain or with LSN) + flags - MYF() flags + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool pagecache_fwrite(PAGECACHE *pagecache, + PAGECACHE_FILE *filedesc, + uchar *buffer, + pgcache_page_no_t pageno, + enum pagecache_page_type type + __attribute__((unused)), + myf flags) +{ + int res; + PAGECACHE_IO_HOOK_ARGS args; + DBUG_ENTER("pagecache_fwrite"); + DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE); + +#ifdef EXTRA_DEBUG_BITMAP + /* + This code is very good when debugging changes in bitmaps or dirty lists + The above define should be defined for all Aria files if you want to + debug either of the above issues. 
+ */ + + if (pagecache->extra_debug) + { + char buff[80]; + uint len= my_sprintf(buff, + (buff, "fwrite: fd: %d id: %u page: %llu", + filedesc->file, + _ma_file_callback_to_id(filedesc->callback_data), + pageno)); + (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY, + (uchar*) buff, len); + } +#endif + + /* initialize hooks args */ + args.page= buffer; + args.pageno= pageno; + args.data= filedesc->callback_data; + + /* Todo: Integrate this with write_callback so we have only one callback */ + if ((*filedesc->flush_log_callback)(&args)) + DBUG_RETURN(1); + DBUG_PRINT("info", ("pre_write_hook:%p data: %p", + filedesc->pre_write_hook, + filedesc->callback_data)); + if ((*filedesc->pre_write_hook)(&args)) + { + DBUG_PRINT("error", ("write callback problem")); + DBUG_RETURN(1); + } +#if __has_feature(memory_sanitizer) /* FIXME: encryption.aria_tiny etc. fail */ + /* FIXME: ENGINE=Aria occasionally writes uninitialized data */ + __msan_unpoison(args.page, pagecache->block_size); +#endif + res= (int)my_pwrite(filedesc->file, args.page, pagecache->block_size, + ((my_off_t) pageno << pagecache->shift), flags); + (*filedesc->post_write_hook)(res, &args); + DBUG_RETURN(res); +} + + +/* + Read page from the disk + + SYNOPSIS + pagecache_fread() + pagecache - page cache pointer + filedesc - pagecache file descriptor structure + buffer - buffer in which we will read + pageno - page number + flags - MYF() flags +*/ +#define pagecache_fread(pagecache, filedesc, buffer, pageno, flags) \ + mysql_file_pread((filedesc)->file, buffer, pagecache->block_size, \ + ((my_off_t) pageno << pagecache->shift), flags) + + +/** + @brief set rec_lsn of pagecache block (if it is needed) + + @param block block where to set rec_lsn + @param first_REDO_LSN_for_page the LSN to set +*/ + +static inline void pagecache_set_block_rec_lsn(PAGECACHE_BLOCK_LINK *block, + LSN first_REDO_LSN_for_page) +{ + if (block->rec_lsn == LSN_MAX) + block->rec_lsn= first_REDO_LSN_for_page; + else + 
DBUG_ASSERT(cmp_translog_addr(block->rec_lsn, + first_REDO_LSN_for_page) <= 0); +} + + +/* + next_power(value) is 2 at the power of (1+floor(log2(value))); + e.g. next_power(2)=4, next_power(3)=4. +*/ +static inline uint next_power(uint value) +{ + return (uint) my_round_up_to_next_power((uint32) value) << 1; +} + + +/* + Initialize a page cache + + SYNOPSIS + init_pagecache() + pagecache pointer to a page cache data structure + key_cache_block_size size of blocks to keep cached data + use_mem total memory to use for the key cache + division_limit division limit (may be zero) + age_threshold age threshold (may be zero) + block_size size of block (should be power of 2) + my_read_flags Flags used for all pread/pwrite calls + Usually MY_WME in case of recovery + + RETURN VALUE + number of blocks in the key cache, if successful, + 0 - otherwise. + + NOTES. + if pagecache->inited != 0 we assume that the key cache + is already initialized. This is for now used by myisamchk, but shouldn't + be something that a program should rely on! + + It's assumed that no two threads call this function simultaneously + referring to the same key cache handle. + +*/ + +size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem, + uint division_limit, uint age_threshold, + uint block_size, + uint changed_blocks_hash_size, + myf my_readwrite_flags) +{ + size_t blocks, hash_links, length; + int error; + DBUG_ENTER("init_pagecache"); + DBUG_ASSERT(block_size >= 512); + + // By default we init usual cache (variables will be assigned to switch to s3) + pagecache->big_block_read= NULL; + pagecache->big_block_free= NULL; + + PAGECACHE_DEBUG_OPEN; + if (pagecache->inited && pagecache->disk_blocks > 0) + { + DBUG_PRINT("warning",("key cache already in use")); + DBUG_RETURN(0); + } + + pagecache->global_cache_w_requests= pagecache->global_cache_r_requests= 0; + pagecache->global_cache_read= pagecache->global_cache_write= 0; + pagecache->disk_blocks= -1; + if (! 
pagecache->inited) + { + if (mysql_mutex_init(key_PAGECACHE_cache_lock, + &pagecache->cache_lock, MY_MUTEX_INIT_FAST) || + my_hash_init(PSI_INSTRUMENT_ME, &pagecache->files_in_flush, + &my_charset_bin, 32, offsetof(struct st_file_in_flush, file), + sizeof(((struct st_file_in_flush *)NULL)->file), + NULL, NULL, 0)) + goto err; + pagecache->inited= 1; + pagecache->in_init= 0; + pagecache->resize_queue.last_thread= NULL; + } + + pagecache->mem_size= use_mem; + pagecache->block_size= block_size; + pagecache->shift= my_bit_log2_uint64(block_size); + pagecache->readwrite_flags= my_readwrite_flags | MY_NABP | MY_WAIT_IF_FULL; + pagecache->org_readwrite_flags= pagecache->readwrite_flags; + DBUG_PRINT("info", ("block_size: %u", block_size)); + DBUG_ASSERT(((uint)(1 << pagecache->shift)) == block_size); + + blocks= use_mem / (sizeof(PAGECACHE_BLOCK_LINK) + + 2 * sizeof(PAGECACHE_HASH_LINK) + + sizeof(PAGECACHE_HASH_LINK*) * + 5/4 + block_size); + /* Changed blocks hash needs to be a power of 2 */ + changed_blocks_hash_size= my_round_up_to_next_power(MY_MAX(changed_blocks_hash_size, + MIN_PAGECACHE_CHANGED_BLOCKS_HASH_SIZE)); + + /* + We need to support page cache with just one block to be able to do + scanning of rows-in-block files + */ + for ( ; ; ) + { + if (blocks < 8) + { + my_message(ENOMEM, "Not enough memory to allocate 8 pagecache pages", + MYF(0)); + my_errno= ENOMEM; + goto err; + } + /* Set my_hash_entries to the next bigger 2 power */ + if ((pagecache->hash_entries= next_power((uint)blocks)) < + (blocks) * 5/4) + pagecache->hash_entries<<= 1; + hash_links= 2 * blocks; +#if defined(MAX_THREADS) + if (hash_links < MAX_THREADS + blocks - 1) + hash_links= MAX_THREADS + blocks - 1; +#endif + while ((length= (ALIGN_SIZE(blocks * sizeof(PAGECACHE_BLOCK_LINK)) + + ALIGN_SIZE(sizeof(PAGECACHE_HASH_LINK*) * + pagecache->hash_entries) + + ALIGN_SIZE(hash_links * sizeof(PAGECACHE_HASH_LINK)) + + sizeof(PAGECACHE_BLOCK_LINK*)* (changed_blocks_hash_size*2))) + + (blocks << 
pagecache->shift) > use_mem && blocks > 8) + blocks--; + /* Allocate memory for cache page buffers */ + pagecache->mem_size= blocks * pagecache->block_size; + if ((pagecache->block_mem= + my_large_malloc(&pagecache->mem_size, MYF(MY_WME)))) + { + /* + Allocate memory for blocks, hash_links and hash entries; + For each block 2 hash links are allocated + */ + if (my_multi_malloc_large(PSI_INSTRUMENT_ME, MYF(MY_ZEROFILL), + &pagecache->block_root, + (ulonglong) (blocks * + sizeof(PAGECACHE_BLOCK_LINK)), + &pagecache->hash_root, + (ulonglong) (sizeof(PAGECACHE_HASH_LINK*) * + pagecache->hash_entries), + &pagecache->hash_link_root, + (ulonglong) (hash_links * + sizeof(PAGECACHE_HASH_LINK)), + &pagecache->changed_blocks, + (ulonglong) (sizeof(PAGECACHE_BLOCK_LINK*) * + changed_blocks_hash_size), + &pagecache->file_blocks, + (ulonglong) (sizeof(PAGECACHE_BLOCK_LINK*) * + changed_blocks_hash_size), + NullS)) + break; + my_large_free(pagecache->block_mem, pagecache->mem_size); + pagecache->block_mem= 0; + } + blocks= blocks / 4*3; + } + pagecache->blocks_unused= blocks; + pagecache->disk_blocks= blocks; + pagecache->hash_links= hash_links; + pagecache->hash_links_used= 0; + pagecache->free_hash_list= NULL; + pagecache->blocks_used= pagecache->blocks_changed= 0; + + pagecache->global_blocks_changed= 0; + pagecache->blocks_available=0; /* For debugging */ + + /* The LRU chain is empty after initialization */ + pagecache->used_last= NULL; + pagecache->used_ins= NULL; + pagecache->free_block_list= NULL; + pagecache->time= 0; + pagecache->warm_blocks= 0; + pagecache->min_warm_blocks= (division_limit ? + blocks * division_limit / 100 + 1 : + blocks); + pagecache->age_threshold= (age_threshold ? 
+ blocks * age_threshold / 100 : + blocks); + pagecache->changed_blocks_hash_size= changed_blocks_hash_size; + + pagecache->cnt_for_resize_op= 0; + pagecache->resize_in_flush= 0; + pagecache->can_be_used= 1; + + pagecache->waiting_for_hash_link.last_thread= NULL; + pagecache->waiting_for_block.last_thread= NULL; + DBUG_PRINT("exit", + ("disk_blocks: %zu block_root: %p hash_entries: %zu\ + hash_root: %p hash_links: %zu hash_link_root: %p", + (size_t)pagecache->disk_blocks, pagecache->block_root, + pagecache->hash_entries, pagecache->hash_root, + (size_t)pagecache->hash_links, pagecache->hash_link_root)); + + pagecache->blocks= pagecache->disk_blocks > 0 ? pagecache->disk_blocks : 0; + DBUG_RETURN((size_t)pagecache->disk_blocks); + +err: + error= my_errno; + pagecache->disk_blocks= 0; + pagecache->blocks= 0; + if (pagecache->block_mem) + { + my_large_free(pagecache->block_mem, pagecache->mem_size); + pagecache->block_mem= NULL; + } + if (pagecache->block_root) + { + my_free(pagecache->block_root); + pagecache->block_root= NULL; + } + my_errno= error; + pagecache->can_be_used= 0; + DBUG_RETURN(0); +} + + +/* + Flush all blocks in the key cache to disk +*/ + +#ifdef NOT_USED +static int flush_all_key_blocks(PAGECACHE *pagecache) +{ +#if defined(PAGECACHE_DEBUG) + uint cnt=0; +#endif + while (pagecache->blocks_changed > 0) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->used_last->next_used ; ; block=block->next_used) + { + if (block->hash_link) + { +#if defined(PAGECACHE_DEBUG) + cnt++; + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used); +#endif + if (flush_pagecache_blocks_int(pagecache, &block->hash_link->file, + FLUSH_RELEASE, NULL, NULL)) + return 1; + break; + } + if (block == pagecache->used_last) + break; + } + } + return 0; +} +#endif /* NOT_USED */ + +/* + Resize a key cache + + SYNOPSIS + resize_pagecache() + pagecache pointer to a page cache data structure + use_mem total memory to use for the new key cache + division_limit new division limit 
(if not zero) + age_threshold new age threshold (if not zero) + + RETURN VALUE + number of blocks in the key cache, if successful, + 0 - otherwise. + + NOTES. + The function first compares the memory size parameter + with the key cache value. + + If they differ the function free the the memory allocated for the + old key cache blocks by calling the end_pagecache function and + then rebuilds the key cache with new blocks by calling + init_key_cache. + + The function starts the operation only when all other threads + performing operations with the key cache let her to proceed + (when cnt_for_resize=0). + + Before being usable, this function needs: + - to receive fixes for BUG#17332 "changing key_buffer_size on a running + server can crash under load" similar to those done to the key cache + - to have us (Sanja) look at the additional constraints placed on + resizing, due to the page locking specific to this page cache. + So we disable it for now. +*/ +#ifdef NOT_USED /* keep disabled until code is fixed see above !! */ +size_t resize_pagecache(PAGECACHE *pagecache, + size_t use_mem, uint division_limit, + uint age_threshold, uint changed_blocks_hash_size) +{ + size_t blocks; + struct st_my_thread_var *thread; + WQUEUE *wqueue; + DBUG_ENTER("resize_pagecache"); + + if (!pagecache->inited) + DBUG_RETURN(pagecache->disk_blocks); + + if(use_mem == pagecache->mem_size) + { + change_pagecache_param(pagecache, division_limit, age_threshold); + DBUG_RETURN(pagecache->disk_blocks); + } + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + + wqueue= &pagecache->resize_queue; + thread= my_thread_var; + wqueue_link_into_queue(wqueue, thread); + + while (wqueue->last_thread->next != thread) + { + pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock); + } + + pagecache->resize_in_flush= 1; + if (flush_all_key_blocks(pagecache)) + { + /* TODO: if this happens, we should write a warning in the log file ! 
*/ + pagecache->resize_in_flush= 0; + blocks= 0; + pagecache->can_be_used= 0; + goto finish; + } + pagecache->resize_in_flush= 0; + pagecache->can_be_used= 0; + while (pagecache->cnt_for_resize_op) + { + DBUG_PRINT("wait", ("suspend thread %s %ld", thread->name, thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock); + } + + end_pagecache(pagecache, 0); /* Don't free mutex */ + /* The following will work even if use_mem is 0 */ + blocks= init_pagecache(pagecache, pagecache->block_size, use_mem, + division_limit, age_threshold, changed_blocks_hash_size, + pagecache->readwrite_flags); + +finish: + wqueue_unlink_from_queue(wqueue, thread); + /* Signal for the next resize request to proceeed if any */ + if (wqueue->last_thread) + { + DBUG_PRINT("signal", + ("thread %s %ld", wqueue->last_thread->next->name, + wqueue->last_thread->next->id)); + pagecache_pthread_cond_signal(&wqueue->last_thread->next->suspend); + } + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(blocks); +} +#endif /* 0 */ + + +/* + Increment counter blocking resize key cache operation +*/ +static inline void inc_counter_for_resize_op(PAGECACHE *pagecache) +{ + mysql_mutex_assert_owner(&pagecache->cache_lock); + pagecache->cnt_for_resize_op++; +} + + +/* + Decrement counter blocking resize key cache operation; + Signal the operation to proceed when counter becomes equal zero +*/ + +static inline void dec_counter_for_resize_op(PAGECACHE *pagecache) +{ + struct st_my_thread_var *last_thread; + mysql_mutex_assert_owner(&pagecache->cache_lock); + if (!--pagecache->cnt_for_resize_op && + (last_thread= pagecache->resize_queue.last_thread)) + { + DBUG_PRINT("signal", + ("thread %s %ld", last_thread->next->name, + (ulong) last_thread->next->id)); + pagecache_pthread_cond_signal(&last_thread->next->suspend); + } +} + +/* + Change the page cache parameters + + SYNOPSIS + change_pagecache_param() + pagecache pointer to a page cache data structure + 
division_limit new division limit (if not zero) + age_threshold new age threshold (if not zero) + + RETURN VALUE + none + + NOTES. + Presently the function resets the key cache parameters + concerning midpoint insertion strategy - division_limit and + age_threshold. +*/ + +void change_pagecache_param(PAGECACHE *pagecache, uint division_limit, + uint age_threshold) +{ + DBUG_ENTER("change_pagecache_param"); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (division_limit) + pagecache->min_warm_blocks= (pagecache->disk_blocks * + division_limit / 100 + 1); + if (age_threshold) + pagecache->age_threshold= (pagecache->disk_blocks * + age_threshold / 100); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_VOID_RETURN; +} + + +/* + Check that pagecache was used and cleaned up properly. +*/ + +#ifndef DBUG_OFF +void check_pagecache_is_cleaned_up(PAGECACHE *pagecache) +{ + DBUG_ENTER("check_pagecache_is_cleaned_up"); + /* + Ensure we called inc_counter_for_resize_op and dec_counter_for_resize_op + the same number of times. (If not, a resize() could never happen. + */ + DBUG_ASSERT(pagecache->cnt_for_resize_op == 0); + + if (pagecache->disk_blocks > 0) + { + if (pagecache->block_mem) + { + uint i; + for (i=0 ; i < pagecache->blocks_used ; i++) + { + DBUG_ASSERT(pagecache->block_root[i].status == 0); + DBUG_ASSERT(pagecache->block_root[i].type == PAGECACHE_EMPTY_PAGE); + } + } + } + DBUG_VOID_RETURN; +} +#endif + + +/* + Removes page cache from memory. Does NOT flush pages to disk. 
+ + SYNOPSIS + end_pagecache() + pagecache page cache handle + cleanup Complete free (Free also mutex for key cache) + + RETURN VALUE + none +*/ + +void end_pagecache(PAGECACHE *pagecache, my_bool cleanup) +{ + DBUG_ENTER("end_pagecache"); + DBUG_PRINT("enter", ("key_cache: %p", pagecache)); + + if (!pagecache->inited) + DBUG_VOID_RETURN; + + if (pagecache->disk_blocks > 0) + { +#ifndef DBUG_OFF + check_pagecache_is_cleaned_up(pagecache); +#endif + + if (pagecache->block_mem) + { + my_large_free(pagecache->block_mem, pagecache->mem_size); + pagecache->block_mem= NULL; + my_free(pagecache->block_root); + pagecache->block_root= NULL; + } + pagecache->disk_blocks= -1; + /* Reset blocks_changed to be safe if flush_all_key_blocks is called */ + pagecache->blocks_changed= 0; + } + + DBUG_PRINT("status", ("used: %zu changed: %zu w_requests: %llu " + "writes: %llu r_requests: %llu reads: %llu", + pagecache->blocks_used, + pagecache->global_blocks_changed, + pagecache->global_cache_w_requests, + pagecache->global_cache_write, + pagecache->global_cache_r_requests, + pagecache->global_cache_read)); + + if (cleanup) + { + my_hash_free(&pagecache->files_in_flush); + mysql_mutex_destroy(&pagecache->cache_lock); + pagecache->inited= pagecache->can_be_used= 0; + PAGECACHE_DEBUG_CLOSE; + } + DBUG_VOID_RETURN; +} /* end_pagecache */ + + +/* + Unlink a block from the chain of dirty/clean blocks +*/ + +static inline void unlink_changed(PAGECACHE_BLOCK_LINK *block) +{ + if (block->next_changed) + block->next_changed->prev_changed= block->prev_changed; + *block->prev_changed= block->next_changed; +} + + +/* + Link a block into the chain of dirty/clean blocks +*/ + +static inline void link_changed(PAGECACHE_BLOCK_LINK *block, + PAGECACHE_BLOCK_LINK **phead) +{ + block->prev_changed= phead; + if ((block->next_changed= *phead)) + (*phead)->prev_changed= &block->next_changed; + *phead= block; +} + + +/* + Unlink a block from the chain of dirty/clean blocks, if it's asked for, + and link it 
to the chain of clean blocks for the specified file +*/ + +static void link_to_file_list(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + PAGECACHE_FILE *file, my_bool unlink_flag) +{ + if (unlink_flag) + unlink_changed(block); + link_changed(block, &pagecache->file_blocks[FILE_HASH(*file, pagecache)]); + if (block->status & PCBLOCK_CHANGED) + { + block->status&= ~(PCBLOCK_CHANGED | PCBLOCK_DEL_WRITE); + block->rec_lsn= LSN_MAX; + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + } +} + + +/* + Unlink a block from the chain of clean blocks for the specified + file and link it to the chain of dirty blocks for this file +*/ + +static inline void link_to_changed_list(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block) +{ + unlink_changed(block); + link_changed(block, + &pagecache->changed_blocks[FILE_HASH(block->hash_link->file, pagecache)]); + block->status|=PCBLOCK_CHANGED; + pagecache->blocks_changed++; + pagecache->global_blocks_changed++; +} + + +/* + Link a block to the LRU chain at the beginning or at the end of + one of two parts. + + SYNOPSIS + link_block() + pagecache pointer to a page cache data structure + block pointer to the block to link to the LRU chain + hot <-> to link the block into the hot subchain + at_end <-> to link the block at the end of the subchain + + RETURN VALUE + none + + NOTES. + The LRU chain is represented by a circular list of block structures. + The list is double-linked of the type (**prev,*next) type. + The LRU chain is divided into two parts - hot and warm. + There are two pointers to access the last blocks of these two + parts. The beginning of the warm part follows right after the + end of the hot part. + Only blocks of the warm part can be used for replacement. 
+ The first block from the beginning of this subchain is always + taken for eviction (pagecache->last_used->next) + + LRU chain: +------+ H O T +------+ + +----| end |----...<----| beg |----+ + | +------+last +------+ | + v<-link in latest hot (new end) | + | link in latest warm (new end)->^ + | +------+ W A R M +------+ | + +----| beg |---->...----| end |----+ + +------+ +------+ins + first for eviction +*/ + +static void link_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, + my_bool hot, my_bool at_end) +{ + PAGECACHE_BLOCK_LINK *ins; + PAGECACHE_BLOCK_LINK **ptr_ins; + DBUG_ENTER("link_block"); + + PCBLOCK_INFO(block); + KEYCACHE_DBUG_ASSERT(! (block->hash_link && block->hash_link->requests)); + if (!hot && pagecache->waiting_for_block.last_thread) + { + /* Signal that in the LRU warm sub-chain an available block has appeared */ + struct st_my_thread_var *last_thread= + pagecache->waiting_for_block.last_thread; + struct st_my_thread_var *first_thread= last_thread->next; + struct st_my_thread_var *next_thread= first_thread; + PAGECACHE_HASH_LINK *hash_link= + (PAGECACHE_HASH_LINK *) first_thread->keycache_link; + struct st_my_thread_var *thread; + + DBUG_ASSERT(block->requests + block->wlocks + block->rlocks + + block->pins == 0); + DBUG_ASSERT(block->next_used == NULL); + + do + { + thread= next_thread; + next_thread= thread->next; + /* + We notify about the event all threads that ask + for the same page as the first thread in the queue + */ + if ((PAGECACHE_HASH_LINK *) thread->keycache_link == hash_link) + { + DBUG_PRINT("signal", ("thread: %s %ld", thread->name, + (ulong) thread->id)); + pagecache_pthread_cond_signal(&thread->suspend); + wqueue_unlink_from_queue(&pagecache->waiting_for_block, thread); + block->requests++; + } + } + while (thread != last_thread); + DBUG_PRINT("hash", ("hash_link (link block): %p, hash_link: %p -> %p", + hash_link, hash_link->block, block)); + hash_link->block= block; + /* Ensure that no other thread tries to use this 
block */ + block->status|= PCBLOCK_REASSIGNED; + + DBUG_PRINT("signal", ("after signal")); +#if defined(PAGECACHE_DEBUG) + KEYCACHE_DBUG_PRINT("link_block", + ("linked,unlinked block: %u status: %x #requests: %u #available: %u", + PCBLOCK_NUMBER(pagecache, block), block->status, + block->requests, pagecache->blocks_available)); +#endif + DBUG_VOID_RETURN; + } + ptr_ins= hot ? &pagecache->used_ins : &pagecache->used_last; + ins= *ptr_ins; + if (ins) + { + ins->next_used->prev_used= &block->next_used; + block->next_used= ins->next_used; + block->prev_used= &ins->next_used; + ins->next_used= block; + if (at_end) + *ptr_ins= block; + } + else + { + /* The LRU chain is empty */ + pagecache->used_last= pagecache->used_ins= block->next_used= block; + block->prev_used= &block->next_used; + } + KEYCACHE_THREAD_TRACE("link_block"); +#if defined(PAGECACHE_DEBUG) + pagecache->blocks_available++; + KEYCACHE_DBUG_PRINT("link_block", + ("linked block: %u:%1u status: %x #requests: %u #available: %u", + PCBLOCK_NUMBER(pagecache, block), at_end, block->status, + block->requests, pagecache->blocks_available)); + KEYCACHE_DBUG_ASSERT(pagecache->blocks_available <= + pagecache->blocks_used); +#endif + DBUG_VOID_RETURN; +} + + +/* + Unlink a block from the LRU chain + + SYNOPSIS + unlink_block() + pagecache pointer to a page cache data structure + block pointer to the block to unlink from the LRU chain + + RETURN VALUE + none + + NOTES. 
+ See NOTES for link_block +*/ + +static void unlink_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("unlink_block"); + DBUG_PRINT("pagecache", ("unlink %p", block)); + DBUG_ASSERT(block->next_used != NULL); + if (block->next_used == block) + { + /* The list contains only one member */ + pagecache->used_last= pagecache->used_ins= NULL; + } + else + { + block->next_used->prev_used= block->prev_used; + *block->prev_used= block->next_used; + if (pagecache->used_last == block) + pagecache->used_last= STRUCT_PTR(PAGECACHE_BLOCK_LINK, + next_used, block->prev_used); + if (pagecache->used_ins == block) + pagecache->used_ins= STRUCT_PTR(PAGECACHE_BLOCK_LINK, + next_used, block->prev_used); + } + block->next_used= NULL; + + KEYCACHE_THREAD_TRACE("unlink_block"); +#if defined(PAGECACHE_DEBUG) + KEYCACHE_DBUG_ASSERT(pagecache->blocks_available != 0); + pagecache->blocks_available--; + KEYCACHE_DBUG_PRINT("pagecache", + ("unlinked block: %p (%u) status: %x #requests: %u #available: %u", + block, PCBLOCK_NUMBER(pagecache, block), + block->status, + block->requests, pagecache->blocks_available)); + PCBLOCK_INFO(block); +#endif + DBUG_VOID_RETURN; +} + + +/* + Register requests for a block + + SYNOPSIS + reg_requests() + pagecache this page cache reference + block the block we request reference + count how many requests we register (it is 1 everywhere) + + NOTE + Registration of request means we are going to use this block so we exclude + it from the LRU if it is first request +*/ +static void reg_requests(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, + int count) +{ + DBUG_ENTER("reg_requests"); + PCBLOCK_INFO(block); + if (! 
block->requests) + /* First request for the block unlinks it */ + unlink_block(pagecache, block); + block->requests+= count; + DBUG_VOID_RETURN; +} + + +/* + Unregister request for a block + linking it to the LRU chain if it's the last request + + SYNOPSIS + unreg_request() + pagecache pointer to a page cache data structure + block pointer to the block to link to the LRU chain + at_end <-> to link the block at the end of the LRU chain + + RETURN VALUE + none + + NOTES. + Every linking to the LRU chain decrements by one a special block + counter (if it's positive). If the at_end parameter is TRUE the block is + added either at the end of warm sub-chain or at the end of hot sub-chain. + It is added to the hot subchain if its counter is zero and number of + blocks in warm sub-chain is not less than some low limit (determined by + the division_limit parameter). Otherwise the block is added to the warm + sub-chain. If the at_end parameter is FALSE the block is always added + at beginning of the warm sub-chain. + Thus a warm block can be promoted to the hot sub-chain when its counter + becomes zero for the first time. + At the same time the block at the very beginning of the hot subchain + might be moved to the beginning of the warm subchain if it stays untouched + for a too long time (this time is determined by parameter age_threshold). +*/ + +static void unreg_request(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, int at_end) +{ + DBUG_ENTER("unreg_request"); + DBUG_PRINT("enter", ("block %p (%u) status: %x requests: %u", + block, PCBLOCK_NUMBER(pagecache, block), + block->status, block->requests)); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->requests > 0); + if (! 
--block->requests) + { + my_bool hot; + if (block->hits_left) + block->hits_left--; + hot= !block->hits_left && at_end && + pagecache->warm_blocks > pagecache->min_warm_blocks; + if (hot) + { + if (block->temperature == PCBLOCK_WARM) + pagecache->warm_blocks--; + block->temperature= PCBLOCK_HOT; + KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %zu", + pagecache->warm_blocks)); + } + link_block(pagecache, block, hot, (my_bool)at_end); + block->last_hit_time= pagecache->time; + pagecache->time++; + + block= pagecache->used_ins; + /* Check if we should link a hot block to the warm block */ + if (block && pagecache->time - block->last_hit_time > + pagecache->age_threshold) + { + unlink_block(pagecache, block); + link_block(pagecache, block, 0, 0); + if (block->temperature != PCBLOCK_WARM) + { + pagecache->warm_blocks++; + block->temperature= PCBLOCK_WARM; + } + KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %zu", + pagecache->warm_blocks)); + } + } + DBUG_VOID_RETURN; +} + +/* + Remove a reader of the page in block +*/ + +static inline void remove_reader(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("remove_reader"); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->hash_link->requests > 0); + if (! 
--block->hash_link->requests && block->condvar) + pagecache_pthread_cond_signal(block->condvar); + DBUG_VOID_RETURN; +} + + +/* + Wait until the last reader of the page in block + signals on its termination +*/ + +static inline void wait_for_readers(PAGECACHE *pagecache + __attribute__((unused)), + PAGECACHE_BLOCK_LINK *block + __attribute__((unused))) +{ + struct st_my_thread_var *thread= my_thread_var; + DBUG_ASSERT(block->condvar == NULL); + while (block->hash_link->requests) + { + DBUG_ENTER("wait_for_readers"); + DBUG_PRINT("wait", + ("suspend thread: %s %ld block: %u", + thread->name, (ulong) thread->id, + PCBLOCK_NUMBER(pagecache, block))); + block->condvar= &thread->suspend; + pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock); + block->condvar= NULL; + DBUG_VOID_RETURN; + } +} + + +/* + Wait until the flush of the page is done. +*/ + +static void wait_for_flush(PAGECACHE *pagecache + __attribute__((unused)), + PAGECACHE_BLOCK_LINK *block + __attribute__((unused))) +{ + struct st_my_thread_var *thread= my_thread_var; + DBUG_ENTER("wait_for_flush"); + wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); + do + { + DBUG_PRINT("wait", + ("suspend thread %s %ld", thread->name, (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while(thread->next); + DBUG_VOID_RETURN; +} + + +/* + Add a hash link to a bucket in the hash_table +*/ + +static inline void link_hash(PAGECACHE_HASH_LINK **start, + PAGECACHE_HASH_LINK *hash_link) +{ + if (*start) + (*start)->prev= &hash_link->next; + hash_link->next= *start; + hash_link->prev= start; + *start= hash_link; +} + + +/* + Remove a hash link from the hash table +*/ + +static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link) +{ + DBUG_ENTER("unlink_hash"); + DBUG_PRINT("enter", ("hash_link: %p block: %p fd: %u pos: %lu requests: %u", + hash_link, hash_link->block, (uint) hash_link->file.file, + (ulong) hash_link->pageno, + 
hash_link->requests)); + DBUG_ASSERT(hash_link->requests == 0); + DBUG_ASSERT(!hash_link->block || hash_link->block->pins == 0); + + if ((*hash_link->prev= hash_link->next)) + hash_link->next->prev= hash_link->prev; + + hash_link->block= NULL; + if (pagecache->waiting_for_hash_link.last_thread) + { + /* Signal that a free hash link has appeared */ + struct st_my_thread_var *last_thread= + pagecache->waiting_for_hash_link.last_thread; + struct st_my_thread_var *first_thread= last_thread->next; + struct st_my_thread_var *next_thread= first_thread; + PAGECACHE_PAGE *first_page= (PAGECACHE_PAGE *) (first_thread->keycache_link); + struct st_my_thread_var *thread; + + hash_link->file= first_page->file; + DBUG_ASSERT(first_page->pageno < ((1ULL) << 40)); + hash_link->pageno= first_page->pageno; + do + { + PAGECACHE_PAGE *page; + thread= next_thread; + page= (PAGECACHE_PAGE *) thread->keycache_link; + next_thread= thread->next; + /* + We notify about the event all threads that ask + for the same page as the first thread in the queue + */ + if (page->file.file == hash_link->file.file && + page->pageno == hash_link->pageno) + { + DBUG_PRINT("signal", ("thread %s %ld", thread->name, + (ulong) thread->id)); + pagecache_pthread_cond_signal(&thread->suspend); + wqueue_unlink_from_queue(&pagecache->waiting_for_hash_link, thread); + } + } + while (thread != last_thread); + + /* + Add this to the hash, so that the waiting threads can find it + when they retry the call to get_hash_link(). This entry is special + in that it has no associated block. 
+ */ + link_hash(&pagecache->hash_root[PAGECACHE_HASH(pagecache, + hash_link->file, + hash_link->pageno)], + hash_link); + DBUG_VOID_RETURN; + } + + /* Add hash to free hash list */ + hash_link->next= pagecache->free_hash_list; + pagecache->free_hash_list= hash_link; + DBUG_VOID_RETURN; +} + + +/* + Get the hash link for the page if it is in the cache (do not put the + page in the cache if it is absent there) + + SYNOPSIS + get_present_hash_link() + pagecache Pagecache reference + file file ID + pageno page number in the file + start where to put pointer to found hash bucket (for + direct referring it) + + RETURN + found hashlink pointer +*/ + +static PAGECACHE_HASH_LINK *get_present_hash_link(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + PAGECACHE_HASH_LINK ***start) +{ + reg1 PAGECACHE_HASH_LINK *hash_link; +#if defined(PAGECACHE_DEBUG) + int cnt; +#endif + DBUG_ENTER("get_present_hash_link"); + DBUG_PRINT("enter", ("fd: %u pos: %lu", (uint) file->file, (ulong) pageno)); + + /* + Find the bucket in the hash table for the pair (file, pageno); + start contains the head of the bucket list, + hash_link points to the first member of the list + */ + hash_link= *(*start= &pagecache->hash_root[PAGECACHE_HASH(pagecache, + *file, pageno)]); +#if defined(PAGECACHE_DEBUG) + cnt= 0; +#endif + /* Look for an element for the pair (file, pageno) in the bucket chain */ + while (hash_link && + (hash_link->pageno != pageno || + hash_link->file.file != file->file)) + { + hash_link= hash_link->next; +#if defined(PAGECACHE_DEBUG) + cnt++; + if (! 
(cnt <= pagecache->hash_links_used)) + { + int i; + for (i=0, hash_link= **start ; + i < cnt ; i++, hash_link= hash_link->next) + { + KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u pos: %lu", + (uint) hash_link->file.file, (ulong) hash_link->pageno)); + } + } + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->hash_links_used); +#endif + } + if (hash_link) + { + DBUG_PRINT("exit", ("hash_link: %p", hash_link)); + /* Register the request for the page */ + hash_link->requests++; + } + /* + As soon as the caller will release the page cache's lock, "hash_link" + will be potentially obsolete (unusable) information. + */ + DBUG_RETURN(hash_link); +} + + +/* + Get the hash link for a page +*/ + +static PAGECACHE_HASH_LINK *get_hash_link(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno) +{ + reg1 PAGECACHE_HASH_LINK *hash_link; + PAGECACHE_HASH_LINK **start; + DBUG_ENTER("get_hash_link"); + +restart: + /* try to find the page in the cache */ + hash_link= get_present_hash_link(pagecache, file, pageno, + &start); + if (!hash_link) + { + /* There is no hash link in the hash table for the pair (file, pageno) */ + if (pagecache->free_hash_list) + { + DBUG_PRINT("info", ("free_hash_list: %p free_hash_list->next: %p", + pagecache->free_hash_list, + pagecache->free_hash_list->next)); + hash_link= pagecache->free_hash_list; + pagecache->free_hash_list= hash_link->next; + } + else if (pagecache->hash_links_used < pagecache->hash_links) + { + hash_link= &pagecache->hash_link_root[pagecache->hash_links_used++]; + } + else + { + /* Wait for a free hash link */ + struct st_my_thread_var *thread= my_thread_var; + PAGECACHE_PAGE page; + page.file= *file; + page.pageno= pageno; + thread->keycache_link= (void *) &page; + wqueue_link_into_queue(&pagecache->waiting_for_hash_link, thread); + DBUG_PRINT("wait", + ("suspend thread %s %ld", thread->name, (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + thread->keycache_link= 
NULL; + DBUG_PRINT("thread", ("restarting...")); + goto restart; + } + hash_link->file= *file; + DBUG_ASSERT(pageno < ((1ULL) << 40)); + hash_link->pageno= pageno; + link_hash(start, hash_link); + /* Register the request for the page */ + hash_link->requests++; + DBUG_ASSERT(hash_link->block == 0); + DBUG_ASSERT(hash_link->requests == 1); + } + else + { + /* + We have to copy the flush_log callback, as it may change if the table + goes from non_transactional to transactional during recovery + */ + hash_link->file.flush_log_callback= file->flush_log_callback; + } + DBUG_PRINT("exit", ("hash_link: %p block: %p", hash_link, + hash_link->block)); + DBUG_RETURN(hash_link); +} + + +/* + Get a block for the file page requested by a pagecache read/write operation; + If the page is not in the cache return a free block, if there is none + return the lru block after saving its buffer if the page is dirty. + + SYNOPSIS + + find_block() + pagecache pointer to a page cache data structure + file handler for the file to read page from + pageno number of the page in the file + init_hits_left how initialize the block counter for the page + wrmode <-> get for writing + block_is_copied 1 if block will be copied from page cache under + the pagelock mutex. + reg_req Register request to the page. Normally all pages + should be registered; The only time it's ok to + not register a page is when the page is already + pinned (and thus registered) by the same thread. + page_st out {PAGE_READ,PAGE_TO_BE_READ,PAGE_WAIT_TO_BE_READ} + + RETURN VALUE + Pointer to the found block if successful, 0 - otherwise + + NOTES. + For the page from file positioned at pageno the function checks whether + the page is in the key cache specified by the first parameter. + If this is the case it immediately returns the block. + If not, the function first chooses a block for this page. If there is + no not used blocks in the key cache yet, the function takes the block + at the very beginning of the warm sub-chain. 
It saves the page in that + block if it's dirty before returning the pointer to it. + The function returns in the page_st parameter the following values: + PAGE_READ - if page already in the block, + PAGE_TO_BE_READ - if it is to be read yet by the current thread + WAIT_TO_BE_READ - if it is to be read by another thread + If an error occurs THE PCBLOCK_ERROR bit is set in the block status. + It might happen that there are no blocks in LRU chain (in warm part) - + all blocks are unlinked for some read/write operations. Then the function + waits until first of this operations links any block back. +*/ + +static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + int init_hits_left, + my_bool wrmode, + my_bool block_is_copied, + my_bool reg_req, + my_bool fast, + int *page_st) +{ + PAGECACHE_HASH_LINK *hash_link; + PAGECACHE_BLOCK_LINK *block; + int error= 0; + int page_status; + DBUG_ENTER("find_block"); + DBUG_PRINT("enter", ("fd: %d pos: %lu wrmode: %d block_is_copied: %d", + file->file, (ulong) pageno, wrmode, block_is_copied)); + KEYCACHE_PRINT("find_block", ("fd: %d pos: %lu wrmode: %d", + file->file, (ulong) pageno, + wrmode)); +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "start of find_block", 0);); +#endif + DBUG_ASSERT(!fast || !wrmode); + +restart: + /* Find the hash link for the requested page (file, pageno) */ + hash_link= get_hash_link(pagecache, file, pageno); + + page_status= -1; + if ((block= hash_link->block) && + block->hash_link == hash_link && (block->status & PCBLOCK_READ)) + page_status= PAGE_READ; + + if (wrmode && pagecache->resize_in_flush) + { + /* This is a write request during the flush phase of a resize operation */ + + if (page_status != PAGE_READ) + { + /* We don't need the page in the cache: we are going to write on disk */ + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + unlink_hash(pagecache, 
hash_link); + return 0; + } + if (!(block->status & PCBLOCK_IN_FLUSH)) + { + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + /* + Remove block to invalidate the page in the block buffer + as we are going to write directly on disk. + Although we have an exclusive lock for the updated key part + the control can be yielded by the current thread as we might + have unfinished readers of other key parts in the block + buffer. Still we are guaranteed not to have any readers + of the key part we are writing into until the block is + removed from the cache as we set the PCBLOCK_REASSIGNED + flag (see the code below that handles reading requests). + */ + free_block(pagecache, block, 0); + return 0; + } + /* Wait until the page is flushed on disk */ + DBUG_ASSERT(hash_link->requests > 0); + hash_link->requests--; + wait_for_flush(pagecache, block); + + /* Invalidate page in the block if it has not been done yet */ + DBUG_ASSERT(block->status); /* Should always be true */ + if (block->status) + free_block(pagecache, block, 0); + return 0; + } + + if (page_status == PAGE_READ && + (block->status & (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED))) + { + /* This is a request for a page to be removed from cache */ + + KEYCACHE_DBUG_PRINT("find_block", + ("request for old page in block: %u " + "wrmode: %d block->status: %d", + PCBLOCK_NUMBER(pagecache, block), wrmode, + block->status)); + /* + Only reading requests can proceed until the old dirty page is flushed, + all others are to be suspended, then resubmitted + */ + if (!wrmode && block_is_copied && !(block->status & PCBLOCK_REASSIGNED)) + { + if (reg_req) + reg_requests(pagecache, block, 1); + } + else + { + /* + When we come here either PCBLOCK_REASSIGNED or PCBLOCK_IN_SWITCH are + active. In both cases wqueue_release_queue() is called when the + state changes. 
+ */ + DBUG_ASSERT(block->hash_link == hash_link); + remove_reader(block); + KEYCACHE_DBUG_PRINT("find_block", + ("request waiting for old page to be saved")); + { + struct st_my_thread_var *thread= my_thread_var; + /* Put the request into the queue of those waiting for the old page */ + wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); + /* Wait until the request can be resubmitted */ + do + { + DBUG_PRINT("wait", + ("suspend thread %s %ld", thread->name, + (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while(thread->next); + } + KEYCACHE_DBUG_PRINT("find_block", + ("request for old page resubmitted")); + DBUG_PRINT("info", ("restarting...")); + /* Resubmit the request */ + goto restart; + } + } + else + { + /* This is a request for a new page or for a page not to be removed */ + if (! block) + { + DBUG_PRINT("info", ("request for a new page")); + /* No block is assigned for the page yet */ + if (pagecache->blocks_unused) + { + DBUG_PRINT("info", ("there is never used blocks")); + if (pagecache->free_block_list) + { + /* There is a block in the free list. 
*/ + block= pagecache->free_block_list; + pagecache->free_block_list= block->next_used; + block->next_used= NULL; + } + else + { + /* There are some never used blocks, take first of them */ + block= &pagecache->block_root[pagecache->blocks_used]; + block->buffer= ADD_TO_PTR(pagecache->block_mem, + (pagecache->blocks_used* + pagecache->block_size), + uchar*); + pagecache->blocks_used++; + } + pagecache->blocks_unused--; + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->rlocks_queue == 0); + DBUG_ASSERT(block->pins == 0); + block->status= 0; +#ifdef DBUG_ASSERT_EXISTS + block->type= PAGECACHE_EMPTY_PAGE; +#endif + DBUG_ASSERT(reg_req); + block->requests= 1; + block->temperature= PCBLOCK_COLD; + block->hits_left= init_hits_left; + block->last_hit_time= 0; + block->rec_lsn= LSN_MAX; + link_to_file_list(pagecache, block, file, 0); + DBUG_PRINT("hash", + ("block (no block assigned): %p hash_link: %p -> %p", + block, block->hash_link, hash_link)); + block->hash_link= hash_link; + DBUG_PRINT("hash", + ("hash_link (no block assignment): %p hash_link: %p -> %p", + hash_link, hash_link->block, block)); + hash_link->block= block; + page_status= PAGE_TO_BE_READ; + DBUG_PRINT("info", ("page to be read set for page %p (%u)", + block, PCBLOCK_NUMBER(pagecache, block))); + KEYCACHE_PRINT("find_block", + ("got free or never used block %u", + PCBLOCK_NUMBER(pagecache, block))); + } + else + { + DBUG_PRINT("info", ("there is NOT never used blocks")); + /* There are no never used blocks, use a block from the LRU chain */ + + /* + Ensure that we are going to register the block. + (This should be true as a new block could not have been + pinned by caller). + */ + DBUG_ASSERT(reg_req); + + if (! 
pagecache->used_last) + { + struct st_my_thread_var *thread; + DBUG_PRINT("info", ("there is NOT UNUSED blocks")); + /* + Wait until a new block is added to the LRU chain; + several threads might wait here for the same page, + all of them must get the same block. + + The block is given to us by the next thread executing + link_block(). + */ + if (fast) + { + DBUG_ASSERT(hash_link->requests == 0); + unlink_hash(pagecache, hash_link); + DBUG_PRINT("info", ("fast and no blocks in LRU")); + + KEYCACHE_DBUG_PRINT("find_block", + ("fast and no blocks in LRU")); + DBUG_RETURN(0); + } + + thread= my_thread_var; + thread->keycache_link= (void *) hash_link; + wqueue_link_into_queue(&pagecache->waiting_for_block, thread); + do + { + DBUG_PRINT("wait", + ("suspend thread %s %ld", thread->name, + (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); + thread->keycache_link= NULL; + block= hash_link->block; + /* Ensure that the block is registered */ + DBUG_ASSERT(block->requests >= 1); + } + else + { + DBUG_PRINT("info", ("take a block from LRU")); + /* + Take the first block from the LRU chain + unlinking it from the chain + */ + block= pagecache->used_last->next_used; + if (fast && + ((block->status & (PCBLOCK_IN_FLUSH | PCBLOCK_CHANGED)) || + (block->hash_link && block->hash_link != hash_link && + block->hash_link->requests))) + { + DBUG_ASSERT(hash_link->requests == 0); + unlink_hash(pagecache, hash_link); + DBUG_PRINT("info", ("fast and LRU block is in switch or has " + "readers")); + KEYCACHE_DBUG_PRINT("find_block", + ("fast and LRU block is in switch or has " + "readers")); + DBUG_RETURN (0); + } + if (reg_req) + reg_requests(pagecache, block, 1); + DBUG_PRINT("hash", ("hash_link (LRU): %p, hash_link: %p -> %p", + hash_link, hash_link->block, block)); + hash_link->block= block; + DBUG_ASSERT(block->requests == 1); + } + + PCBLOCK_INFO(block); + + DBUG_ASSERT(block->hash_link == hash_link || + 
!(block->status & PCBLOCK_IN_SWITCH)); + + if (block->hash_link != hash_link && + ! (block->status & PCBLOCK_IN_SWITCH) ) + { + /* If another thread is flushing the block, wait for it. */ + if (block->status & PCBLOCK_IN_FLUSH) + wait_for_flush(pagecache, block); + + /* this is a primary request for a new page */ + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->rlocks_queue == 0); + DBUG_ASSERT(block->pins == 0); + block->status|= PCBLOCK_IN_SWITCH; + + KEYCACHE_DBUG_PRINT("find_block", + ("got block %u for new page", + PCBLOCK_NUMBER(pagecache, block))); + + if (block->status & PCBLOCK_CHANGED) + { + /* The block contains a dirty page - push it out of the cache */ + + KEYCACHE_DBUG_PRINT("find_block", ("block is dirty")); + + /* + The call is thread safe because only the current + thread might change the block->hash_link value + */ + DBUG_ASSERT(block->pins == 0); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + error= pagecache_fwrite(pagecache, + &block->hash_link->file, + block->buffer, + block->hash_link->pageno, + block->type, + pagecache->readwrite_flags); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + pagecache->global_cache_write++; + } + + block->status|= PCBLOCK_REASSIGNED; + if (block->hash_link) + { + /* + Wait until all pending read requests + for this page are executed + (we could have avoided this waiting, if we had read + a page in the cache in a sweep, without yielding control) + */ + wait_for_readers(pagecache, block); + + /* Remove the hash link for this page from the hash table */ + unlink_hash(pagecache, block->hash_link); + + /* All pending requests for this page must be resubmitted */ + if (block->wqueue[COND_FOR_SAVED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); + } + link_to_file_list(pagecache, block, file, + (my_bool)(block->hash_link ? 
1 : 0)); + + DBUG_PRINT("hash", ("block (LRU): %p, hash_link: %p -> %p", + block, block->hash_link, hash_link)); + block->hash_link= hash_link; + PCBLOCK_INFO(block); + block->hits_left= init_hits_left; + block->last_hit_time= 0; + block->status= error ? PCBLOCK_ERROR : 0; + block->error= error ? (int16) my_errno : 0; +#ifdef DBUG_ASSERT_EXISTS + block->type= PAGECACHE_EMPTY_PAGE; + if (error) + my_debug_put_break_here(); +#endif + page_status= PAGE_TO_BE_READ; + DBUG_PRINT("info", ("page to be read set for page %p", block)); + + KEYCACHE_DBUG_ASSERT(block->hash_link->block == block); + KEYCACHE_DBUG_ASSERT(hash_link->block->hash_link == hash_link); + } + else + { + /* This is for secondary requests for a new page only */ + KEYCACHE_DBUG_PRINT("find_block", + ("block->hash_link: %p hash_link: %p " + "block->status: %u", block->hash_link, + hash_link, block->status )); + page_status= (((block->hash_link == hash_link) && + (block->status & PCBLOCK_READ)) ? + PAGE_READ : PAGE_WAIT_TO_BE_READ); + } + } + } + else + { + /* + The block was found in the cache. It's either a already read + block or a block waiting to be read by another thread. + */ + if (reg_req) + reg_requests(pagecache, block, 1); + KEYCACHE_DBUG_PRINT("find_block", + ("block->hash_link: %p hash_link: %p " + "block->status: %u", block->hash_link, + hash_link, block->status )); + /* + block->hash_link != hash_link can only happen when + the block is in PCBLOCK_IN_SWITCH above (is flushed out + to be replaced by another block). The SWITCH code will change + block->hash_link to point to hash_link. + */ + KEYCACHE_DBUG_ASSERT(block->hash_link == hash_link || + block->status & PCBLOCK_IN_SWITCH); + page_status= (((block->hash_link == hash_link) && + (block->status & PCBLOCK_READ)) ? 
+ PAGE_READ : PAGE_WAIT_TO_BE_READ); + } + } + + KEYCACHE_DBUG_ASSERT(page_status != -1); + *page_st= page_status; + DBUG_PRINT("info", + ("block: %p fd: %u pos: %lu block->status: %u page_status: %u", + block, (uint) file->file, + (ulong) pageno, block->status, (uint) page_status)); + KEYCACHE_PRINT("find_block", + ("block: %p fd: %d pos: %lu block->status: %u page_status: %d", + block, file->file, (ulong) pageno, block->status, + page_status)); + +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "end of find_block",0);); +#endif + KEYCACHE_THREAD_TRACE("find_block:end"); + DBUG_RETURN(block); +} + + +static void add_pin(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("add_pin"); + DBUG_PRINT("enter", ("block: %p pins: %u", block, block->pins)); + PCBLOCK_INFO(block); + block->pins++; +#ifndef DBUG_OFF + { + PAGECACHE_PIN_INFO *info= + (PAGECACHE_PIN_INFO *)my_malloc(PSI_INSTRUMENT_ME, sizeof(PAGECACHE_PIN_INFO), MYF(0)); + info->thread= my_thread_var; + info_link(&block->pin_list, info); + } +#endif + DBUG_VOID_RETURN; +} + +static void remove_pin(PAGECACHE_BLOCK_LINK *block, my_bool any +#ifdef DBUG_OFF + __attribute__((unused)) +#endif + ) +{ + DBUG_ENTER("remove_pin"); + DBUG_PRINT("enter", ("block: %p pins: %u any: %d", block, block->pins, + (int)any)); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->pins > 0); + block->pins--; +#ifndef DBUG_OFF + { + PAGECACHE_PIN_INFO *info= info_find(block->pin_list, my_thread_var, any); + DBUG_ASSERT(info != 0); + info_unlink(info); + my_free(info); + } +#endif + DBUG_VOID_RETURN; +} +#ifndef DBUG_OFF +static void info_add_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl) +{ + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *)my_malloc(PSI_INSTRUMENT_ME, sizeof(PAGECACHE_LOCK_INFO), MYF(0)); + info->thread= my_thread_var; + info->write_lock= wl; + info_link((PAGECACHE_PIN_INFO **)&block->lock_list, + (PAGECACHE_PIN_INFO *)info); +} +static void 
info_remove_lock(PAGECACHE_BLOCK_LINK *block) +{ + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list, + my_thread_var, FALSE); + DBUG_ASSERT(info != 0); + info_unlink((PAGECACHE_PIN_INFO *)info); + my_free(info); +} +static void info_change_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl) +{ + PAGECACHE_LOCK_INFO *info= + (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list, + my_thread_var, FALSE); + DBUG_ASSERT(info != 0); + DBUG_ASSERT(info->write_lock != wl); + info->write_lock= wl; +} +#else +#define info_add_lock(B,W) +#define info_remove_lock(B) +#define info_change_lock(B,W) +#endif + + +/** + @brief waiting for lock for read and write lock + + @parem pagecache pointer to a page cache data structure + @parem block the block to work with + @param file file of the block when it was locked + @param pageno page number of the block when it was locked + @param lock_type MY_PTHREAD_LOCK_READ or MY_PTHREAD_LOCK_WRITE + + @retval 0 OK + @retval 1 Can't lock this block, need retry +*/ + +static my_bool pagecache_wait_lock(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + PAGECACHE_FILE file, + pgcache_page_no_t pageno, + uint lock_type) +{ + /* Lock failed we will wait */ + struct st_my_thread_var *thread= my_thread_var; + DBUG_ENTER("pagecache_wait_lock"); + DBUG_PRINT("info", ("fail to lock, waiting... 
%p", block)); + thread->lock_type= lock_type; + wqueue_add_to_queue(&block->wqueue[COND_FOR_WRLOCK], thread); + dec_counter_for_resize_op(pagecache); + do + { + DBUG_PRINT("wait", + ("suspend thread %s %ld", thread->name, (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while(thread->next); + inc_counter_for_resize_op(pagecache); + PCBLOCK_INFO(block); + if ((block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH)) || + !block->hash_link || + file.file != block->hash_link->file.file || + pageno != block->hash_link->pageno) + { + DBUG_PRINT("info", ("the block %p changed => need retry " + "status: %x files %d != %d or pages %lu != %lu", + block, block->status, file.file, + block->hash_link ? block->hash_link->file.file : -1, + (ulong) pageno, + (ulong) (block->hash_link ? block->hash_link->pageno : 0))); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + +/** + @brief Put on the block write lock + + @parem pagecache pointer to a page cache data structure + @parem block the block to work with + + @note We have loose scheme for locking by the same thread: + * Downgrade to read lock if no other locks are taken + * Our scheme of locking allow for the same thread + - the same kind of lock + - taking read lock if write lock present + - downgrading to read lock if still other place the same + thread keep write lock + * But unlock operation number should be the same to lock operation. + * If we try to get read lock having active write locks we put read + locks to queue, and as soon as write lock(s) gone the read locks + from queue came in force. 
+ * If read lock is unlocked earlier then it came to force it + just removed from the queue + + @retval 0 OK + @retval 1 Can't lock this block, need retry +*/ + +static my_bool get_wrlock(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block) +{ + PAGECACHE_FILE file= block->hash_link->file; + pgcache_page_no_t pageno= block->hash_link->pageno; + pthread_t locker= pthread_self(); + DBUG_ENTER("get_wrlock"); + DBUG_PRINT("info", ("the block %p " + "files %d(%d) pages %lu(%lu)", + block, file.file, block->hash_link->file.file, + (ulong) pageno, (ulong) block->hash_link->pageno)); + PCBLOCK_INFO(block); + /* + We assume that the same thread will try write lock on block on which it + has already read lock. + */ + while ((block->wlocks && !pthread_equal(block->write_locker, locker)) || + block->rlocks) + { + /* Lock failed we will wait */ + if (pagecache_wait_lock(pagecache, block, file, pageno, + MY_PTHREAD_LOCK_WRITE)) + DBUG_RETURN(1); + } + /* we are doing it by global cache mutex protection, so it is OK */ + block->wlocks++; + block->write_locker= locker; + DBUG_PRINT("info", ("WR lock set, block %p", block)); + DBUG_RETURN(0); +} + + +/* + @brief Put on the block read lock + + @param pagecache pointer to a page cache data structure + @param block the block to work with + @param user_file Unique handler per handler file. Used to check if + we request many write locks withing the same + statement + + @note see note for get_wrlock(). 
+ + @retvalue 0 OK + @retvalue 1 Can't lock this block, need retry +*/ + +static my_bool get_rdlock(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block) +{ + PAGECACHE_FILE file= block->hash_link->file; + pgcache_page_no_t pageno= block->hash_link->pageno; + pthread_t locker= pthread_self(); + DBUG_ENTER("get_rdlock"); + DBUG_PRINT("info", ("the block %p " + "files %d(%d) pages %lu(%lu)", + block, file.file, block->hash_link->file.file, + (ulong) pageno, (ulong) block->hash_link->pageno)); + PCBLOCK_INFO(block); + while (block->wlocks && !pthread_equal(block->write_locker, locker)) + { + /* Lock failed we will wait */ + if (pagecache_wait_lock(pagecache, block, file, pageno, + MY_PTHREAD_LOCK_READ)) + DBUG_RETURN(1); + } + /* we are doing it by global cache mutex protection, so it is OK */ + if (block->wlocks) + { + DBUG_ASSERT(pthread_equal(block->write_locker, locker)); + block->rlocks_queue++; + DBUG_PRINT("info", ("RD lock put into queue, block %p", block)); + } + else + { + block->rlocks++; + DBUG_PRINT("info", ("RD lock set, block %p", block)); + } + DBUG_RETURN(0); +} + + +/* + @brief Remove write lock from the block + + @param pagecache pointer to a page cache data structure + @param block the block to work with + @param read_lock downgrade to read lock + + @note see note for get_wrlock(). 
+*/ + +static void release_wrlock(PAGECACHE_BLOCK_LINK *block, my_bool read_lock) +{ + DBUG_ENTER("release_wrlock"); + PCBLOCK_INFO(block); + DBUG_ASSERT(block->wlocks > 0); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->pins > 0); + if (read_lock) + block->rlocks_queue++; + if (block->wlocks == 1) + { + block->rlocks= block->rlocks_queue; + block->rlocks_queue= 0; + } + block->wlocks--; + if (block->wlocks > 0) + DBUG_VOID_RETURN; /* Multiple write locked */ + DBUG_PRINT("info", ("WR lock reset, block %p", block)); + /* release all threads waiting for read lock or one waiting for write */ + if (block->wqueue[COND_FOR_WRLOCK].last_thread) + wqueue_release_one_locktype_from_queue(&block->wqueue[COND_FOR_WRLOCK]); + PCBLOCK_INFO(block); + DBUG_VOID_RETURN; +} + +/* + @brief Remove read lock from the block + + @param pagecache pointer to a page cache data structure + @param block the block to work with + + @note see note for get_wrlock(). +*/ + +static void release_rdlock(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("release_wrlock"); + PCBLOCK_INFO(block); + if (block->wlocks) + { + DBUG_ASSERT(pthread_equal(block->write_locker, pthread_self())); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->rlocks_queue > 0); + block->rlocks_queue--; + DBUG_PRINT("info", ("RD lock queue decreased, block %p", block)); + DBUG_VOID_RETURN; + } + DBUG_ASSERT(block->rlocks > 0); + DBUG_ASSERT(block->rlocks_queue == 0); + block->rlocks--; + DBUG_PRINT("info", ("RD lock decreased, block %p", block)); + if (block->rlocks > 0) + DBUG_VOID_RETURN; /* Multiple write locked */ + DBUG_PRINT("info", ("RD lock reset, block %p", block)); + /* release all threads waiting for read lock or one waiting for write */ + if (block->wqueue[COND_FOR_WRLOCK].last_thread) + wqueue_release_one_locktype_from_queue(&block->wqueue[COND_FOR_WRLOCK]); + PCBLOCK_INFO(block); + DBUG_VOID_RETURN; +} + +/** + @brief Try to lock/unlock and pin/unpin the block + + @param pagecache pointer to a page 
cache data structure
  @param block          the block to work with
  @param lock           lock change mode
  @param pin            pin change mode
  @param any            allow unpinning block pinned by any thread; possible
                        only if not locked, see pagecache_unlock_by_link()

  @note On the retry path (a lock could not be taken) the request counter
  of the block's hash link is decremented before returning 1.

  @retval 0 OK
  @retval 1 Try to lock the block failed
*/

static my_bool make_lock_and_pin(PAGECACHE *pagecache,
                                 PAGECACHE_BLOCK_LINK *block,
                                 enum pagecache_page_lock lock,
                                 enum pagecache_page_pin pin,
                                 my_bool any)
{
  DBUG_ENTER("make_lock_and_pin");
  DBUG_PRINT("enter", ("block: %p (%u)  lock: %s  pin: %s any %d",
                       block, PCBLOCK_NUMBER(pagecache, block),
                       page_cache_page_lock_str[lock],
                       page_cache_page_pin_str[pin], (int)any));
  PCBLOCK_INFO(block);

  DBUG_ASSERT(block);
  /* 'any' is only accepted together with LEFT_UNLOCKED + UNPIN */
  DBUG_ASSERT(!any ||
              ((lock == PAGECACHE_LOCK_LEFT_UNLOCKED) &&
               (pin == PAGECACHE_UNPIN)));
  DBUG_ASSERT(block->hash_link->block == block);

  switch (lock) {
  case PAGECACHE_LOCK_WRITE:               /* free  -> write */
    /* Writelock and pin the buffer */
    if (get_wrlock(pagecache, block))
    {
      /* Couldn't lock because block changed status => need retry */
      goto retry;
    }

    /*
      The cache is locked so there is nothing to be afraid of.
      Note that the pin is added here without looking at 'pin'.
    */
    add_pin(block);
    info_add_lock(block, 1);
    break;
  case PAGECACHE_LOCK_WRITE_TO_READ:       /* write -> read  */
  case PAGECACHE_LOCK_WRITE_UNLOCK:        /* write -> free  */
    /* Removes write lock and puts read lock */
    release_wrlock(block, lock == PAGECACHE_LOCK_WRITE_TO_READ);
    /* fall through */
  case PAGECACHE_LOCK_READ_UNLOCK:         /* read  -> free  */
    /* Guarded because the two write cases above fall through to here */
    if (lock == PAGECACHE_LOCK_READ_UNLOCK)
      release_rdlock(block);
    /* fall through */
  case PAGECACHE_LOCK_LEFT_READLOCKED:     /* read  -> read  */
    /* Common tail of all four cases above: optional unpin + bookkeeping */
    if (pin == PAGECACHE_UNPIN)
    {
      remove_pin(block, FALSE);
    }
    if (lock == PAGECACHE_LOCK_WRITE_TO_READ)
    {
      info_change_lock(block, 0);
    }
    else if (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
             lock == PAGECACHE_LOCK_READ_UNLOCK)
    {
      info_remove_lock(block);
    }
    break;
  case PAGECACHE_LOCK_READ:                /* free  -> read  */
    if (get_rdlock(pagecache, block))
    {
      /* Couldn't lock because block changed status => need retry */
      goto retry;
    }

    if (pin == PAGECACHE_PIN)
    {
      /* The cache is locked so there is nothing to be afraid of */
      add_pin(block);
    }
    info_add_lock(block, 0);
    break;
  case PAGECACHE_LOCK_LEFT_UNLOCKED:       /* free  -> free  */
    if (pin == PAGECACHE_UNPIN)
    {
      /* The only place where 'any' is passed on to remove_pin() */
      remove_pin(block, any);
    }
    /* fall through */
  case PAGECACHE_LOCK_LEFT_WRITELOCKED:    /* write -> write */
    break; /* do nothing */
  default:
    DBUG_ASSERT(0); /* Should never happen */
  }

  PCBLOCK_INFO(block);
  DBUG_RETURN(0);
retry:
  DBUG_PRINT("INFO", ("Retry block %p", block));
  PCBLOCK_INFO(block);
  DBUG_ASSERT(block->hash_link->requests > 0);
  block->hash_link->requests--;
  DBUG_RETURN(1);
}


/**
  @brief Reading of a big block in the S3 storage engine.

  @param pagecache      Page cache
  @param block          Block to read

  @note

  Page cache is segmented in logical blocks of size 'block_size'. All
  read request are for blocks of 'block_size'.

  When using a file with 'big blocks', the file is split into a
  header, header size (for index information) and then blocks of
  big_block_size.  The last block may be smaller than big_block_size.
  All 'big blocks' are a multiple of block_size.
  The header is never read into the page cache. It's used to store
  the table definition and status and is only read by open().

  When wanting to read a block, we register a read request for that
  block and for the first block that is part of the big block read. We
  also put a special flag on the first block so that if another thread
  would want to do a big block read, it will wait on signal, and then
  check if the block it requested is now in the page cache. If it's
  not in the cache it will retry.

  After the big block is read, we will put all read block that was not in the
  page cache.
Blocks that where already in page cache will not be touched + and will not be added first in the FIFO. + + The block for which we had a read request is added first in FIFO and + returned. +*/ + +#ifdef WITH_S3_STORAGE_ENGINE +static void read_big_block(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block) +{ + int page_st; + size_t big_block_size_in_pages; + size_t offset; + pgcache_page_no_t page, our_page; + pgcache_page_no_t page_to_read; + PAGECACHE_BLOCK_LINK *block_to_read= NULL; + PAGECACHE_IO_HOOK_ARGS args; + S3_BLOCK data; + DBUG_ENTER("read_big_block"); + DBUG_PRINT("enter", ("read BIG block: %p", block)); + bzero((void*) &data, sizeof(data)); + + DBUG_ASSERT(block->hash_link->file.big_block_size % + pagecache->block_size == 0); + big_block_size_in_pages= + block->hash_link->file.big_block_size / pagecache->block_size; + + our_page= block->hash_link->pageno; + + /* find first page of the big block (page_to_read) */ + page_to_read= ((block->hash_link->pageno - + block->hash_link->file.head_blocks) / + big_block_size_in_pages); + page_to_read= (page_to_read * big_block_size_in_pages + + block->hash_link->file.head_blocks); + if (page_to_read != our_page) + { + block_to_read= find_block(pagecache, &block->hash_link->file, + page_to_read, 1, + FALSE, TRUE /* copy under protection (?)*/, + TRUE /*register*/, FALSE, &page_st); + DBUG_ASSERT(block_to_read == block_to_read->hash_link->block); + + if (block_to_read->status & PCBLOCK_ERROR) + { + /* We get first block with an error so all operation failed */ + DBUG_PRINT("error", ("Got error when reading first page")); + block->status|= PCBLOCK_ERROR; + block->error= block_to_read->error; + remove_reader(block_to_read); + unreg_request(pagecache, block_to_read, 1); + DBUG_VOID_RETURN; + } + if (block_to_read->status & PCBLOCK_BIG_READ) + { + /* + Other thread is reading the big block so we will wait when it will + have read our block for us + */ + struct st_my_thread_var *thread; + /* + Either the page was not yet 
read and there is another thread + doing the read (page_st == PAGE_WAIT_TO_BE_READ) or the page + was just read and there are other threads waiting for the page + but they have not yet unmarked the PCLBOCK_BIG_READ flag + (page_st == PAGE_READ) + */ + DBUG_ASSERT(page_st == PAGE_READ || page_st == PAGE_WAIT_TO_BE_READ); + block->status|= PCBLOCK_BIG_READ; // will be read by other thread + /* + Block read failed because somebody else is reading the first block + (and all other blocks part of this one). + Wait until block is available. + */ + thread= my_thread_var; + /* Put the request into a queue and wait until it can be processed */ + wqueue_add_to_queue(&block_to_read->wqueue[COND_FOR_REQUESTED], thread); + do + { + DBUG_PRINT("wait", + ("suspend thread %s %ld", thread->name, + (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); + // page should be read by other thread + DBUG_ASSERT(block->status & PCBLOCK_READ || + block->status & PCBLOCK_ERROR); + /* + It is possible that other thread already removed the flag (in + case of two threads waiting) but it will not harm to try to + remove it even in that case. 
+ */ + block->status&= ~PCBLOCK_BIG_READ; + // all is read => lets finish nice + DBUG_ASSERT(block_to_read != block); + remove_reader(block_to_read); + unreg_request(pagecache, block_to_read, 1); + DBUG_VOID_RETURN; + } + else + { + // only primary request here, PAGE_WAIT_TO_BE_READ is impossible + DBUG_ASSERT(page_st != PAGE_WAIT_TO_BE_READ); + } + } + else + { + block_to_read= block; + page_st= PAGE_TO_BE_READ; + } + + DBUG_ASSERT(!(block_to_read->status & PCBLOCK_BIG_READ)); + // Mark the first page of a big block + block_to_read->status|= PCBLOCK_BIG_READ; + + // Don't keep cache locked during the possible slow read from s3 + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + // perform read of big block + args.page= NULL; + args.pageno= page_to_read; + args.data= block->hash_link->file.callback_data; + + pagecache->global_cache_read++; + if (pagecache->big_block_read(pagecache, &args, &block->hash_link->file, + &data)) + { + pagecache->big_block_free(&data); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + block_to_read->status|= PCBLOCK_ERROR; + block_to_read->error= (int16) my_errno; + + /* Handle the block that we originally wanted with read */ + block->status|= PCBLOCK_ERROR; + block->error= block_to_read->error; + goto error; + } + + /* + We need to keep the mutex locked while filling pages. 
+ As there is no changed blocks to flush, this operation should + be reasonable fast + */ + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + + /* Copy the first page to the cache */ + if (page_st != PAGE_READ) + { + DBUG_ASSERT(page_st != PAGE_WAIT_TO_BE_READ); + memcpy(block_to_read->buffer, data.str, pagecache->block_size); + block_to_read->status|= PCBLOCK_READ; + } + else + { + DBUG_ASSERT(block_to_read->status & PCBLOCK_READ); + } + /* Signal that all pending requests for this page now can be processed */ + if (block_to_read->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&block_to_read->wqueue[COND_FOR_REQUESTED]); + + /* Copy the rest of the pages */ + for (offset= pagecache->block_size, page= page_to_read + 1; + offset < data.length; + offset+= pagecache->block_size, page++) + { + DBUG_ASSERT(offset + pagecache->block_size <= data.length); + if (page == our_page) + { + DBUG_ASSERT(!(block->status & PCBLOCK_READ)); + memcpy(block->buffer, data.str + offset, pagecache->block_size); + block->status|= PCBLOCK_READ; + } + else + { + PAGECACHE_BLOCK_LINK *bl; + bl= find_block(pagecache, &block->hash_link->file, page, 1, + FALSE, TRUE /* copy under protection (?)*/, + TRUE /*register*/, TRUE /*fast*/, &page_st); + if (!bl) + { + /* + We can not get this page easy. 
+ Maybe we will be lucky with other pages, + also among other pages can be page which waited by other thread + */ + continue; + } + DBUG_ASSERT(bl == bl->hash_link->block); + if ((bl->status & PCBLOCK_ERROR) == 0 && + (page_st == PAGE_TO_BE_READ || // page should be read + (page_st == PAGE_WAIT_TO_BE_READ && + (bl->status & PCBLOCK_BIG_READ)))) // or page waited by other thread + { + memcpy(bl->buffer, data.str + offset, pagecache->block_size); + bl->status|= PCBLOCK_READ; + } + remove_reader(bl); + unreg_request(pagecache, bl, 1); + /* Signal that all pending requests for this page now can be processed */ + if (bl->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&bl->wqueue[COND_FOR_REQUESTED]); + } + } + if (page < our_page) + { + /* we break earlier, but still have to fill page what was requested */ + DBUG_ASSERT(!(block->status & PCBLOCK_READ)); + memcpy(block->buffer, + data.str + ((our_page - page_to_read) * pagecache->block_size), + pagecache->block_size); + block->status|= PCBLOCK_READ; + } + pagecache->big_block_free(&data); + +end: + block_to_read->status&= ~PCBLOCK_BIG_READ; + if (block_to_read != block) + { + /* Unlock the 'first block' in the big read */ + remove_reader(block_to_read); + unreg_request(pagecache, block_to_read, 1); + } + /* Signal that all pending requests for this page now can be processed */ + if (block->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]); + DBUG_VOID_RETURN; + +error: + /* + Read failed. Mark all readers waiting for the a block covered by the + big block that the read failed + */ + for (offset= 0, page= page_to_read + 1; + offset < big_block_size_in_pages; + offset++) + { + if (page != our_page) + { + PAGECACHE_BLOCK_LINK *bl; + bl= find_block(pagecache, &block->hash_link->file, page, 1, + FALSE, TRUE /* copy under protection (?)*/, + TRUE /*register*/, TRUE /*fast*/, &page_st); + if (!bl) + { + /* + We can not get this page easy. 
+ Maybe we will be lucky with other pages, + also among other pages can be page which waited by other thread + */ + continue; + } + DBUG_ASSERT(bl == bl->hash_link->block); + if ((bl->status & PCBLOCK_ERROR) == 0 && + (page_st == PAGE_TO_BE_READ || // page should be read + (page_st == PAGE_WAIT_TO_BE_READ && + (bl->status & PCBLOCK_BIG_READ)))) // or page waited by other thread + { + bl->status|= PCBLOCK_ERROR; + bl->error= (int16) my_errno; + } + remove_reader(bl); + unreg_request(pagecache, bl, 1); + /* Signal that all pending requests for this page now can be processed */ + if (bl->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&bl->wqueue[COND_FOR_REQUESTED]); + } + } + goto end; +} +#endif /* WITH_S3_STORAGE_ENGINE */ + + +/* + Read into a key cache block buffer from disk. + + SYNOPSIS + + read_block() + pagecache pointer to a page cache data structure + block block to which buffer the data is to be read + primary <-> the current thread will read the data + + RETURN VALUE + None + + NOTES. + The function either reads a page data from file to the block buffer, + or waits until another thread reads it. What page to read is determined + by a block parameter - reference to a hash link for this page. + If an error occurs THE PCBLOCK_ERROR bit is set in the block status. + + On entry cache_lock is locked +*/ + +static void read_block(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + my_bool primary) +{ + DBUG_ENTER("read_block"); + DBUG_PRINT("enter", ("read block: %p primary: %d", block, primary)); + if (primary) + { + size_t error; + PAGECACHE_IO_HOOK_ARGS args; + /* + This code is executed only by threads + that submitted primary requests + */ + + pagecache->global_cache_read++; + /* + Page is not in buffer yet, is to be read from disk + Here other threads may step in and register as secondary readers. + They will register in block->wqueue[COND_FOR_REQUESTED]. 
+ */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + args.page= block->buffer; + args.pageno= block->hash_link->pageno; + args.data= block->hash_link->file.callback_data; + error= (*block->hash_link->file.pre_read_hook)(&args); + if (!error) + { + error= pagecache_fread(pagecache, &block->hash_link->file, + args.page, + block->hash_link->pageno, + pagecache->readwrite_flags); + } + error= (*block->hash_link->file.post_read_hook)(error != 0, &args); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (error) + { + DBUG_ASSERT(maria_in_recovery || !maria_assert_if_crashed_table); + block->status|= PCBLOCK_ERROR; + block->error= (int16) my_errno; + my_debug_put_break_here(); + } + else + { + block->status|= PCBLOCK_READ; + } + DBUG_PRINT("read_block", + ("primary request: new page in cache")); + /* Signal that all pending requests for this page now can be processed */ + if (block->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]); + } + else + { + /* + This code is executed only by threads + that submitted secondary requests + */ + + struct st_my_thread_var *thread= my_thread_var; + /* Put the request into a queue and wait until it can be processed */ + wqueue_add_to_queue(&block->wqueue[COND_FOR_REQUESTED], thread); + do + { + DBUG_PRINT("wait", + ("suspend thread %s %ld", thread->name, + (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); + DBUG_PRINT("read_block", + ("secondary request: new page in cache")); + } + DBUG_VOID_RETURN; +} + + +/** + @brief Set LSN on the page to the given one if the given LSN is bigger + + @param pagecache pointer to a page cache data structure + @param lsn LSN to set + @param block block to check and set +*/ + +static void check_and_set_lsn(PAGECACHE *pagecache, + LSN lsn, PAGECACHE_BLOCK_LINK *block) +{ + LSN old; + DBUG_ENTER("check_and_set_lsn"); + /* + In recovery, we can 
_ma_unpin_all_pages() to put a LSN on page, though + page would be PAGECACHE_PLAIN_PAGE (transactionality temporarily disabled + to not log REDOs). + */ + DBUG_ASSERT((block->type == PAGECACHE_LSN_PAGE) || maria_in_recovery); + old= lsn_korr(block->buffer); + DBUG_PRINT("info", ("old lsn: " LSN_FMT " new lsn: " LSN_FMT, + LSN_IN_PARTS(old), LSN_IN_PARTS(lsn))); + if (cmp_translog_addr(lsn, old) > 0) + { + + DBUG_ASSERT(block->type != PAGECACHE_READ_UNKNOWN_PAGE); + lsn_store(block->buffer, lsn); + /* we stored LSN in page so we dirtied it */ + if (!(block->status & PCBLOCK_CHANGED)) + link_to_changed_list(pagecache, block); + } + DBUG_VOID_RETURN; +} + + +/** + @brief Unlock/unpin page and put LSN stamp if it need + + @param pagecache pointer to a page cache data structure + @pagam file handler for the file for the block of data to be read + @param pageno number of the block of data in the file + @param lock lock change + @param pin pin page + @param first_REDO_LSN_for_page do not set it if it is zero + @param lsn if it is not LSN_IMPOSSIBLE (0) and it + is bigger then LSN on the page it will be written on + the page + @param was_changed should be true if the page was write locked with + direct link giving and the page was changed + + @note + Pininig uses requests registration mechanism it works following way: + | beginnig | ending | + | of func. | of func. 
| + ----------------------------+-------------+---------------+ + PAGECACHE_PIN_LEFT_PINNED | - | - | + PAGECACHE_PIN_LEFT_UNPINNED | reg request | unreg request | + PAGECACHE_PIN | reg request | - | + PAGECACHE_UNPIN | - | unreg request | + + +*/ + +void pagecache_unlock(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn, my_bool was_changed) +{ + PAGECACHE_BLOCK_LINK *block; + int page_st; + DBUG_ENTER("pagecache_unlock"); + DBUG_PRINT("enter", ("fd: %u page: %lu %s %s", + (uint) file->file, (ulong) pageno, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + /* we do not allow any lock/pin increasing here */ + DBUG_ASSERT(pin != PAGECACHE_PIN); + DBUG_ASSERT(lock != PAGECACHE_LOCK_READ && lock != PAGECACHE_LOCK_WRITE); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock because want + to unlock. 
+ */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + /* See NOTE for pagecache_unlock about registering requests */ + block= find_block(pagecache, file, pageno, 0, 0, 0, + pin == PAGECACHE_PIN_LEFT_UNPINNED, FALSE, &page_st); + PCBLOCK_INFO(block); + DBUG_ASSERT(block != 0 && page_st == PAGE_READ); + if (first_REDO_LSN_for_page) + { + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK); + DBUG_ASSERT(pin == PAGECACHE_UNPIN); + pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page); + } + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + /* if we lock for write we must link the block to changed blocks */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 || + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_WRITE_TO_READ || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED)); + /* + if was_changed then status should be PCBLOCK_DIRECT_W or marked + as dirty + */ + DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) || + (block->status & PCBLOCK_CHANGED)); + if ((block->status & PCBLOCK_DIRECT_W) && + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_WRITE_TO_READ)) + { + if (!(block->status & PCBLOCK_CHANGED) && was_changed) + link_to_changed_list(pagecache, block); + block->status&= ~PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: %p", block)); + } + + if (make_lock_and_pin(pagecache, block, lock, pin, FALSE)) + { + DBUG_ASSERT(0); /* should not happend */ + } + + remove_reader(block); + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + if (pin != PAGECACHE_PIN_LEFT_PINNED) + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/* + Unpin page + + SYNOPSIS + pagecache_unpin() + pagecache pointer to a page cache data structure + file handler for the file for the block of data to be read + pageno number of the block of data in the file + lsn if it is not LSN_IMPOSSIBLE (0) and it + is bigger then LSN on the page it will be written on + the page +*/ + +void pagecache_unpin(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + LSN lsn) +{ + PAGECACHE_BLOCK_LINK *block; + int page_st; + DBUG_ENTER("pagecache_unpin"); + DBUG_PRINT("enter", ("fd: %u page: %lu", + (uint) file->file, (ulong) pageno)); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock bacause want + aunlock. + */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + /* See NOTE for pagecache_unlock about registering requests */ + block= find_block(pagecache, file, pageno, 0, 0, 0, 0, FALSE, &page_st); + DBUG_ASSERT(block != 0); + DBUG_ASSERT(page_st == PAGE_READ); + /* we can't unpin such page without unlock */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0); + + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + /* + we can just unpin only with keeping read lock because: + a) we can't pin without any lock + b) we can't unpin keeping write lock + */ + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_LEFT_READLOCKED, + PAGECACHE_UNPIN, FALSE)) + DBUG_ASSERT(0); /* should not happend */ + + remove_reader(block); + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. 
+ See NOTE for pagecache_unlock about registering requests + */ + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/** + @brief Unlock/unpin page and put LSN stamp if it need + (uses direct block/page pointer) + + @param pagecache pointer to a page cache data structure + @param link direct link to page (returned by read or write) + @param lock lock change + @param pin pin page + @param first_REDO_LSN_for_page do not set it if it is LSN_IMPOSSIBLE (0) + @param lsn if it is not LSN_IMPOSSIBLE and it is bigger then + LSN on the page it will be written on the page + @param was_changed should be true if the page was write locked with + direct link giving and the page was changed + @param any allow unpinning block pinned by any thread; possible + only if not locked + + @note 'any' is a hack so that _ma_bitmap_unpin_all() is allowed to unpin + non-locked bitmap pages pinned by other threads. Because it always uses + PAGECACHE_LOCK_LEFT_UNLOCKED and PAGECACHE_UNPIN + (see write_changed_bitmap()), the hack is limited to these conditions. +*/ + +void pagecache_unlock_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn, my_bool was_changed, + my_bool any) +{ + DBUG_ENTER("pagecache_unlock_by_link"); + DBUG_PRINT("enter", ("block: %p fd: %u page: %lu changed: %d %s %s", + block, (uint) block->hash_link->file.file, + (ulong) block->hash_link->pageno, was_changed, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + /* + We do not allow any lock/pin increasing here and page can't be + unpinned because we use direct link. 
+ */ + DBUG_ASSERT(pin != PAGECACHE_PIN); + DBUG_ASSERT(pin != PAGECACHE_PIN_LEFT_UNPINNED); + DBUG_ASSERT(lock != PAGECACHE_LOCK_READ); + DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (pin == PAGECACHE_PIN_LEFT_UNPINNED && + lock == PAGECACHE_LOCK_READ_UNLOCK) + { + if (make_lock_and_pin(pagecache, block, lock, pin, FALSE)) + DBUG_ASSERT(0); /* should not happend */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_VOID_RETURN; + } + + /* + As soon as we keep lock cache can be used, and we have lock because want + unlock. + */ + DBUG_ASSERT(pagecache->can_be_used); + + inc_counter_for_resize_op(pagecache); + if (was_changed) + { + if (first_REDO_LSN_for_page != LSN_IMPOSSIBLE) + { + /* + LOCK_READ_UNLOCK is ok here as the page may have first locked + with WRITE lock that was temporarly converted to READ lock before + it's unpinned + */ + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_READ_UNLOCK); + DBUG_ASSERT(pin == PAGECACHE_UNPIN); + pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page); + } + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + /* + Reset error flag. 
Mark also that page is active; This may not have + been the case if there was an error reading the page + */ + block->status= (block->status & ~PCBLOCK_ERROR) | PCBLOCK_READ; + } + + /* if we lock for write we must link the block to changed blocks */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 || + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_WRITE_TO_READ || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED)); + /* + If was_changed then status should be PCBLOCK_DIRECT_W or marked + as dirty + */ + DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) || + (block->status & PCBLOCK_CHANGED)); + if ((block->status & PCBLOCK_DIRECT_W) && + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_WRITE_TO_READ)) + { + if (!(block->status & PCBLOCK_CHANGED) && was_changed) + link_to_changed_list(pagecache, block); + block->status&= ~PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: %p", block)); + } + + if (make_lock_and_pin(pagecache, block, lock, pin, any)) + DBUG_ASSERT(0); /* should not happend */ + + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + if (pin != PAGECACHE_PIN_LEFT_PINNED) + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + + +/* + Unpin page + (uses direct block/page pointer) + + SYNOPSIS + pagecache_unpin_by_link() + pagecache pointer to a page cache data structure + link direct link to page (returned by read or write) + lsn if it is not LSN_IMPOSSIBLE (0) and it + is bigger then LSN on the page it will be written on + the page +*/ + +void pagecache_unpin_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + LSN lsn) +{ + DBUG_ENTER("pagecache_unpin_by_link"); + DBUG_PRINT("enter", ("block: %p fd: %u page: %lu", + block, (uint) block->hash_link->file.file, + (ulong) block->hash_link->pageno)); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* + As soon as we keep lock cache can be used, and we have lock because want + unlock. + */ + DBUG_ASSERT(pagecache->can_be_used); + /* we can't unpin such page without unlock */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0); + + inc_counter_for_resize_op(pagecache); + + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + + /* + We can just unpin only with keeping read lock because: + a) we can't pin without any lock + b) we can't unpin keeping write lock + */ + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_LEFT_READLOCKED, + PAGECACHE_UNPIN, FALSE)) + DBUG_ASSERT(0); /* should not happend */ + + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + unreg_request(pagecache, block, 1); + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + DBUG_VOID_RETURN; +} + +/* description of how to change lock before and after read/write */ +struct rw_lock_change +{ + my_bool need_lock_change; /* need changing of lock at the end */ + enum pagecache_page_lock new_lock; /* lock at the beginning */ + enum pagecache_page_lock unlock_lock; /* lock at the end */ +}; + +/* description of how to change pin before and after read/write */ +struct rw_pin_change +{ + enum pagecache_page_pin new_pin; /* pin status at the beginning */ + enum pagecache_page_pin unlock_pin; /* pin status at the end */ +}; + +/** + Depending on the lock which the user wants in pagecache_read(), we + need to acquire a first type of lock at start of pagecache_read(), and + downgrade it to a second type of lock at end. For example, if user + asked for no lock (PAGECACHE_LOCK_LEFT_UNLOCKED) this translates into + taking first a read lock PAGECACHE_LOCK_READ (to rightfully block on + existing write locks) then read then unlock the lock i.e. change lock + to PAGECACHE_LOCK_READ_UNLOCK (the "1" below tells that a change is + needed). 
+*/ + +static struct rw_lock_change lock_to_read[8]= +{ + { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/ + 1, + PAGECACHE_LOCK_READ, PAGECACHE_LOCK_READ_UNLOCK + }, + { /*PAGECACHE_LOCK_LEFT_READLOCKED*/ + 0, + PAGECACHE_LOCK_LEFT_READLOCKED, PAGECACHE_LOCK_LEFT_READLOCKED + }, + { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/ + 0, + PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_LEFT_WRITELOCKED + }, + { /*PAGECACHE_LOCK_READ*/ + 1, + PAGECACHE_LOCK_READ, PAGECACHE_LOCK_LEFT_READLOCKED + }, + { /*PAGECACHE_LOCK_WRITE*/ + 1, + PAGECACHE_LOCK_WRITE, PAGECACHE_LOCK_LEFT_WRITELOCKED + }, + { /*PAGECACHE_LOCK_READ_UNLOCK*/ + 1, + PAGECACHE_LOCK_LEFT_READLOCKED, PAGECACHE_LOCK_READ_UNLOCK + }, + { /*PAGECACHE_LOCK_WRITE_UNLOCK*/ + 1, + PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_WRITE_UNLOCK + }, + { /*PAGECACHE_LOCK_WRITE_TO_READ*/ + 1, + PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_WRITE_TO_READ + } +}; + +/** + Two sets of pin modes (every as for lock upper but for pinning). The + difference between sets if whether we are going to provide caller with + reference on the block or not +*/ + +static struct rw_pin_change lock_to_pin[2][8]= +{ + { + { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_LEFT_READLOCKED*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED, + }, + { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_READ*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_WRITE*/ + PAGECACHE_PIN, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_READ_UNLOCK*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_WRITE_UNLOCK*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_UNPIN + }, + { /*PAGECACHE_LOCK_WRITE_TO_READ*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_UNPIN + } + }, + { + { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/ + PAGECACHE_PIN_LEFT_UNPINNED, + 
PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_LEFT_READLOCKED*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED, + }, + { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_READ*/ + PAGECACHE_PIN, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_WRITE*/ + PAGECACHE_PIN, + PAGECACHE_PIN_LEFT_PINNED + }, + { /*PAGECACHE_LOCK_READ_UNLOCK*/ + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_PIN_LEFT_UNPINNED + }, + { /*PAGECACHE_LOCK_WRITE_UNLOCK*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_UNPIN + }, + { /*PAGECACHE_LOCK_WRITE_TO_READ*/ + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_PIN_LEFT_PINNED, + } + } +}; + + +/* + @brief Read a block of data from a cached file into a buffer; + + @param pagecache pointer to a page cache data structure + @param file handler for the file for the block of data to be read + @param pageno number of the block of data in the file + @param level determines the weight of the data + @param buff buffer to where the data must be placed + @param type type of the page + @param lock lock change + @param link link to the page if we pin it + + @return address from where the data is placed if successful, 0 - otherwise. + + @note Pin will be chosen according to lock parameter (see lock_to_pin) + + @note 'buff', if not NULL, must be long-aligned. + + @note If buff==0 then we provide reference on the page so should keep the + page pinned. 
+*/ + +uchar *pagecache_read(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + PAGECACHE_BLOCK_LINK **page_link) +{ + my_bool error= 0; + enum pagecache_page_pin + new_pin= lock_to_pin[buff==0][lock].new_pin, + unlock_pin= lock_to_pin[buff==0][lock].unlock_pin; + PAGECACHE_BLOCK_LINK *fake_link; + my_bool reg_request; +#ifdef DBUG_TRACE + char llbuf[22]; +#endif + DBUG_ENTER("pagecache_read"); + DBUG_PRINT("enter", ("fd: %u page: %s buffer: %p level: %u " + "t:%s (%d)%s->%s %s->%s big block: %d", + (uint) file->file, ullstr(pageno, llbuf), + buff, level, + page_cache_page_type_str[type], + lock_to_read[lock].need_lock_change, + page_cache_page_lock_str[lock_to_read[lock].new_lock], + page_cache_page_lock_str[lock_to_read[lock].unlock_lock], + page_cache_page_pin_str[new_pin], + page_cache_page_pin_str[unlock_pin], + MY_TEST(pagecache->big_block_read))); + DBUG_ASSERT(buff != 0 || (buff == 0 && (unlock_pin == PAGECACHE_PIN || + unlock_pin == PAGECACHE_PIN_LEFT_PINNED))); + DBUG_ASSERT(pageno < ((1ULL) << 40)); + + if (!page_link) + page_link= &fake_link; + *page_link= 0; /* Catch errors */ + +restart: + + /* + If we use big block than the big block is multiple of blocks and we + have enouch blocks in cache + */ + DBUG_ASSERT(!pagecache->big_block_read || + (file->big_block_size != 0 && + file->big_block_size % pagecache->block_size == 0)); + + if (pagecache->can_be_used) + { + /* Key cache is used */ + PAGECACHE_BLOCK_LINK *block; + uint status; + int UNINIT_VAR(page_st); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (!pagecache->can_be_used) + { + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + goto no_key_cache; + } + + inc_counter_for_resize_op(pagecache); + pagecache->global_cache_r_requests++; + /* See NOTE for pagecache_unlock about registering requests. 
*/ + reg_request= ((new_pin == PAGECACHE_PIN_LEFT_UNPINNED) || + (new_pin == PAGECACHE_PIN) || + pagecache->big_block_read); + block= find_block(pagecache, file, pageno, level, + lock == PAGECACHE_LOCK_WRITE, buff != 0, + reg_request, FALSE, &page_st); + DBUG_PRINT("info", ("Block type: %s current type %s", + page_cache_page_type_str[block->type], + page_cache_page_type_str[type])); + if (((block->status & PCBLOCK_ERROR) == 0) && (page_st != PAGE_READ)) + { +#ifdef WITH_S3_STORAGE_ENGINE + if (!pagecache->big_block_read || page_st == PAGE_WAIT_TO_BE_READ) +#endif /* WITH_S3_STORAGE_ENGINE */ + { + /* The requested page is to be read into the block buffer */ + read_block(pagecache, block, page_st == PAGE_TO_BE_READ); + DBUG_PRINT("info", ("read is done")); + } +#ifdef WITH_S3_STORAGE_ENGINE + else + { + /* It is big read and this thread should read */ + DBUG_ASSERT(page_st == PAGE_TO_BE_READ); + + read_big_block(pagecache, block); + + if (!((new_pin == PAGECACHE_PIN_LEFT_UNPINNED) || + (new_pin == PAGECACHE_PIN))) + { + /* we registered request only for big_block_read */ + unreg_request(pagecache, block, 1); + } + } +#endif /* WITH_S3_STORAGE_ENGINE */ + } + /* + Assert after block is read. Imagine two concurrent SELECTs on same + table (thread1 and 2), which want to pagecache_read() the same + pageno/fileno. Thread1 calls find_block(), decides to evict a dirty + page from LRU; while it's writing this dirty page to disk, it is + pre-empted and thread2 runs its find_block(), gets the block (in + PAGE_TO_BE_READ state). This block is still containing the in-eviction + dirty page so has an its type, which cannot be tested. + So thread2 has to wait for read_block() to finish (when it wakes up in + read_block(), it's woken up by read_block() of thread1, which implies + that block's type was set to EMPTY by thread1 as part of find_block()). 
+ */ + DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE || + block->type == type || + type == PAGECACHE_LSN_PAGE || + type == PAGECACHE_READ_UNKNOWN_PAGE || + block->type == PAGECACHE_READ_UNKNOWN_PAGE); + if (type != PAGECACHE_READ_UNKNOWN_PAGE || + block->type == PAGECACHE_EMPTY_PAGE) + block->type= type; + + if (make_lock_and_pin(pagecache, block, lock_to_read[lock].new_lock, + new_pin, FALSE)) + { + /* + We failed to write lock the block, cache is unlocked, + we will try to get the block again. + */ + if (reg_request) + unreg_request(pagecache, block, 1); + dec_counter_for_resize_op(pagecache); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + + status= block->status; + if (!buff) + { + buff= block->buffer; + /* possibly we will write here (resolved on unlock) */ + if ((lock == PAGECACHE_LOCK_WRITE || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED)) + { + block->status|= PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Set PCBLOCK_DIRECT_W for block: %p", block)); + } + } + else + { + if (status & PCBLOCK_READ) + { +#if !defined(SERIALIZED_READ_FROM_CACHE) + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); +#endif + + DBUG_ASSERT((pagecache->block_size & 511) == 0); + /* Copy data from the cache buffer */ + memcpy(buff, block->buffer, pagecache->block_size); + +#if !defined(SERIALIZED_READ_FROM_CACHE) + pagecache_pthread_mutex_lock(&pagecache->cache_lock); +#endif + } + } + + remove_reader(block); + if (lock_to_read[lock].need_lock_change) + { + if (make_lock_and_pin(pagecache, block, + lock_to_read[lock].unlock_lock, + unlock_pin, FALSE)) + { + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_ASSERT(0); + return (uchar*) 0; + } + } + /* + Link the block into the LRU chain if it's the last submitted request + for the block and block will not be pinned. + See NOTE for pagecache_unlock about registering requests. 
+ */ + if (unlock_pin == PAGECACHE_PIN_LEFT_UNPINNED || + unlock_pin == PAGECACHE_UNPIN) + unreg_request(pagecache, block, 1); + else + *page_link= block; + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + if (status & PCBLOCK_ERROR) + { + my_errno= block->error; + DBUG_ASSERT(my_errno != 0); + DBUG_PRINT("error", ("Got error %d when doing page read", my_errno)); + DBUG_RETURN((uchar *) 0); + } + + DBUG_RETURN(buff); + } + +no_key_cache: /* Key cache is not used */ + + /* We can't use mutex here as the key cache may not be initialized */ + pagecache->global_cache_r_requests++; + pagecache->global_cache_read++; + + { + PAGECACHE_IO_HOOK_ARGS args; + args.page= buff; + args.pageno= pageno; + args.data= file->callback_data; + error= (* file->pre_read_hook)(&args); + if (!error) + { + error= pagecache_fread(pagecache, file, args.page, pageno, + pagecache->readwrite_flags) != 0; + } + error= (* file->post_read_hook)(error, &args); + } + + DBUG_RETURN(error ? 
(uchar*) 0 : buff); +} + + +/* + @brief Set/reset flag that page always should be flushed on delete + + @param pagecache pointer to a page cache data structure + @param link direct link to page (returned by read or write) + @param write write on delete flag value + +*/ + +void pagecache_set_write_on_delete_by_link(PAGECACHE_BLOCK_LINK *block) +{ + DBUG_ENTER("pagecache_set_write_on_delete_by_link"); + DBUG_PRINT("enter", ("fd: %d block %p %d -> TRUE", + block->hash_link->file.file, + block, (int) block->status & PCBLOCK_DEL_WRITE)); + DBUG_ASSERT(block->pins); /* should be pinned */ + DBUG_ASSERT(block->wlocks); /* should be write locked */ + + block->status|= PCBLOCK_DEL_WRITE; + + DBUG_VOID_RETURN; +} + + +/* + @brief Delete page from the buffer (common part for link and file/page) + + @param pagecache pointer to a page cache data structure + @param block direct link to page (returned by read or write) + @param page_link hash link of the block + @param flush flush page if it is dirty + + @retval 0 deleted or was not present at all + @retval 1 error +*/ + +static my_bool pagecache_delete_internal(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + PAGECACHE_HASH_LINK *page_link, + my_bool flush) +{ + my_bool error= 0; + if (block->status & PCBLOCK_IN_FLUSH) + { + /* + this call is just 'hint' for the cache to free the page so we will + not interferes with flushing process but must return success + */ + goto out; + } + if (block->status & PCBLOCK_CHANGED) + { + flush= (flush || (block->status & PCBLOCK_DEL_WRITE)); + if (flush) + { + /* The block contains a dirty page - push it out of the cache */ + + KEYCACHE_DBUG_PRINT("find_block", ("block is dirty")); + + /* + The call is thread safe because only the current + thread might change the block->hash_link value + */ + DBUG_ASSERT(block->pins == 1); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + error= pagecache_fwrite(pagecache, + &block->hash_link->file, + block->buffer, + block->hash_link->pageno, 
+ block->type, + pagecache->readwrite_flags); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + pagecache->global_cache_write++; + + if (error) + { + block->status|= PCBLOCK_ERROR; + block->error= (int16) my_errno; + my_debug_put_break_here(); + goto out; + } + } + else + { + PAGECACHE_IO_HOOK_ARGS args; + PAGECACHE_FILE *filedesc= &block->hash_link->file; + args.page= block->buffer; + args.pageno= block->hash_link->pageno; + args.data= filedesc->callback_data; + /* We are not going to write the page but have to call callbacks */ + DBUG_PRINT("info", ("flush_callback: %p data: %p", + filedesc->flush_log_callback, + filedesc->callback_data)); + if ((*filedesc->flush_log_callback)(&args)) + { + DBUG_PRINT("error", ("flush or write callback problem")); + error= 1; + goto out; + } + } + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + /* + free_block() will change the status and rec_lsn of the block so no + need to change them here. + */ + } + /* Cache is locked, so we can relese page before freeing it */ + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, FALSE)) + DBUG_ASSERT(0); + DBUG_ASSERT(block->hash_link->requests > 0); + page_link->requests--; + /* See NOTE for pagecache_unlock() about registering requests. 
*/ + free_block(pagecache, block, 0); + dec_counter_for_resize_op(pagecache); + return 0; + +out: + /* Cache is locked, so we can relese page before freeing it */ + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, FALSE)) + DBUG_ASSERT(0); + page_link->requests--; + unreg_request(pagecache, block, 1); + dec_counter_for_resize_op(pagecache); + return error; +} + + +/* + @brief Delete page from the buffer by link + + @param pagecache pointer to a page cache data structure + @param link direct link to page (returned by read or write) + @param lock lock change + @param flush flush page if it is dirty + + @retval 0 deleted or was not present at all + @retval 1 error + + @note lock can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was + write locked before) or PAGECACHE_LOCK_WRITE (delete will write + lock page before delete) +*/ + +my_bool pagecache_delete_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + my_bool flush) +{ + my_bool error= 0; + enum pagecache_page_pin pin= PAGECACHE_PIN_LEFT_PINNED; + DBUG_ENTER("pagecache_delete_by_link"); + DBUG_PRINT("enter", ("fd: %d block %p %s %s", + block->hash_link->file.file, + block, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED); + DBUG_ASSERT(block->pins != 0); /* should be pinned */ + + if (pagecache->can_be_used) + { + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (!pagecache->can_be_used) + goto end; + + /* + This block should be pinned (i.e. has not zero request counter) => + Such block can't be chosen for eviction. 
+ */ + DBUG_ASSERT((block->status & + (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED)) == 0); + + /* This lock is deleted in pagecache_delete_internal() called below */ + inc_counter_for_resize_op(pagecache); + /* + make_lock_and_pin() can't fail here, because we are keeping pin on the + block and it can't be evicted (which is cause of lock fail and retry) + */ + if (make_lock_and_pin(pagecache, block, lock, pin, FALSE)) + DBUG_ASSERT(0); + + /* + get_present_hash_link() side effect emulation before call + pagecache_delete_internal() + */ + block->hash_link->requests++; + + error= pagecache_delete_internal(pagecache, block, block->hash_link, + flush); +end: + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + } + + DBUG_RETURN(error); +} + + +/** + @brief Returns "hits" for promotion + + @return "hits" for promotion +*/ + +uint pagecache_pagelevel(PAGECACHE_BLOCK_LINK *block) +{ + return block->hits_left; +} + +/* + @brief Adds "hits" to the page + + @param link direct link to page (returned by read or write) + @param level number of "hits" which we add to the page +*/ + +void pagecache_add_level_by_link(PAGECACHE_BLOCK_LINK *block, + uint level) +{ + DBUG_ASSERT(block->pins != 0); /* should be pinned */ + /* + Operation is just for statistics so it is not really important + if it interfere with other hit increasing => we are doing it without + locking the pagecache. 
+ */ + block->hits_left+= level; +} + +/* + @brief Delete page from the buffer + + @param pagecache pointer to a page cache data structure + @param file handler for the file for the block of data to be read + @param pageno number of the block of data in the file + @param lock lock change + @param flush flush page if it is dirty + + @retval 0 deleted or was not present at all + @retval 1 error + + @note lock can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was + write locked before) or PAGECACHE_LOCK_WRITE (delete will write + lock page before delete) +*/ +static enum pagecache_page_pin lock_to_pin_one_phase[8]= +{ + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_UNLOCKED*/, + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_READLOCKED*/, + PAGECACHE_PIN_LEFT_PINNED /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/, + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ*/, + PAGECACHE_PIN /*PAGECACHE_LOCK_WRITE*/, + PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ_UNLOCK*/, + PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_UNLOCK*/, + PAGECACHE_UNPIN /*PAGECACHE_LOCK_WRITE_TO_READ*/ +}; + +my_bool pagecache_delete(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + my_bool flush) +{ + my_bool error= 0; + enum pagecache_page_pin pin= lock_to_pin_one_phase[lock]; + DBUG_ENTER("pagecache_delete"); + DBUG_PRINT("enter", ("fd: %u page: %lu %s %s", + (uint) file->file, (ulong) pageno, + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin])); + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED); + DBUG_ASSERT(pin == PAGECACHE_PIN || + pin == PAGECACHE_PIN_LEFT_PINNED); +restart: + + DBUG_ASSERT(pageno < ((1ULL) << 40)); + if (pagecache->can_be_used) + { + /* Key cache is used */ + reg1 PAGECACHE_BLOCK_LINK *block; + PAGECACHE_HASH_LINK **unused_start, *page_link; + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (!pagecache->can_be_used) + goto end; + + 
inc_counter_for_resize_op(pagecache); + page_link= get_present_hash_link(pagecache, file, pageno, &unused_start); + if (!page_link) + { + DBUG_PRINT("info", ("There is no such page in the cache")); + dec_counter_for_resize_op(pagecache); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(0); + } + block= page_link->block; + if (block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH)) + { + DBUG_PRINT("info", ("Block %p already is %s", + block, + ((block->status & PCBLOCK_REASSIGNED) ? + "reassigned" : "in switch"))); + PCBLOCK_INFO(block); + page_link->requests--; + dec_counter_for_resize_op(pagecache); + goto end; + } + /* See NOTE for pagecache_unlock about registering requests. */ + if (pin == PAGECACHE_PIN) + reg_requests(pagecache, block, 1); + if (make_lock_and_pin(pagecache, block, lock, pin, FALSE)) + { + /* + We failed to writelock the block, cache is unlocked, and last write + lock is released, we will try to get the block again. + */ + if (pin == PAGECACHE_PIN) + unreg_request(pagecache, block, 1); + dec_counter_for_resize_op(pagecache); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + dec_counter_for_resize_op(pagecache); + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + + /* we can't delete with opened direct link for write */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0); + + error= pagecache_delete_internal(pagecache, block, page_link, flush); +end: + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + } + + DBUG_RETURN(error); +} + + +my_bool pagecache_delete_pages(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint page_count, + enum pagecache_page_lock lock, + my_bool flush) +{ + pgcache_page_no_t page_end; + DBUG_ENTER("pagecache_delete_pages"); + DBUG_ASSERT(page_count > 0); + + page_end= pageno + page_count; + do + { + if (pagecache_delete(pagecache, file, pageno, + lock, flush)) + DBUG_RETURN(1); + } while (++pageno != page_end); + DBUG_RETURN(0); +} + + 
+/** + @brief Writes a buffer into a cached file. + + @param pagecache pointer to a page cache data structure + @param file handler for the file to write data to + @param pageno number of the block of data in the file + @param level determines the weight of the data + @param buff buffer with the data + @param type type of the page + @param lock lock change + @param pin pin page + @param write_mode how to write page + @param link link to the page if we pin it + @param first_REDO_LSN_for_page the lsn to set rec_lsn + @param offset offset in the page + @param size size of data + @param validator read page validator + @param validator_data the validator data + + @retval 0 if a success. + @retval 1 Error. +*/ + +static struct rw_lock_change write_lock_change_table[]= +{ + {1, + PAGECACHE_LOCK_WRITE, + PAGECACHE_LOCK_WRITE_UNLOCK} /*PAGECACHE_LOCK_LEFT_UNLOCKED*/, + {0, /*unsupported (we can't write having the block read locked) */ + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_LEFT_READLOCKED*/, + {0, PAGECACHE_LOCK_LEFT_WRITELOCKED, 0} /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/, + {1, + PAGECACHE_LOCK_WRITE, + PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_READ*/, + {0, PAGECACHE_LOCK_WRITE, 0} /*PAGECACHE_LOCK_WRITE*/, + {0, /*unsupported (we can't write having the block read locked) */ + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_READ_UNLOCK*/, + {1, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_LOCK_WRITE_UNLOCK } /*PAGECACHE_LOCK_WRITE_UNLOCK*/, + {1, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_WRITE_TO_READ*/ +}; + + +static struct rw_pin_change write_pin_change_table[]= +{ + {PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN_LEFT_PINNED*/, + {PAGECACHE_PIN, + PAGECACHE_UNPIN} /*PAGECACHE_PIN_LEFT_UNPINNED*/, + {PAGECACHE_PIN, + PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN*/, + {PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_UNPIN} 
/*PAGECACHE_UNPIN*/ +}; + + +/** + @note 'buff', if not NULL, must be long-aligned. +*/ + +my_bool pagecache_write_part(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + enum pagecache_write_mode write_mode, + PAGECACHE_BLOCK_LINK **page_link, + LSN first_REDO_LSN_for_page, + uint offset, uint size) +{ + PAGECACHE_BLOCK_LINK *block= NULL; + PAGECACHE_BLOCK_LINK *fake_link; + my_bool error= 0; + int need_lock_change= write_lock_change_table[lock].need_lock_change; + my_bool reg_request; +#ifdef DBUG_TRACE + char llbuf[22]; +#endif + DBUG_ENTER("pagecache_write_part"); + DBUG_PRINT("enter", ("fd: %u page: %s level: %u type: %s lock: %s " + "pin: %s mode: %s offset: %u size %u", + (uint) file->file, ullstr(pageno, llbuf), level, + page_cache_page_type_str[type], + page_cache_page_lock_str[lock], + page_cache_page_pin_str[pin], + page_cache_page_write_mode_str[write_mode], + offset, size)); + DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE); + DBUG_ASSERT(lock != PAGECACHE_LOCK_LEFT_READLOCKED); + DBUG_ASSERT(lock != PAGECACHE_LOCK_READ_UNLOCK); + DBUG_ASSERT(offset + size <= pagecache->block_size); + DBUG_ASSERT(pageno < ((1ULL) << 40)); + DBUG_ASSERT(pagecache->big_block_read == 0); + + if (!page_link) + page_link= &fake_link; + *page_link= 0; + +restart: + +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "start of key_cache_write", 1);); +#endif + + if (pagecache->can_be_used) + { + /* Key cache is used */ + int page_st; + my_bool need_page_ready_signal= FALSE; + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + if (!pagecache->can_be_used) + { + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + goto no_key_cache; + } + + inc_counter_for_resize_op(pagecache); + pagecache->global_cache_w_requests++; + /* + Here we register a request if the 
page was not already pinned. + See NOTE for pagecache_unlock about registering requests. + */ + reg_request= ((pin == PAGECACHE_PIN_LEFT_UNPINNED) || + (pin == PAGECACHE_PIN)); + block= find_block(pagecache, file, pageno, level, + TRUE, FALSE, + reg_request, FALSE, &page_st); + if (!block) + { + DBUG_ASSERT(write_mode != PAGECACHE_WRITE_DONE); + /* It happens only for requests submitted during resize operation */ + dec_counter_for_resize_op(pagecache); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* Write to the disk key cache is in resize at the moment*/ + goto no_key_cache; + } + DBUG_PRINT("info", ("page status: %d", page_st)); + if (!(block->status & PCBLOCK_ERROR) && + ((page_st == PAGE_TO_BE_READ && + (offset || size < pagecache->block_size)) || + (page_st == PAGE_WAIT_TO_BE_READ))) + { + /* The requested page is to be read into the block buffer */ + read_block(pagecache, block, + (my_bool)(page_st == PAGE_TO_BE_READ)); + DBUG_PRINT("info", ("read is done")); + } + else if (page_st == PAGE_TO_BE_READ) + { + need_page_ready_signal= TRUE; + } + + DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE || + block->type == PAGECACHE_READ_UNKNOWN_PAGE || + block->type == type || + /* this is for when going to non-trans to trans */ + (block->type == PAGECACHE_PLAIN_PAGE && + type == PAGECACHE_LSN_PAGE)); + block->type= type; + /* we write to the page so it has no sense to keep the flag */ + block->status&= ~PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: %p", block)); + + if (make_lock_and_pin(pagecache, block, + write_lock_change_table[lock].new_lock, + (need_lock_change ? + write_pin_change_table[pin].new_pin : + pin), FALSE)) + { + /* + We failed to writelock the block, cache is unlocked, and last write + lock is released, we will try to get the block again. 
+ */ + if (reg_request) + unreg_request(pagecache, block, 1); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_PRINT("info", ("restarting...")); + goto restart; + } + + if (write_mode == PAGECACHE_WRITE_DONE) + { + if (block->status & PCBLOCK_ERROR) + { + my_debug_put_break_here(); + DBUG_PRINT("warning", ("Writing on page with error")); + } + else + { + /* Copy data from buff */ + memcpy(block->buffer + offset, buff, size); + block->status= PCBLOCK_READ; + KEYCACHE_DBUG_PRINT("key_cache_insert", + ("Page injection")); + /* Signal that all pending requests for this now can be processed. */ + if (block->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]); + } + } + else + { + if (! (block->status & PCBLOCK_CHANGED)) + link_to_changed_list(pagecache, block); + + memcpy(block->buffer + offset, buff, size); + block->status|= PCBLOCK_READ; + /* Page is correct again if we made a full write in it */ + if (size == pagecache->block_size) + block->status&= ~PCBLOCK_ERROR; + } + + if (need_page_ready_signal && + block->wqueue[COND_FOR_REQUESTED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]); + + if (first_REDO_LSN_for_page) + { + /* single write action of the last write action */ + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_LEFT_UNLOCKED); + DBUG_ASSERT(pin == PAGECACHE_UNPIN || + pin == PAGECACHE_PIN_LEFT_UNPINNED); + pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page); + } + + if (need_lock_change) + { + /* + We don't set rec_lsn of the block; this is ok as for the + Maria-block-record's pages, we always keep pages pinned here. 
+ */ + if (make_lock_and_pin(pagecache, block, + write_lock_change_table[lock].unlock_lock, + write_pin_change_table[pin].unlock_pin, FALSE)) + DBUG_ASSERT(0); + } + + /* Unregister the request */ + DBUG_ASSERT(block->hash_link->requests > 0); + block->hash_link->requests--; + /* See NOTE for pagecache_unlock about registering requests. */ + if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN) + { + unreg_request(pagecache, block, 1); + DBUG_ASSERT(page_link == &fake_link); + } + else + *page_link= block; + + if (block->status & PCBLOCK_ERROR) + { + error= 1; + my_debug_put_break_here(); + } + + dec_counter_for_resize_op(pagecache); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + + goto end; + } + +no_key_cache: + /* + We can't by pass the normal page cache operations because need + whole page for calling callbacks & so on. + This branch should not be used for now (but it is fixed as it + should be just to avoid confusing) + */ + DBUG_ASSERT(0); + /* Key cache is not used */ + if (write_mode == PAGECACHE_WRITE_DELAY) + { + /* We can't use mutex here as the key cache may not be initialized */ + pagecache->global_cache_w_requests++; + pagecache->global_cache_write++; + if (offset != 0 || size != pagecache->block_size) + { + uchar *page_buffer= (uchar *) alloca(pagecache->block_size); + PAGECACHE_IO_HOOK_ARGS args; + args.page= page_buffer; + args.pageno= pageno; + args.data= file->callback_data; + + pagecache->global_cache_read++; + error= (*file->pre_read_hook)(&args); + if (!error) + { + error= pagecache_fread(pagecache, file, + page_buffer, + pageno, + pagecache->readwrite_flags) != 0; + } + if ((*file->post_read_hook)(error, &args)) + { + DBUG_PRINT("error", ("read callback problem")); + error= 1; + goto end; + } + memcpy((char *)page_buffer + offset, buff, size); + buff= page_buffer; + } + if (pagecache_fwrite(pagecache, file, buff, pageno, type, + pagecache->readwrite_flags)) + error= 1; + } + +end: +#if !defined(DBUG_OFF) && 
defined(EXTRA_DEBUG) + DBUG_EXECUTE("exec", + test_key_cache(pagecache, "end of key_cache_write", 1);); +#endif + if (block) + PCBLOCK_INFO(block); + else + DBUG_PRINT("info", ("No block")); + DBUG_RETURN(error); +} + + +/* + Free block: remove reference to it from hash table, + remove it from the chain file of dirty/clean blocks + and add it to the free list. +*/ + +static my_bool free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, + my_bool abort_if_pinned) +{ + uint status= block->status; + KEYCACHE_THREAD_TRACE("free block"); + KEYCACHE_DBUG_PRINT("free_block", + ("block: %u hash_link %p", + PCBLOCK_NUMBER(pagecache, block), + block->hash_link)); + mysql_mutex_assert_owner(&pagecache->cache_lock); + if (block->hash_link) + { + /* + While waiting for readers to finish, new readers might request the + block. But since we set block->status|= PCBLOCK_REASSIGNED, they + will wait on block->wqueue[COND_FOR_SAVED]. They must be signaled + later. + */ + block->status|= PCBLOCK_REASSIGNED; + wait_for_readers(pagecache, block); + if (unlikely(abort_if_pinned) && unlikely(block->pins)) + { + /* + Block got pinned while waiting for readers. + This can only happens when called from flush_pagecache_blocks_int() + when flushing blocks as part of prepare for maria_close() or from + flush_cached_blocks() + */ + block->status&= ~PCBLOCK_REASSIGNED; + unreg_request(pagecache, block, 0); + + /* All pending requests for this page must be resubmitted. 
*/ + if (block->wqueue[COND_FOR_SAVED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); + return 1; + } + unlink_hash(pagecache, block->hash_link); + } + + unlink_changed(block); + DBUG_ASSERT(block->wlocks == 0); + DBUG_ASSERT(block->rlocks == 0); + DBUG_ASSERT(block->rlocks_queue == 0); + DBUG_ASSERT(block->pins == 0); + DBUG_ASSERT((block->status & ~(PCBLOCK_ERROR | PCBLOCK_READ | PCBLOCK_IN_FLUSH | PCBLOCK_CHANGED | PCBLOCK_REASSIGNED | PCBLOCK_DEL_WRITE)) == 0); + DBUG_ASSERT(block->requests >= 1); + DBUG_ASSERT(block->next_used == NULL); + block->status= 0; +#ifdef DBUG_ASSERT_EXISTS + block->type= PAGECACHE_EMPTY_PAGE; +#endif + block->rec_lsn= LSN_MAX; + DBUG_PRINT("hash", ("block (Free): %p, hash_link: %p -> NULL", + block, block->hash_link)); + block->hash_link= NULL; + if (block->temperature == PCBLOCK_WARM) + pagecache->warm_blocks--; + block->temperature= PCBLOCK_COLD; + KEYCACHE_THREAD_TRACE("free block"); + KEYCACHE_DBUG_PRINT("free_block", + ("block is freed")); + unreg_request(pagecache, block, 0); + + /* + Block->requests is != 0 if unreg_requests()/link_block() gave the block + to a waiting thread + */ + if (!block->requests) + { + DBUG_ASSERT(block->next_used != 0); + + /* Remove the free block from the LRU ring. */ + unlink_block(pagecache, block); + /* Insert the free block in the free list. */ + block->next_used= pagecache->free_block_list; + pagecache->free_block_list= block; + /* Keep track of the number of currently unused blocks. */ + pagecache->blocks_unused++; + } + else + { + /* keep flag set by link_block() */ + block->status= status & PCBLOCK_REASSIGNED; + } + + /* All pending requests for this page must be resubmitted. */ + if (block->wqueue[COND_FOR_SAVED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); + + return 0; +} + + +static int cmp_sec_link(PAGECACHE_BLOCK_LINK **a, PAGECACHE_BLOCK_LINK **b) +{ + return (((*a)->hash_link->pageno < (*b)->hash_link->pageno) ? 
-1 : + ((*a)->hash_link->pageno > (*b)->hash_link->pageno) ? 1 : 0); +} + + +/** + @brief Flush a portion of changed blocks to disk, free used blocks + if requested + + @param pagecache This page cache reference. + @param file File which should be flushed + @param cache Beginning of array of the block. + @param end Reference to the block after last in the array. + @param flush_type Type of the flush. + @param first_errno Where to store first errno of the flush. + + + @return Operation status + @retval PCFLUSH_OK OK + @retval PCFLUSH_ERROR There was errors during the flush process. + @retval PCFLUSH_PINNED Pinned blocks was met and skipped. + @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED. +*/ + +static int flush_cached_blocks(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + PAGECACHE_BLOCK_LINK **cache, + PAGECACHE_BLOCK_LINK **end, + enum flush_type type, + int *first_errno) +{ + int rc= PCFLUSH_OK; + my_bool error; + uint count= (uint) (end-cache); + DBUG_ENTER("flush_cached_blocks"); + *first_errno= 0; + + /* Don't lock the cache during the flush */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* + As all blocks referred in 'cache' are marked by PCBLOCK_IN_FLUSH + we are guaranteed that no thread will change them + */ + qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link); + + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + for (; cache != end; cache++) + { + PAGECACHE_BLOCK_LINK *block= *cache; + + /* + In the case of non_transactional tables we want to flush also + block pinned with reads. This is becasue we may have other + threads reading the block during flush, as non transactional + tables can have many readers while the one writer is doing the + flush. + We don't want to do flush pinned blocks during checkpoint. + We detect the checkpoint case by checking if type is LAZY. 
+ */ + if ((type == FLUSH_KEEP_LAZY && block->pins) || block->wlocks) + { + KEYCACHE_DBUG_PRINT("flush_cached_blocks", + ("block: %u (%p) pinned", + PCBLOCK_NUMBER(pagecache, block), block)); + DBUG_PRINT("info", ("block: %u (%p) pinned", + PCBLOCK_NUMBER(pagecache, block), block)); + PCBLOCK_INFO(block); + /* undo the mark put by flush_pagecache_blocks_int(): */ + block->status&= ~PCBLOCK_IN_FLUSH; + rc|= PCFLUSH_PINNED; + DBUG_PRINT("warning", ("Page pinned")); + unreg_request(pagecache, block, 1); + if (!*first_errno) + *first_errno= HA_ERR_INTERNAL_ERROR; + continue; + } + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_READ, PAGECACHE_PIN, FALSE)) + DBUG_ASSERT(0); + + KEYCACHE_PRINT("flush_cached_blocks", + ("block: %u (%p) to be flushed", + PCBLOCK_NUMBER(pagecache, block), block)); + DBUG_PRINT("info", ("block: %u (%p) to be flushed", + PCBLOCK_NUMBER(pagecache, block), block)); + PCBLOCK_INFO(block); + + /** + @todo IO If page is contiguous with next page to flush, group flushes + in one single my_pwrite(). + */ + /** + It is important to use block->hash_link->file below and not 'file', as + the first one is right and the second may have different out-of-date + content (see StaleFilePointersInFlush in ma_checkpoint.c). + @todo change argument of functions to be File. + */ + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + error= pagecache_fwrite(pagecache, &block->hash_link->file, + block->buffer, + block->hash_link->pageno, + block->type, + pagecache->readwrite_flags); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + + if (make_lock_and_pin(pagecache, block, + PAGECACHE_LOCK_READ_UNLOCK, + PAGECACHE_UNPIN, FALSE)) + DBUG_ASSERT(0); + + pagecache->global_cache_write++; + if (error) + { + block->status|= PCBLOCK_ERROR; + block->error= (int16) my_errno; + my_debug_put_break_here(); + if (!*first_errno) + *first_errno= my_errno ? 
my_errno : -1; + rc|= PCFLUSH_ERROR; + } + /* + Let to proceed for possible waiting requests to write to the block page. + It might happen only during an operation to resize the key cache. + */ + if (block->wqueue[COND_FOR_SAVED].last_thread) + wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]); + /* type will never be FLUSH_IGNORE_CHANGED here */ + if (! (type == FLUSH_KEEP || type == FLUSH_KEEP_LAZY || + type == FLUSH_FORCE_WRITE)) + { + if (!free_block(pagecache, block, 1)) + { + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + } + else + { + block->status&= ~PCBLOCK_IN_FLUSH; + link_to_file_list(pagecache, block, file, 1); + } + } + else + { + block->status&= ~PCBLOCK_IN_FLUSH; + link_to_file_list(pagecache, block, file, 1); + unreg_request(pagecache, block, 1); + } + } + DBUG_RETURN(rc); +} + + +/** + @brief flush all blocks for a file to disk but don't do any mutex locks + + @param pagecache pointer to a pagecache data structure + @param file handler for the file to flush to + @param flush_type type of the flush + @param filter optional function which tells what blocks to flush; + can be non-NULL only if FLUSH_KEEP, FLUSH_KEEP_LAZY + or FLUSH_FORCE_WRITE. + @param filter_arg an argument to pass to 'filter'. Information about + the block will be passed too. + + @note + Flushes all blocks having the same OS file descriptor as 'file->file', so + can flush blocks having '*block->hash_link->file' != '*file'. + + @note + This function doesn't do any mutex locks because it needs to be called + both from flush_pagecache_blocks and flush_all_key_blocks (the later one + does the mutex lock in the resize_pagecache() function). + + @note + This function can cause problems if two threads call it + concurrently on the same file (look for "PageCacheFlushConcurrencyBugs" + in ma_checkpoint.c); to avoid them, it has internal logic to serialize in + this situation. 
+ + @return Operation status + @retval PCFLUSH_OK OK + @retval PCFLUSH_ERROR There was errors during the flush process. + @retval PCFLUSH_PINNED Pinned blocks was met and skipped. + @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED. +*/ + +static int flush_pagecache_blocks_int(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + enum flush_type type, + PAGECACHE_FLUSH_FILTER filter, + void *filter_arg) +{ + PAGECACHE_BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache; + int last_errno= 0; + int rc= PCFLUSH_OK; + DBUG_ENTER("flush_pagecache_blocks_int"); + DBUG_PRINT("enter", + ("fd: %d blocks_used: %zu blocks_changed: %zu type: %d", + file->file, pagecache->blocks_used, pagecache->blocks_changed, + type)); + +#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, + "start of flush_pagecache_blocks", 0);); +#endif + + cache= cache_buff; + if (pagecache->disk_blocks > 0 && + (!my_disable_flush_pagecache_blocks || + (type != FLUSH_KEEP && type != FLUSH_KEEP_LAZY))) + { + /* + Key cache exists. If my_disable_flush_pagecache_blocks is true it + disables the operation but only FLUSH_KEEP[_LAZY]: other flushes still + need to be allowed: FLUSH_RELEASE has to free blocks, and + FLUSH_FORCE_WRITE is to overrule my_disable_flush_pagecache_blocks. + */ + int error= 0; + uint count= 0; + PAGECACHE_BLOCK_LINK **pos, **end; + PAGECACHE_BLOCK_LINK *first_in_switch= NULL; + PAGECACHE_BLOCK_LINK *block, *next; +#if defined(PAGECACHE_DEBUG) + uint cnt= 0; +#endif + + struct st_file_in_flush us_flusher, *other_flusher; + us_flusher.file= file->file; + us_flusher.flush_queue.last_thread= NULL; + us_flusher.first_in_switch= FALSE; + while ((other_flusher= (struct st_file_in_flush *) + my_hash_search(&pagecache->files_in_flush, (uchar *)&file->file, + sizeof(file->file)))) + { + /* + File is in flush already: wait, unless FLUSH_KEEP_LAZY. "Flusher" + means "who can mark PCBLOCK_IN_FLUSH", i.e. 
caller of + flush_pagecache_blocks_int(). + */ + struct st_my_thread_var *thread; + if (type == FLUSH_KEEP_LAZY) + { + DBUG_PRINT("info",("FLUSH_KEEP_LAZY skips")); + DBUG_RETURN(0); + } + thread= my_thread_var; + wqueue_add_to_queue(&other_flusher->flush_queue, thread); + do + { + DBUG_PRINT("wait", + ("(1) suspend thread %s %ld", + thread->name, (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); + } + /* we are the only flusher of this file now */ + while (my_hash_insert(&pagecache->files_in_flush, (uchar *)&us_flusher)) + { + /* + Out of memory, wait for flushers to empty the hash and retry; should + rarely happen. Other threads are flushing the file; when done, they + are going to remove themselves from the hash, and thus memory will + appear again. However, this memory may be stolen by yet another thread + (for a purpose unrelated to page cache), before we retry + my_hash_insert(). So the loop may run for long. Only if the thread was + killed do we abort the loop, returning 1 (error) which can cause the + table to be marked as corrupted (cf maria_chk_size(), maria_close()) + and thus require a table check. + */ + DBUG_ASSERT(0); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + if (my_thread_var->abort) + DBUG_RETURN(1); /* End if aborted by user */ + sleep(10); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + } + + if (type != FLUSH_IGNORE_CHANGED) + { + /* + Count how many key blocks we have to cache to be able + to flush all dirty pages with minimum seek moves. 
+ */ + for (block= pagecache->changed_blocks[FILE_HASH(*file, pagecache)] ; + block; + block= block->next_changed) + { + if (block->hash_link->file.file == file->file) + { + count++; + KEYCACHE_DBUG_ASSERT(count<= pagecache->blocks_used); + } + } + count++; /* Allocate one extra for easy end-of-buffer test */ + /* Allocate a new buffer only if its bigger than the one we have */ + if (count > FLUSH_CACHE && + !(cache= + (PAGECACHE_BLOCK_LINK**) + my_malloc(PSI_INSTRUMENT_ME, sizeof(PAGECACHE_BLOCK_LINK*)*count, MYF(0)))) + { + cache= cache_buff; + count= FLUSH_CACHE; + } + } + + /* Retrieve the blocks and write them to a buffer to be flushed */ +restart: + end= (pos= cache)+count; + for (block= pagecache->changed_blocks[FILE_HASH(*file, pagecache)] ; + block; + block= next) + { +#if defined(PAGECACHE_DEBUG) + cnt++; + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used); +#endif + next= block->next_changed; + if (block->hash_link->file.file != file->file) + continue; + if (filter != NULL) + { + int filter_res= (*filter)(block->type, block->hash_link->pageno, + block->rec_lsn, filter_arg); + DBUG_PRINT("info",("filter returned %d", filter_res)); + if (filter_res == FLUSH_FILTER_SKIP_TRY_NEXT) + continue; + if (filter_res == FLUSH_FILTER_SKIP_ALL) + break; + DBUG_ASSERT(filter_res == FLUSH_FILTER_OK); + } + { + DBUG_ASSERT(!(block->status & PCBLOCK_IN_FLUSH)); + /* + We care only for the blocks for which flushing was not + initiated by other threads as a result of page swapping + */ + if (! 
(block->status & PCBLOCK_IN_SWITCH)) + { + /* + Mark the block with BLOCK_IN_FLUSH in order not to let + other threads to use it for new pages and interfere with + our sequence of flushing dirty file pages + */ + block->status|= PCBLOCK_IN_FLUSH; + + reg_requests(pagecache, block, 1); + if (type != FLUSH_IGNORE_CHANGED) + { + *pos++= block; + /* It's not a temporary file */ + if (pos == end) + { + /* + This happens only if there is not enough + memory for the big block + */ + if ((rc|= flush_cached_blocks(pagecache, file, cache, + end, type, &error)) & + (PCFLUSH_ERROR | PCFLUSH_PINNED)) + last_errno=error; + DBUG_PRINT("info", ("restarting...")); + /* + Restart the scan as some other thread might have changed + the changed blocks chain: the blocks that were in switch + state before the flush started have to be excluded + */ + goto restart; + } + } + else + { + /* It's a temporary file */ + pagecache->blocks_changed--; + pagecache->global_blocks_changed--; + free_block(pagecache, block, 0); + } + } + else if (type != FLUSH_KEEP_LAZY) + { + /* + Link the block into a list of blocks 'in switch', and then we will + wait for this list to be empty, which means they have been flushed + */ + unlink_changed(block); + link_changed(block, &first_in_switch); + us_flusher.first_in_switch= TRUE; + } + } + } + if (pos != cache) + { + if ((rc|= flush_cached_blocks(pagecache, file, cache, pos, type, + &error)) & + (PCFLUSH_ERROR | PCFLUSH_PINNED)) + last_errno= error; + } + /* Wait until list of blocks in switch is empty */ + while (first_in_switch) + { +#if defined(PAGECACHE_DEBUG) + cnt= 0; +#endif + block= first_in_switch; + { + struct st_my_thread_var *thread= my_thread_var; + wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread); + do + { + DBUG_PRINT("wait", + ("(2) suspend thread %s %ld", + thread->name, (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); + } +#if defined(PAGECACHE_DEBUG) + 
cnt++; + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used); +#endif + } + us_flusher.first_in_switch= FALSE; + /* The following happens very seldom */ + if (! (type == FLUSH_KEEP || type == FLUSH_KEEP_LAZY || + type == FLUSH_FORCE_WRITE)) + { + /* + this code would free all blocks while filter maybe handled only a + few, that is not possible. + */ + DBUG_ASSERT(filter == NULL); +#if defined(PAGECACHE_DEBUG) + cnt=0; +#endif + for (block= pagecache->file_blocks[FILE_HASH(*file, pagecache)] ; + block; + block= next) + { +#if defined(PAGECACHE_DEBUG) + cnt++; + KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used); +#endif + next= block->next_changed; + if (block->hash_link->file.file == file->file && + !block->pins && + (! (block->status & PCBLOCK_CHANGED) + || type == FLUSH_IGNORE_CHANGED)) + { + reg_requests(pagecache, block, 1); + free_block(pagecache, block, 1); + } + } + } + /* wake up others waiting to flush this file */ + my_hash_delete(&pagecache->files_in_flush, (uchar *)&us_flusher); + if (us_flusher.flush_queue.last_thread) + wqueue_release_queue(&us_flusher.flush_queue); + } + + DBUG_EXECUTE("check_pagecache", + test_key_cache(pagecache, "end of flush_pagecache_blocks", 0);); + if (cache != cache_buff) + my_free(cache); + if (rc != 0) + { + if (last_errno) + my_errno= last_errno; /* Return first error */ + DBUG_PRINT("error", ("Got error: %d", my_errno)); + } + DBUG_RETURN(rc); +} + + +/** + @brief flush all blocks for a file to disk + + @param pagecache pointer to a pagecache data structure + @param file handler for the file to flush to + @param flush_type type of the flush + @param filter optional function which tells what blocks to flush; + can be non-NULL only if FLUSH_KEEP, FLUSH_KEEP_LAZY + or FLUSH_FORCE_WRITE. + @param filter_arg an argument to pass to 'filter'. Information about + the block will be passed too. + + @return Operation status + @retval PCFLUSH_OK OK + @retval PCFLUSH_ERROR There was errors during the flush process. 
+ @retval PCFLUSH_PINNED Pinned blocks was met and skipped. + @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED. +*/ + +int flush_pagecache_blocks_with_filter(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + enum flush_type type, + PAGECACHE_FLUSH_FILTER filter, + void *filter_arg) +{ + int res; + DBUG_ENTER("flush_pagecache_blocks_with_filter"); + DBUG_PRINT("enter", ("pagecache: %p", pagecache)); + + if (pagecache->disk_blocks <= 0) + DBUG_RETURN(0); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + inc_counter_for_resize_op(pagecache); + res= flush_pagecache_blocks_int(pagecache, file, type, filter, filter_arg); + dec_counter_for_resize_op(pagecache); + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(res); +} + + +/* + Reset the counters of a key cache. + + SYNOPSIS + reset_pagecache_counters() + name the name of a key cache + pagecache pointer to the pagecache to be reset + + DESCRIPTION + This procedure is used to reset the counters of all currently used key + caches, both the default one and the named ones. + + RETURN + 0 on success (always because it can't fail) +*/ + +int reset_pagecache_counters(const char *name __attribute__((unused)), + PAGECACHE *pagecache) +{ + DBUG_ENTER("reset_pagecache_counters"); + if (!pagecache->inited) + { + DBUG_PRINT("info", ("Key cache %s not initialized.", name)); + DBUG_RETURN(0); + } + DBUG_PRINT("info", ("Resetting counters for key cache %s.", name)); + + pagecache->global_blocks_changed= 0; /* Key_blocks_not_flushed */ + pagecache->global_cache_r_requests= 0; /* Key_read_requests */ + pagecache->global_cache_read= 0; /* Key_reads */ + pagecache->global_cache_w_requests= 0; /* Key_write_requests */ + pagecache->global_cache_write= 0; /* Key_writes */ + DBUG_RETURN(0); +} + + +/** + @brief Allocates a buffer and stores in it some info about all dirty pages + + Does the allocation because the caller cannot know the size itself. 
+ Memory freeing is to be done by the caller (if the "str" member of the + LEX_STRING is not NULL). + Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they + are not interesting for a checkpoint record. + The caller has the intention of doing checkpoints. + + @param pagecache pointer to the page cache + @param[out] str pointer to where the allocated buffer, and + its size, will be put + @param[out] min_rec_lsn pointer to where the minimum rec_lsn of all + relevant dirty pages will be put + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache, + LEX_STRING *str, + LSN *min_rec_lsn) +{ + my_bool error= 0; + size_t stored_list_size= 0; + uint file_hash; + char *ptr; + LSN minimum_rec_lsn= LSN_MAX; + DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN"); + + DBUG_ASSERT(NULL == str->str); + /* + We lock the entire cache but will be quick, just reading/writing a few MBs + of memory at most. + */ + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + for (;;) + { + struct st_file_in_flush *other_flusher; + for (file_hash= 0; + (other_flusher= (struct st_file_in_flush *) + my_hash_element(&pagecache->files_in_flush, file_hash)) != NULL && + !other_flusher->first_in_switch; + file_hash++) + {} + if (other_flusher == NULL) + break; + /* + other_flusher.first_in_switch is true: some thread is flushing a file + and has removed dirty blocks from changed_blocks[] while they were still + dirty (they were being evicted (=>flushed) by yet another thread, which + may not have flushed the block yet so it may still be dirty). + If Checkpoint proceeds now, it will not see the page. If there is a + crash right after writing the checkpoint record, before the page is + flushed, at recovery the page will be wrongly ignored because it won't + be in the dirty pages list in the checkpoint record. So wait. 
+ */ + { + struct st_my_thread_var *thread= my_thread_var; + wqueue_add_to_queue(&other_flusher->flush_queue, thread); + do + { + DBUG_PRINT("wait", + ("suspend thread %s %ld", thread->name, + (ulong) thread->id)); + pagecache_pthread_cond_wait(&thread->suspend, + &pagecache->cache_lock); + } + while (thread->next); + } + } + + /* Count how many dirty pages are interesting */ + for (file_hash= 0; file_hash < pagecache->changed_blocks_hash_size; file_hash++) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) + { + /* + Q: is there something subtle with block->hash_link: can it be NULL? + does it have to be == hash_link->block... ? + */ + DBUG_ASSERT(block->hash_link != NULL); + DBUG_ASSERT(block->status & PCBLOCK_CHANGED); + /* + Note that we don't store bitmap pages, or pages from non-transactional + (like temporary) tables. Don't checkpoint during Recovery which uses + PAGECACHE_PLAIN_PAGE. + */ + if (block->type != PAGECACHE_LSN_PAGE) + continue; /* no need to store it */ + stored_list_size++; + } + } + + compile_time_assert(sizeof(pagecache->blocks) <= 8); + str->length= 8 + /* number of dirty pages */ + (2 + /* table id */ + 1 + /* data or index file */ + 5 + /* pageno */ + LSN_STORE_SIZE /* rec_lsn */ + ) * stored_list_size; + if (NULL == (str->str= my_malloc(PSI_INSTRUMENT_ME, str->length, MYF(MY_WME)))) + goto err; + ptr= str->str; + int8store(ptr, (ulonglong)stored_list_size); + ptr+= 8; + DBUG_PRINT("info", ("found %zu dirty pages", stored_list_size)); + if (stored_list_size == 0) + goto end; + for (file_hash= 0; file_hash < pagecache->changed_blocks_hash_size; file_hash++) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) + { + uint16 table_id; + MARIA_SHARE *share; + if (block->type != PAGECACHE_LSN_PAGE) + continue; /* no need to store it in the checkpoint record */ + share= (MARIA_SHARE 
*)(block->hash_link->file.callback_data); + table_id= share->id; + int2store(ptr, table_id); + ptr+= 2; + ptr[0]= (share->kfile.file == block->hash_link->file.file); + ptr++; + DBUG_ASSERT(block->hash_link->pageno < ((1ULL) << 40)); + page_store(ptr, block->hash_link->pageno); + ptr+= PAGE_STORE_SIZE; + lsn_store(ptr, block->rec_lsn); + ptr+= LSN_STORE_SIZE; + if (block->rec_lsn != LSN_MAX) + { + DBUG_ASSERT(LSN_VALID(block->rec_lsn)); + if (cmp_translog_addr(block->rec_lsn, minimum_rec_lsn) < 0) + minimum_rec_lsn= block->rec_lsn; + } /* otherwise, some trn->rec_lsn should hold the correct info */ + } + } +end: + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + *min_rec_lsn= minimum_rec_lsn; + DBUG_RETURN(error); + +err: + error= 1; + goto end; +} + + +#ifndef DBUG_OFF + +/** + Verifies that a file has no dirty pages. +*/ + +void pagecache_file_no_dirty_page(PAGECACHE *pagecache, PAGECACHE_FILE *file) +{ + File fd= file->file; + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[FILE_HASH(*file, pagecache)]; + block != NULL; + block= block->next_changed) + if (block->hash_link->file.file == fd) + { + DBUG_PRINT("info", ("pagecache_file_not_in error")); + PCBLOCK_INFO(block); + DBUG_ASSERT(0); + } +} + + +/* + Test if disk-cache is ok +*/ +static void test_key_cache(PAGECACHE *pagecache __attribute__((unused)), + const char *where __attribute__((unused)), + my_bool lock __attribute__((unused))) +{ + /* TODO */ +} +#endif + +uchar *pagecache_block_link_to_buffer(PAGECACHE_BLOCK_LINK *block) +{ + return block->buffer; +} + +#if defined(PAGECACHE_TIMEOUT) + +#define KEYCACHE_DUMP_FILE "pagecache_dump.txt" +#define MAX_QUEUE_LEN 100 + + +static void pagecache_dump(PAGECACHE *pagecache) +{ + FILE *pagecache_dump_file=fopen(KEYCACHE_DUMP_FILE, "w"); + struct st_my_thread_var *last; + struct st_my_thread_var *thread; + PAGECACHE_BLOCK_LINK *block; + PAGECACHE_HASH_LINK *hash_link; + PAGECACHE_PAGE *page; + uint i; + + fprintf(pagecache_dump_file, 
"thread: %s %ld\n", thread->name, + (ulong) thread->id); + + i=0; + thread=last=waiting_for_hash_link.last_thread; + fprintf(pagecache_dump_file, "queue of threads waiting for hash link\n"); + if (thread) + do + { + thread= thread->next; + page= (PAGECACHE_PAGE *) thread->keycache_link; + fprintf(pagecache_dump_file, + "thread: %s %ld, (file,pageno)=(%u,%lu)\n", + thread->name, (ulong) thread->id, + (uint) page->file.file,(ulong) page->pageno); + if (++i == MAX_QUEUE_LEN) + break; + } + while (thread != last); + + i=0; + thread=last=waiting_for_block.last_thread; + fprintf(pagecache_dump_file, "queue of threads waiting for block\n"); + if (thread) + do + { + thread=thread->next; + hash_link= (PAGECACHE_HASH_LINK *) thread->keycache_link; + fprintf(pagecache_dump_file, + "thread: %s %u hash_link:%u (file,pageno)=(%u,%lu)\n", + thread->name, (ulong) thread->id, + (uint) PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link), + (uint) hash_link->file.file,(ulong) hash_link->pageno); + if (++i == MAX_QUEUE_LEN) + break; + } + while (thread != last); + + for (i=0 ; i < pagecache->blocks_used ; i++) + { + int j; + block= &pagecache->block_root[i]; + hash_link= block->hash_link; + fprintf(pagecache_dump_file, + "block:%u hash_link:%d status:%x #requests=%u waiting_for_readers:%d\n", + i, (int) (hash_link ? + PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link) : + -1), + block->status, block->requests, block->condvar ? 
1 : 0); + for (j=0 ; j < COND_SIZE; j++) + { + PAGECACHE_WQUEUE *wqueue=&block->wqueue[j]; + thread= last= wqueue->last_thread; + fprintf(pagecache_dump_file, "queue #%d\n", j); + if (thread) + { + do + { + thread=thread->next; + fprintf(pagecache_dump_file, + "thread: %s %ld\n", thread->name, (ulong) thread->id); + if (++i == MAX_QUEUE_LEN) + break; + } + while (thread != last); + } + } + } + fprintf(pagecache_dump_file, "LRU chain:"); + block= pagecache= used_last; + if (block) + { + do + { + block= block->next_used; + fprintf(pagecache_dump_file, + "block:%u, ", PCBLOCK_NUMBER(pagecache, block)); + } + while (block != pagecache->used_last); + } + fprintf(pagecache_dump_file, "\n"); + + fclose(pagecache_dump_file); +} + +#endif /* defined(PAGECACHE_TIMEOUT) */ + +#if defined(PAGECACHE_TIMEOUT) && !defined(_WIN32) + + +static int pagecache_pthread_cond_wait(mysql_cond_t *cond, + mysql_mutex_t *mutex) +{ + int rc; + struct timeval now; /* time when we started waiting */ + struct timespec timeout; /* timeout value for the wait function */ + struct timezone tz; +#if defined(PAGECACHE_DEBUG) + int cnt=0; +#endif + + /* Get current time */ + gettimeofday(&now, &tz); + /* Prepare timeout value */ + timeout.tv_sec= now.tv_sec + PAGECACHE_TIMEOUT; + /* + timeval uses microseconds. + timespec uses nanoseconds. 
+ 1 nanosecond = 1000 micro seconds + */ + timeout.tv_nsec= now.tv_usec * 1000; + KEYCACHE_THREAD_TRACE_END("started waiting"); +#if defined(PAGECACHE_DEBUG) + cnt++; + if (cnt % 100 == 0) + fprintf(pagecache_debug_log, "waiting...\n"); + fflush(pagecache_debug_log); +#endif + rc= mysql_cond_timedwait(cond, mutex, &timeout); + KEYCACHE_THREAD_TRACE_BEGIN("finished waiting"); + if (rc == ETIMEDOUT || rc == ETIME) + { +#if defined(PAGECACHE_DEBUG) + fprintf(pagecache_debug_log,"aborted by pagecache timeout\n"); + fclose(pagecache_debug_log); + abort(); +#endif + pagecache_dump(); + } + +#if defined(PAGECACHE_DEBUG) + KEYCACHE_DBUG_ASSERT(rc != ETIMEDOUT); +#else + assert(rc != ETIMEDOUT); +#endif + return rc; +} +#else +#if defined(PAGECACHE_DEBUG) +static int pagecache_pthread_cond_wait(mysql_cond_t *cond, + mysql_mutex_t *mutex) +{ + int rc; + KEYCACHE_THREAD_TRACE_END("started waiting"); + rc= mysql_cond_wait(cond, mutex); + KEYCACHE_THREAD_TRACE_BEGIN("finished waiting"); + return rc; +} +#endif +#endif /* defined(PAGECACHE_TIMEOUT) && !defined(_WIN32) */ + + +#if defined(PAGECACHE_DEBUG) +static int ___pagecache_pthread_mutex_lock(mysql_mutex_t *mutex) +{ + int rc; + rc= mysql_mutex_lock(mutex); + KEYCACHE_THREAD_TRACE_BEGIN(""); + return rc; +} + + +static void ___pagecache_pthread_mutex_unlock(mysql_mutex_t *mutex) +{ + KEYCACHE_THREAD_TRACE_END(""); + mysql_mutex_unlock(mutex); +} + + +static int ___pagecache_pthread_cond_signal(mysql_cond_t *cond) +{ + int rc; + KEYCACHE_THREAD_TRACE("signal"); + rc= mysql_cond_signal(cond); + return rc; +} + + +static void pagecache_debug_print(const char * fmt, ...) 
+{ + va_list args; + va_start(args,fmt); + if (pagecache_debug_log) + { + vfprintf(pagecache_debug_log, fmt, args); + fputc('\n',pagecache_debug_log); +#ifdef PAGECACHE_DEBUG_DLOG + _db_doprnt_(fmt, args); +#endif + } + va_end(args); +} + +void pagecache_debug_log_close(void) +{ + if (pagecache_debug_log) + fclose(pagecache_debug_log); +} +#endif /* defined(PAGECACHE_DEBUG) */ + +/** + @brief null hooks +*/ + +static my_bool null_pre_hook(PAGECACHE_IO_HOOK_ARGS *args + __attribute__((unused))) +{ + return 0; +} + +static my_bool null_post_read_hook(int res, PAGECACHE_IO_HOOK_ARGS *args + __attribute__((unused))) +{ + return res != 0; +} + +static void null_post_write_hook(int res __attribute__((unused)), + PAGECACHE_IO_HOOK_ARGS *args + __attribute__((unused))) +{ + return; +} + +void pagecache_file_set_null_hooks(PAGECACHE_FILE *file) +{ + file->pre_read_hook= null_pre_hook; + file->post_read_hook= null_post_read_hook; + file->pre_write_hook= null_pre_hook; + file->post_write_hook= null_post_write_hook; + file->flush_log_callback= null_pre_hook; + file->callback_data= NULL; + file->head_blocks= file->big_block_size= 0; +} diff --git a/storage/maria/ma_pagecache.h b/storage/maria/ma_pagecache.h new file mode 100644 index 00000000..dbd86fc0 --- /dev/null +++ b/storage/maria/ma_pagecache.h @@ -0,0 +1,365 @@ +/* Copyright (C) 2006 MySQL AB + Copyright (c) 2011, 2020, MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Page cache variable structures */ + +#ifndef _ma_pagecache_h +#define _ma_pagecache_h +C_MODE_START + +#include "ma_loghandler_lsn.h" +#include <m_string.h> +#include <hash.h> + +/* Type of the page */ +enum pagecache_page_type +{ + /* + Used only for control page type changing during debugging. This define + should only be using when using DBUG. + */ + PAGECACHE_EMPTY_PAGE, + /* the page does not contain LSN */ + PAGECACHE_PLAIN_PAGE, + /* the page contain LSN (maria tablespace page) */ + PAGECACHE_LSN_PAGE, + /* Page type used when scanning file and we don't care about the type */ + PAGECACHE_READ_UNKNOWN_PAGE +}; + +/* + This enum describe lock status changing. every type of page cache will + interpret WRITE/READ lock as it need. +*/ +enum pagecache_page_lock +{ + PAGECACHE_LOCK_LEFT_UNLOCKED, /* free -> free */ + PAGECACHE_LOCK_LEFT_READLOCKED, /* read -> read */ + PAGECACHE_LOCK_LEFT_WRITELOCKED, /* write -> write */ + PAGECACHE_LOCK_READ, /* free -> read */ + PAGECACHE_LOCK_WRITE, /* free -> write */ + PAGECACHE_LOCK_READ_UNLOCK, /* read -> free */ + PAGECACHE_LOCK_WRITE_UNLOCK, /* write -> free */ + PAGECACHE_LOCK_WRITE_TO_READ /* write -> read */ +}; +/* + This enum describe pin status changing +*/ +enum pagecache_page_pin +{ + PAGECACHE_PIN_LEFT_PINNED, /* pinned -> pinned */ + PAGECACHE_PIN_LEFT_UNPINNED, /* unpinned -> unpinned */ + PAGECACHE_PIN, /* unpinned -> pinned */ + PAGECACHE_UNPIN /* pinned -> unpinned */ +}; +/* How to write the page */ +enum pagecache_write_mode +{ + /* do not write immediately, i.e. it will be dirty page */ + PAGECACHE_WRITE_DELAY, + /* page already is in the file. 
(key cache insert analogue) */ + PAGECACHE_WRITE_DONE +}; + +/* page number for maria */ +typedef ulonglong pgcache_page_no_t; + +/* args for read/write hooks */ +typedef struct st_pagecache_io_hook_args +{ + uchar * page; + pgcache_page_no_t pageno; + uchar * data; + + uchar *crypt_buf; /* when using encryption */ +} PAGECACHE_IO_HOOK_ARGS; + +struct st_pagecache; + +/* Structure to store things from get_object */ + +typedef struct st_S3_BLOCK +{ + uchar *str, *alloc_ptr; + size_t length; +} S3_BLOCK; + + +/* file descriptor for Maria */ +typedef struct st_pagecache_file +{ + /* Number of pages in the header which are not read with big blocks */ + size_t head_blocks; + /* size of a big block for S3 or 0 */ + size_t big_block_size; + /* File number */ + File file; + + /** Cannot be NULL */ + my_bool (*pre_read_hook)(PAGECACHE_IO_HOOK_ARGS *args); + my_bool (*post_read_hook)(int error, PAGECACHE_IO_HOOK_ARGS *args); + + /** Cannot be NULL */ + my_bool (*pre_write_hook)(PAGECACHE_IO_HOOK_ARGS *args); + void (*post_write_hook)(int error, PAGECACHE_IO_HOOK_ARGS *args); + + my_bool (*flush_log_callback)(PAGECACHE_IO_HOOK_ARGS *args); + + /** Cannot be NULL */ + uchar *callback_data; +} PAGECACHE_FILE; + +/* declare structures that is used by st_pagecache */ + +struct st_pagecache_block_link; +typedef struct st_pagecache_block_link PAGECACHE_BLOCK_LINK; +struct st_pagecache_page; +typedef struct st_pagecache_page PAGECACHE_PAGE; +struct st_pagecache_hash_link; +typedef struct st_pagecache_hash_link PAGECACHE_HASH_LINK; + +#include <wqueue.h> + +/* Default size of hash for changed files */ +#define MIN_PAGECACHE_CHANGED_BLOCKS_HASH_SIZE 512 + +#define PAGECACHE_PRIORITY_LOW 0 +#define PAGECACHE_PRIORITY_DEFAULT 3 +#define PAGECACHE_PRIORITY_HIGH 6 + +/* + The page cache structure + It also contains read-only statistics parameters. 
+*/ + +typedef struct st_pagecache +{ + size_t mem_size; /* specified size of the cache memory */ + size_t min_warm_blocks; /* min number of warm blocks; */ + size_t age_threshold; /* age threshold for hot blocks */ + ulonglong time; /* total number of block link operations */ + size_t hash_entries; /* max number of entries in the hash table */ + size_t changed_blocks_hash_size;/* Number of hash buckets for file blocks */ + ssize_t hash_links; /* max number of hash links */ + ssize_t hash_links_used; /* number of hash links taken from free links pool */ + ssize_t disk_blocks; /* max number of blocks in the cache */ + size_t blocks_used; /* maximum number of concurrently used blocks */ + size_t blocks_unused; /* number of currently unused blocks */ + size_t blocks_changed; /* number of currently dirty blocks */ + size_t warm_blocks; /* number of blocks in warm sub-chain */ + size_t cnt_for_resize_op; /* counter to block resize operation */ + size_t blocks_available; /* number of blocks available in the LRU chain */ + ssize_t blocks; /* max number of blocks in the cache */ + uint32 block_size; /* size of the page buffer of a cache block */ + PAGECACHE_HASH_LINK **hash_root;/* arr. 
of entries into hash table buckets */ + PAGECACHE_HASH_LINK *hash_link_root;/* memory for hash table links */ + PAGECACHE_HASH_LINK *free_hash_list;/* list of free hash links */ + PAGECACHE_BLOCK_LINK *free_block_list;/* list of free blocks */ + PAGECACHE_BLOCK_LINK *block_root;/* memory for block links */ + uchar *block_mem; /* memory for block buffers */ + PAGECACHE_BLOCK_LINK *used_last;/* ptr to the last block of the LRU chain */ + PAGECACHE_BLOCK_LINK *used_ins;/* ptr to the insertion block in LRU chain */ + mysql_mutex_t cache_lock; /* to lock access to the cache structure */ + WQUEUE resize_queue; /* threads waiting during resize operation */ + WQUEUE waiting_for_hash_link;/* waiting for a free hash link */ + WQUEUE waiting_for_block; /* requests waiting for a free block */ + /* hash for dirty file bl.*/ + PAGECACHE_BLOCK_LINK **changed_blocks; + /* hash for other file bl.*/ + PAGECACHE_BLOCK_LINK **file_blocks; + + /** + Function for reading file in big hunks from S3 + Data will be filled with pointer and length to data read + start_page will be contain first page read. + */ + my_bool (*big_block_read)(struct st_pagecache *pagecache, + PAGECACHE_IO_HOOK_ARGS *args, + struct st_pagecache_file *file, S3_BLOCK *data); + void (*big_block_free)(S3_BLOCK *data); + + + /* + The following variables are and variables used to hold parameters for + initializing the key cache. + */ + + ulonglong param_buff_size; /* size the memory allocated for the cache */ + size_t param_block_size; /* size of the blocks in the key cache */ + size_t param_division_limit; /* min. percentage of warm blocks */ + size_t param_age_threshold; /* determines when hot block is downgraded */ + + /* Statistics variables. These are reset in reset_pagecache_counters(). 
*/ + size_t global_blocks_changed; /* number of currently dirty blocks */ + ulonglong global_cache_w_requests;/* number of write requests (write hits) */ + ulonglong global_cache_write; /* number of writes from cache to files */ + ulonglong global_cache_r_requests;/* number of read requests (read hits) */ + ulonglong global_cache_read; /* number of reads from files to cache */ + + uint shift; /* block size = 2 ^ shift */ + myf readwrite_flags; /* Flags to pread/pwrite() */ + myf org_readwrite_flags; /* Flags to pread/pwrite() at init */ + my_bool inited; + my_bool resize_in_flush; /* true during flush of resize operation */ + my_bool can_be_used; /* usage of cache for read/write is allowed */ + my_bool in_init; /* Set to 1 in MySQL during init/resize */ + my_bool extra_debug; /* set to 1 if one wants extra logging */ + HASH files_in_flush; /**< files in flush_pagecache_blocks_int() */ +} PAGECACHE; + +/** @brief Return values for PAGECACHE_FLUSH_FILTER */ +enum pagecache_flush_filter_result +{ + FLUSH_FILTER_SKIP_TRY_NEXT= 0,/**< skip page and move on to next one */ + FLUSH_FILTER_OK, /**< flush page and move on to next one */ + FLUSH_FILTER_SKIP_ALL /**< skip page and all next ones */ +}; +/** @brief a filter function type for flush_pagecache_blocks_with_filter() */ +typedef enum pagecache_flush_filter_result +(*PAGECACHE_FLUSH_FILTER)(enum pagecache_page_type type, + pgcache_page_no_t page, + LSN rec_lsn, void *arg); + +/* The default key cache */ +extern PAGECACHE dflt_pagecache_var, *dflt_pagecache; + +extern size_t init_pagecache(PAGECACHE *pagecache, size_t use_mem, + uint division_limit, uint age_threshold, + uint block_size, uint changed_blocks_hash_size, + myf my_read_flags)__attribute__((visibility("default"))) ; +extern size_t resize_pagecache(PAGECACHE *pagecache, + size_t use_mem, uint division_limit, + uint age_threshold, uint changed_blocks_hash_size); +extern void change_pagecache_param(PAGECACHE *pagecache, uint division_limit, + uint 
age_threshold); + +extern uchar *pagecache_read(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + PAGECACHE_BLOCK_LINK **link); + +#define pagecache_write(P,F,N,L,B,T,O,I,M,K,R) \ + pagecache_write_part(P,F,N,L,B,T,O,I,M,K,R,0,(P)->block_size) + +#define pagecache_inject(P,F,N,L,B,T,O,I,K,R) \ + pagecache_write_part(P,F,N,L,B,T,O,I,PAGECACHE_WRITE_DONE, \ + K,R,0,(P)->block_size) + +extern my_bool pagecache_write_part(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint level, + uchar *buff, + enum pagecache_page_type type, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + enum pagecache_write_mode write_mode, + PAGECACHE_BLOCK_LINK **link, + LSN first_REDO_LSN_for_page, + uint offset, + uint size); +extern void pagecache_unlock(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn, my_bool was_changed); +extern void pagecache_unlock_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *block, + enum pagecache_page_lock lock, + enum pagecache_page_pin pin, + LSN first_REDO_LSN_for_page, + LSN lsn, my_bool was_changed, + my_bool any); +extern void pagecache_unpin(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + LSN lsn); +extern void pagecache_unpin_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *link, + LSN lsn); +extern void pagecache_set_write_on_delete_by_link(PAGECACHE_BLOCK_LINK *block); + + +/* Results of flush operation (bit field in fact) */ + +/* The flush is done. */ +#define PCFLUSH_OK 0 +/* There was errors during the flush process. */ +#define PCFLUSH_ERROR 1 +/* Pinned blocks was met and skipped. */ +#define PCFLUSH_PINNED 2 +/* PCFLUSH_ERROR and PCFLUSH_PINNED. 
*/ +#define PCFLUSH_PINNED_AND_ERROR (PCFLUSH_ERROR|PCFLUSH_PINNED) + +// initialize file with empty hooks +void pagecache_file_set_null_hooks(PAGECACHE_FILE*); + +#define flush_pagecache_blocks(A,B,C) \ + flush_pagecache_blocks_with_filter(A,B,C,NULL,NULL) +extern int flush_pagecache_blocks_with_filter(PAGECACHE *keycache, + PAGECACHE_FILE *file, + enum flush_type type, + PAGECACHE_FLUSH_FILTER filter, + void *filter_arg)__attribute__((visibility("default"))) ; +extern my_bool pagecache_delete(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + enum pagecache_page_lock lock, + my_bool flush); +extern my_bool pagecache_delete_by_link(PAGECACHE *pagecache, + PAGECACHE_BLOCK_LINK *link, + enum pagecache_page_lock lock, + my_bool flush); +extern my_bool pagecache_delete_pages(PAGECACHE *pagecache, + PAGECACHE_FILE *file, + pgcache_page_no_t pageno, + uint page_count, + enum pagecache_page_lock lock, + my_bool flush); +extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup)__attribute__((visibility("default"))) ; +extern my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache, + LEX_STRING *str, + LSN *min_lsn); +extern int reset_pagecache_counters(const char *name, PAGECACHE *pagecache); +extern uchar *pagecache_block_link_to_buffer(PAGECACHE_BLOCK_LINK *block); + +extern uint pagecache_pagelevel(PAGECACHE_BLOCK_LINK *block); +extern void pagecache_add_level_by_link(PAGECACHE_BLOCK_LINK *block, + uint level); + +/* Functions to handle multiple key caches */ +extern my_bool multi_pagecache_init(void); +extern void multi_pagecache_free(void); +extern PAGECACHE *multi_pagecache_search(uchar *key, uint length, + PAGECACHE *def); +extern my_bool multi_pagecache_set(const uchar *key, uint length, + PAGECACHE *pagecache); +extern void multi_pagecache_change(PAGECACHE *old_data, + PAGECACHE *new_data); +#ifndef DBUG_OFF +void pagecache_file_no_dirty_page(PAGECACHE *pagecache, PAGECACHE_FILE *file); +#else +#define 
pagecache_file_no_dirty_page(A,B) {} +#endif + +C_MODE_END +#endif /* _ma_pagecache_h */ diff --git a/storage/maria/ma_pagecaches.c b/storage/maria/ma_pagecaches.c new file mode 100644 index 00000000..fd5cd209 --- /dev/null +++ b/storage/maria/ma_pagecaches.c @@ -0,0 +1,104 @@ +/* Copyright (C) 2003-2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Handling of multiple key caches + + The idea is to have a thread safe hash on the table name, + with a default key cache value that is returned if the table name is not in + the cache. +*/ + +#include "maria_def.h" +#include "ma_pagecache.h" +#include <hash.h> +#include <m_string.h> +#include "../../mysys/my_safehash.h" + +/***************************************************************************** + Functions to handle the pagecache objects +*****************************************************************************/ + +/* Variable to store all key cache objects */ +static SAFE_HASH pagecache_hash; + + +my_bool multi_pagecache_init(void) +{ + return safe_hash_init(&pagecache_hash, 16, (uchar*) maria_pagecache); +} + + +void multi_pagecache_free(void) +{ + safe_hash_free(&pagecache_hash); +} + +/* + Get a key cache to be used for a specific table. + + SYNOPSIS + multi_pagecache_search() + key key to find (usually table path) + length Length of key. 
+ def Default value if no key cache + + NOTES + This function is coded in such a way that we will return the + default key cache even if one never called multi_pagecache_init. + This will ensure that it works with old MyISAM clients. + + RETURN + key cache to use +*/ + +PAGECACHE *multi_pagecache_search(uchar *key, uint length, + PAGECACHE *def) +{ + if (!pagecache_hash.hash.records) + return def; + return (PAGECACHE*) safe_hash_search(&pagecache_hash, key, length, + (void*) def); +} + + +/* + Associate a key cache with a key + + + SYNOPSIS + multi_pagecache_set() + key key (path to table etc..) + length Length of key + pagecache cache to associate with the table + + NOTES + This can be used both to insert a new entry and change an existing + entry +*/ + + +my_bool multi_pagecache_set(const uchar *key, uint length, + PAGECACHE *pagecache) +{ + return safe_hash_set(&pagecache_hash, key, length, (uchar*) pagecache); +} + + +void multi_pagecache_change(PAGECACHE *old_data, + PAGECACHE *new_data) +{ + safe_hash_change(&pagecache_hash, (uchar*) old_data, (uchar*) new_data); +} diff --git a/storage/maria/ma_pagecrc.c b/storage/maria/ma_pagecrc.c new file mode 100644 index 00000000..4e1389b1 --- /dev/null +++ b/storage/maria/ma_pagecrc.c @@ -0,0 +1,397 @@ +/* Copyright (C) 2007-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" + + +/** + @brief calculate crc of the page avoiding special values + + @param start The value to start CRC (we use page number here) + @param data data pointer + @param length length of the data + + @return crc of the page without special values +*/ + +static uint32 maria_page_crc(uint32 start, uchar *data, uint length) +{ + uint32 crc= my_checksum(start, data, length); + + /* we need this assert to get following comparison working */ + compile_time_assert(MARIA_NO_CRC_BITMAP_PAGE == + MARIA_NO_CRC_NORMAL_PAGE - 1 && + MARIA_NO_CRC_NORMAL_PAGE == 0xffffffff); + if (crc >= MARIA_NO_CRC_BITMAP_PAGE) + crc= MARIA_NO_CRC_BITMAP_PAGE - 1; + + return(crc); +} + +/** + @brief Maria pages read callback (checks the page CRC) + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr pointer to MARIA_SHARE + @param no_crc_val Value which means CRC absence + (MARIA_NO_CRC_NORMAL_PAGE or MARIA_NO_CRC_BITMAP_PAGE) + @param data_length length of data to calculate CRC + + @retval 0 OK + @retval 1 Error +*/ + +my_bool maria_page_crc_check(uchar *page, + pgcache_page_no_t page_no, + MARIA_SHARE *share, + uint32 no_crc_val, + int data_length) +{ + uint32 crc= uint4korr(page + share->block_size - CRC_SIZE), new_crc; + my_bool res; + DBUG_ENTER("maria_page_crc_check"); + + DBUG_ASSERT((uint)data_length <= share->block_size - CRC_SIZE); + + /* we need this assert to get following comparison working */ + compile_time_assert(MARIA_NO_CRC_BITMAP_PAGE == + MARIA_NO_CRC_NORMAL_PAGE - 1 && + MARIA_NO_CRC_NORMAL_PAGE == 0xffffffff); + /* + If crc is no_crc_val then + the page has no crc, so there is nothing to check. 
+ */ + if (crc >= MARIA_NO_CRC_BITMAP_PAGE) + { + DBUG_PRINT("info", ("No crc: %lu crc: %lu page: %lu ", + (ulong) no_crc_val, (ulong) crc, (ulong) page_no)); + if (crc != no_crc_val) + { + my_errno= HA_ERR_WRONG_CRC; + DBUG_PRINT("error", ("Wrong no CRC value")); + DBUG_RETURN(1); + } + DBUG_RETURN(0); + } + new_crc= maria_page_crc((uint32) page_no, page, data_length); + DBUG_ASSERT(new_crc != no_crc_val); + res= MY_TEST(new_crc != crc); + if (res) + { + /* + Bitmap pages may be totally zero filled in some cases. + This happens when we get a crash after the pagecache has written + out a page that is on a newly created bitmap page and we get + a crash before the bitmap page is written out. + + We handle this case with the following logic: + When reading, approve of bitmap pages where all bytes are zero + (This is after all a bitmap pages where no data is reserved and + the CRC will be corrected at next write) + */ + if (no_crc_val == MARIA_NO_CRC_BITMAP_PAGE && + crc == 0 && _ma_check_if_zero(page, data_length)) + { + DBUG_PRINT("warning", ("Found bitmap page that was not initialized")); + DBUG_RETURN(0); + } + + DBUG_PRINT("error", ("Page: %lu crc: %lu calculated crc: %lu", + (ulong) page_no, (ulong) crc, (ulong) new_crc)); + my_errno= HA_ERR_WRONG_CRC; + } + DBUG_RETURN(res); +} + + +/** + @brief Maria pages write callback (sets the page CRC for data and index + files) + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK +*/ + +my_bool maria_page_crc_set_normal(PAGECACHE_IO_HOOK_ARGS *args) +{ + uchar *page= args->page; + pgcache_page_no_t page_no= args->pageno; + MARIA_SHARE *share= (MARIA_SHARE *)args->data; + int data_length= share->block_size - CRC_SIZE; + uint32 crc= maria_page_crc((uint32) page_no, page, data_length); + DBUG_ENTER("maria_page_crc_set_normal"); + DBUG_PRINT("info", ("Page %lu crc: %lu", (ulong) page_no, 
(ulong)crc)); + + /* crc is on the stack so it is aligned, pagecache buffer is aligned, too */ + int4store_aligned(page + data_length, crc); + DBUG_RETURN(0); +} + + +/** + @brief Maria pages write callback (sets the page CRC for keys) + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK +*/ + +my_bool maria_page_crc_set_index(PAGECACHE_IO_HOOK_ARGS *args) +{ + uchar *page= args->page; + pgcache_page_no_t page_no= args->pageno; + MARIA_SHARE *share= (MARIA_SHARE *)args->data; + int data_length= _ma_get_page_used(share, page); + uint32 crc= maria_page_crc((uint32) page_no, page, data_length); + DBUG_ENTER("maria_page_crc_set_index"); + DBUG_PRINT("info", ("Page %lu crc: %lu", + (ulong) page_no, (ulong) crc)); + DBUG_ASSERT((uint)data_length <= share->block_size - CRC_SIZE); + /* crc is on the stack so it is aligned, pagecache buffer is aligned, too */ + int4store_aligned(page + share->block_size - CRC_SIZE, crc); + DBUG_RETURN(0); +} + + +/* interface functions */ + + +/** + @brief Maria pages read callback (checks the page CRC) for index/data pages + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr Read callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK + @retval 1 Error +*/ + +my_bool maria_page_crc_check_data(int res, PAGECACHE_IO_HOOK_ARGS *args) +{ + uchar *page= args->page; + pgcache_page_no_t page_no= args->pageno; + MARIA_SHARE *share= (MARIA_SHARE *)args->data; + if (res) + { + return 1; + } + + return (maria_page_crc_check(page, (uint32) page_no, share, + MARIA_NO_CRC_NORMAL_PAGE, + share->block_size - CRC_SIZE)); +} + + +/** + @brief Maria pages read callback (checks the page CRC) for bitmap pages + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr Read callback data pointer (pointer to 
MARIA_SHARE) + + @retval 0 OK + @retval 1 Error +*/ + +my_bool maria_page_crc_check_bitmap(int res, PAGECACHE_IO_HOOK_ARGS *args) +{ + uchar *page= args->page; + pgcache_page_no_t page_no= args->pageno; + MARIA_SHARE *share= (MARIA_SHARE *)args->data; + if (res) + { + return 1; + } + return (maria_page_crc_check(page, (uint32) page_no, share, + MARIA_NO_CRC_BITMAP_PAGE, + share->block_size - CRC_SIZE)); +} + + +/** + @brief Maria pages read callback (checks the page CRC) for index pages + + @param page The page data to check + @param page_no The page number (<offset>/<page length>) + @param data_ptr Read callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK + @retval 1 Error +*/ + +my_bool maria_page_crc_check_index(int res, PAGECACHE_IO_HOOK_ARGS *args) +{ + uchar *page= args->page; + pgcache_page_no_t page_no= args->pageno; + MARIA_SHARE *share= (MARIA_SHARE *)args->data; + uint length= _ma_get_page_used(share, page); + + if (res) + return 1; + if (length > share->block_size - CRC_SIZE) + { + DBUG_PRINT("error", ("Wrong page length: %u", length)); + my_errno= HA_ERR_WRONG_CRC; + return 1; + } + return maria_page_crc_check(page, (uint32) page_no, share, + MARIA_NO_CRC_NORMAL_PAGE, + length); +} + + +/** + @brief Maria pages dummy read callback for temporary tables + + @retval 0 OK + @retval 1 Error +*/ + +my_bool maria_page_crc_check_none(int res, + PAGECACHE_IO_HOOK_ARGS *args + __attribute__((unused))) +{ + return res != 0; +} + + +/** + @brief Maria pages write callback (sets the page filler for index/data) + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK +*/ + +my_bool maria_page_filler_set_normal(PAGECACHE_IO_HOOK_ARGS *args) +{ + uchar *page= args->page; +#ifdef DBUG_ASSERT_EXISTS + pgcache_page_no_t page_no= args->pageno; +#endif + MARIA_SHARE *share= (MARIA_SHARE *)args->data; + 
DBUG_ENTER("maria_page_filler_set_normal"); + DBUG_ASSERT(page_no != 0); /* Catches some simple bugs */ + int4store_aligned(page + share->block_size - CRC_SIZE, + MARIA_NO_CRC_NORMAL_PAGE); + DBUG_RETURN(0); +} + + +/** + @brief Maria pages write callback (sets the page filler for bitmap) + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK +*/ + +my_bool maria_page_filler_set_bitmap(PAGECACHE_IO_HOOK_ARGS *args) +{ + uchar *page= args->page; + MARIA_SHARE *share= (MARIA_SHARE *)args->data; + DBUG_ENTER("maria_page_filler_set_bitmap"); + int4store_aligned(page + share->block_size - CRC_SIZE, + MARIA_NO_CRC_BITMAP_PAGE); + DBUG_RETURN(0); +} + + +/** + @brief Maria pages dummy write callback for temporary tables + + @retval 0 OK +*/ + +my_bool maria_page_filler_set_none(PAGECACHE_IO_HOOK_ARGS *args + __attribute__((unused))) +{ +#ifdef HAVE_valgrind + uchar *page= args->page; + MARIA_SHARE *share= (MARIA_SHARE *)args->data; + int4store_aligned(page + share->block_size - CRC_SIZE, + 0); +#endif + return 0; +} + + +/** + @brief Write failure callback (mark table as corrupted) + + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) +*/ + +void maria_page_write_failure(int error, PAGECACHE_IO_HOOK_ARGS *args) +{ + if (error) + maria_mark_crashed_share((MARIA_SHARE *)args->data); +} + + +/** + @brief Maria flush log if needed + + @param page The page data to set + @param page_no The page number (<offset>/<page length>) + @param data_ptr Write callback data pointer (pointer to MARIA_SHARE) + + @retval 0 OK + @retval 1 error +*/ + +my_bool maria_flush_log_for_page(PAGECACHE_IO_HOOK_ARGS *args) +{ + LSN lsn; + uchar *page= args->page; + MARIA_SHARE *share= (MARIA_SHARE *)args->data; + DBUG_ENTER("maria_flush_log_for_page"); + /* share is 0 here only in unittest */ + DBUG_ASSERT(!share || share->page_type == 
PAGECACHE_LSN_PAGE); + lsn= lsn_korr(page); + if (translog_flush(lsn)) + DBUG_RETURN(1); + /* + Now that the log is written, it's safe to increment the 'open' counter for + the table so that we know it was not closed properly. + */ + if (share && !share->global_changed) + _ma_mark_file_changed_now(share); + DBUG_RETURN(0); +} + + +my_bool maria_flush_log_for_page_none(PAGECACHE_IO_HOOK_ARGS *args + __attribute__((unused))) +{ + return 0; +} + +my_bool maria_page_null_pre_read_hook(PAGECACHE_IO_HOOK_ARGS *args + __attribute__((unused))) +{ + return 0; +} diff --git a/storage/maria/ma_panic.c b/storage/maria/ma_panic.c new file mode 100644 index 00000000..58beec9e --- /dev/null +++ b/storage/maria/ma_panic.c @@ -0,0 +1,139 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "ma_fulltext.h" + +/* + Stop usage of Maria + + SYNOPSIS + maria_panic() + flag HA_PANIC_CLOSE: All maria files (tables and log) are closed. + maria_end() is called. + HA_PANIC_WRITE: All maria files are unlocked and + all changed data in single user maria is + written to file + HA_PANIC_READ All maria files that were locked when + maria_panic(HA_PANIC_WRITE) was done are + locked. 
A maria_readinfo() is done for + all single user files to get changes + in database + + RETURN + 0 ok + # error number in case of error +*/ + +int maria_panic(enum ha_panic_function flag) +{ + int error=0; + LIST *list_element,*next_open; + MARIA_HA *info; + DBUG_ENTER("maria_panic"); + + if (!maria_inited) + DBUG_RETURN(0); + mysql_mutex_lock(&THR_LOCK_maria); + for (list_element=maria_open_list ; list_element ; list_element=next_open) + { + next_open=list_element->next; /* Save if close */ + info=(MARIA_HA*) list_element->data; + switch (flag) { + case HA_PANIC_CLOSE: + /* + If bad luck (if some tables would be used now, which normally does not + happen in MySQL), as we release the mutex, the list may change and so + we may crash. + */ + mysql_mutex_unlock(&THR_LOCK_maria); + if (maria_close(info)) + error=my_errno; + mysql_mutex_lock(&THR_LOCK_maria); + break; + case HA_PANIC_WRITE: /* Do this to free databases */ +#ifdef CANT_OPEN_FILES_TWICE + if (info->s->options & HA_OPTION_READ_ONLY_DATA) + break; +#endif + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE)) + error=my_errno; + if (info->opt_flag & WRITE_CACHE_USED) + if (flush_io_cache(&info->rec_cache)) + error=my_errno; + if (info->opt_flag & READ_CACHE_USED) + { + if (flush_io_cache(&info->rec_cache)) + error=my_errno; + reinit_io_cache(&info->rec_cache,READ_CACHE,0, + (pbool) (info->lock_type != F_UNLCK),1); + } + if (info->lock_type != F_UNLCK && ! 
info->was_locked) + { + info->was_locked=info->lock_type; + if (maria_lock_database(info,F_UNLCK)) + error=my_errno; + } +#ifdef CANT_OPEN_FILES_TWICE + if (info->s->kfile.file >= 0 && mysql_file_close(info->s->kfile.file, MYF(0))) + error = my_errno; + if (info->dfile.file >= 0 && mysql_file_close(info->dfile.file, MYF(0))) + error = my_errno; + info->s->kfile.file= info->dfile.file= -1;/* Files aren't open anymore */ +#endif + break; + case HA_PANIC_READ: /* Restore to before WRITE */ +#ifdef CANT_OPEN_FILES_TWICE + { /* Open closed files */ + char name_buff[FN_REFLEN]; + MARIA_SHARE *share= info->s; + if (share->kfile.file < 0) + { + + if ((share->kfile.file= mysql_file_open(key_file_kfile, + fn_format(name_buff, info->filename, "", + N_NAME_IEXT,4), + info->mode, MYF(MY_WME))) < 0) + error = my_errno; + } + if (info->dfile.file < 0) + { + if ((info->dfile.file= mysql_file_open(key_file_dfile, + fn_format(name_buff, info->filename, + "", N_NAME_DEXT, 4), + info->mode, MYF(MY_WME))) < 0) + error = my_errno; + info->rec_cache.file= info->dfile.file; + } + if (share->bitmap.file.file < 0) + share->bitmap.file.file= info->dfile.file; + } +#endif + if (info->was_locked) + { + if (maria_lock_database(info, info->was_locked)) + error=my_errno; + info->was_locked=0; + } + break; + } + } + mysql_mutex_unlock(&THR_LOCK_maria); + if (flag == HA_PANIC_CLOSE) + maria_end(); + if (!error) + DBUG_RETURN(0); + DBUG_RETURN(my_errno=error); +} /* maria_panic */ diff --git a/storage/maria/ma_preload.c b/storage/maria/ma_preload.c new file mode 100644 index 00000000..60fd9b09 --- /dev/null +++ b/storage/maria/ma_preload.c @@ -0,0 +1,116 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+/*
+  Preload indexes into key cache
+*/
+
+#include "maria_def.h"
+
+
+/*
+  Preload pages of the index file for a table into the key cache
+
+  SYNOPSIS
+    maria_preload()
+      info          open table
+      key_map       map of indexes to preload into key cache
+      ignore_leaves if set, leaf pages are not kept in the cache
+
+  RETURN VALUE
+    0 if a success. error code - otherwise.
+
+  NOTES.
+    Every page between base.keystart and key_file_length is read through
+    the page cache, one page at a time.
+    A page is removed from the cache again when it is not wanted (leaf page
+    while ignore_leaves is set, page marked MARIA_DELETE_KEY_NR, or page of
+    an index whose bit is not set in key_map) and its hit level shows that
+    this call was the one that put it in the cache.
+    We don't yet use preload_buff_size (we read page after page).
+*/
+
+int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves)
+{
+  ulong block_length= 0;
+  uchar *buff;
+  MARIA_SHARE* share= info->s;
+  uint keynr;
+  my_off_t key_file_length= share->state.state.key_file_length;
+  pgcache_page_no_t page_no, page_no_max;
+  PAGECACHE_BLOCK_LINK *page_link;
+  DBUG_ENTER("maria_preload");
+
+  if (!share->state.header.keys || !maria_is_any_key_active(key_map) ||
+      (key_file_length == share->base.keystart))
+    DBUG_RETURN(0);			/* Nothing to preload */
+
+  block_length= share->pagecache->block_size;
+
+  if (!(buff= (uchar *) my_malloc(PSI_INSTRUMENT_ME, block_length, MYF(MY_WME))))
+    DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM);
+
+  if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE))
+    goto err;
+
+  /*
+    Currently when we come here all other open instances of the table have
+    been closed, and we flushed all pages of our own instance, so there
+    cannot be any page of this table in the cache. Thus my_pread() would be
+    safe. But in the future, we will allow more concurrency during
+    preloading, so we use pagecache_read() instead of my_pread() because we
+    observed that on some Linux, concurrent pread() and pwrite() (which
+    could be from a page eviction by another thread) to the same page can
+    make pread() see a half-written page.
+    In this future, we should find a way to read state.key_file_length
+    reliably, handle concurrent shrinks (delete_all_rows()) etc.
+  */
+  for ((page_no= share->base.keystart / block_length),
+       (page_no_max= key_file_length / block_length);
+       page_no < page_no_max; page_no++)
+  {
+    /**
+      @todo instead of reading pages one by one we could have a call
+      pagecache_read_several_pages() which does a single my_pread() for many
+      consecutive pages (like the my_pread() in mi_preload()).
+    */
+    if (pagecache_read(share->pagecache, &share->kfile, page_no,
+                       DFLT_INIT_HITS, buff, share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link) == NULL)
+      goto err;
+    keynr= _ma_get_keynr(share, buff);
+    if (((ignore_leaves && !_ma_test_if_nod(share, buff)) ||
+         keynr == MARIA_DELETE_KEY_NR ||
+         !(key_map & ((ulonglong) 1 << keynr))) &&
+        (pagecache_pagelevel(page_link) == DFLT_INIT_HITS))
+    {
+      /*
+        This page is not interesting, and (last condition above) we are the
+        ones who put it in the cache, so nobody else is interested in it.
+      */
+      if (pagecache_delete_by_link(share->pagecache, page_link,
+                                   PAGECACHE_LOCK_LEFT_WRITELOCKED, FALSE))
+        goto err;
+    }
+    else /* otherwise it stays in cache: */
+      pagecache_unlock_by_link(share->pagecache, page_link,
+                               PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN,
+                               LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, FALSE, FALSE);
+  }
+
+  my_free(buff);
+  DBUG_RETURN(0);
+
+err:
+  my_free(buff);
+  DBUG_RETURN(my_errno= errno); /* NOTE(review): returns errno, not my_errno; presumably the failing call sets errno - confirm */
+}
diff --git a/storage/maria/ma_range.c b/storage/maria/ma_range.c
new file mode 100644
index 00000000..442adc35
--- /dev/null
+++ b/storage/maria/ma_range.c
@@ -0,0 +1,335 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+/*
+  Gives an approximate number of how many records there are between two keys.
+  Used when optimizing queries.
+ */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+static ha_rows _ma_record_pos(MARIA_HA *,const uchar *, key_part_map,
+                              enum ha_rkey_function, ulonglong *);
+static double _ma_search_pos(MARIA_HA *, MARIA_KEY *, uint32, my_off_t,
+                             ulonglong *page);
+static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key);
+
+
+/**
+  @brief Estimate how many records there are in a given range
+
+  @param info            MARIA handler
+  @param inx             Index to use
+  @param min_key         Min key. Is = 0 if no min range
+  @param max_key         Max key.
Is = 0 if no max range + + @note + We should ONLY return 0 if there is no rows in range + + @return Estimated number of rows or error + @retval HA_POS_ERROR error (or we can't estimate number of rows) + @retval number Estimated number of rows +*/ + +ha_rows maria_records_in_range(MARIA_HA *info, int inx, + const key_range *min_key, + const key_range *max_key, page_range *pages) +{ + ha_rows start_pos,end_pos,res; + MARIA_SHARE *share= info->s; + MARIA_KEY key; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_records_in_range"); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(HA_POS_ERROR); + + if (fast_ma_readinfo(info)) + DBUG_RETURN(HA_POS_ERROR); + info->update&= (HA_STATE_CHANGED+HA_STATE_ROW_CHANGED); + keyinfo= share->keyinfo + inx; + if (share->lock_key_trees) + mysql_rwlock_rdlock(&keyinfo->root_lock); + + switch (keyinfo->key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + { + uchar *key_buff; + + /* + The problem is that the optimizer doesn't support + RTree keys properly at the moment. + Hope this will be fixed some day. + But now NULL in the min_key means that we + didn't make the task for the RTree key + and expect BTree functionality from it. + As it's not able to handle such request + we return the error. + */ + if (!min_key) + { + res= HA_POS_ERROR; + break; + } + key_buff= info->last_key.data + share->base.max_key_length; + _ma_pack_key(info, &key, inx, key_buff, + min_key->key, min_key->keypart_map, + (HA_KEYSEG**) 0); + res= maria_rtree_estimate(info, &key, maria_read_vec[min_key->flag]); + res= res ? res : 1; /* Don't return 0 */ + break; + } +#endif + case HA_KEY_ALG_BTREE: + default: + start_pos= (min_key ? + _ma_record_pos(info, min_key->key, min_key->keypart_map, + min_key->flag, &pages->first_page) : + (ha_rows) 0); + end_pos= (max_key ? + _ma_record_pos(info, max_key->key, max_key->keypart_map, + max_key->flag, &pages->last_page) : + info->state->records + (ha_rows) 1); + res= (end_pos < start_pos ? 
(ha_rows) 0 : + (end_pos == start_pos ? (ha_rows) 1 : end_pos-start_pos)); + if (start_pos == HA_POS_ERROR || end_pos == HA_POS_ERROR) + res=HA_POS_ERROR; + } + + if (share->lock_key_trees) + mysql_rwlock_unlock(&keyinfo->root_lock); + fast_ma_writeinfo(info); + + /** + @todo LOCK + If res==0 (no rows), if we need to guarantee repeatability of the search, + we will need to set a next-key lock in this statement. + Also SELECT COUNT(*)... + */ + + DBUG_PRINT("info",("records: %ld",(ulong) (res))); + DBUG_RETURN(res); +} + + + /* Find relative position (in records) for key in index-tree */ + +static ha_rows _ma_record_pos(MARIA_HA *info, const uchar *key_data, + key_part_map keypart_map, + enum ha_rkey_function search_flag, + ulonglong *final_page) +{ + uint inx= (uint) info->lastinx; + uint32 nextflag; + uchar *key_buff; + double pos; + MARIA_KEY key; + DBUG_ENTER("_ma_record_pos"); + DBUG_PRINT("enter",("search_flag: %d",search_flag)); + DBUG_ASSERT(keypart_map); + + key_buff= info->lastkey_buff+info->s->base.max_key_length; + _ma_pack_key(info, &key, inx, key_buff, key_data, keypart_map, + (HA_KEYSEG**) 0); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, &key);); + nextflag=maria_read_vec[search_flag]; + + /* Indicate if we're doing a search on a key prefix */ + if (((((key_part_map)1) << key.keyinfo->keysegs) - 1) != keypart_map) + nextflag |= SEARCH_PART_KEY; + + /* + my_handler.c:ha_compare_text() has a flag 'skip_end_space'. + This is set in my_handler.c:ha_key_cmp() in dependence on the + compare flags 'nextflag' and the column type. + + TEXT columns are of type HA_KEYTYPE_VARTEXT. In this case the + condition is skip_end_space= ((nextflag & (SEARCH_FIND | + SEARCH_UPDATE)) == SEARCH_FIND). + + SEARCH_FIND is used for an exact key search. The combination + SEARCH_FIND | SEARCH_UPDATE is used in write/update/delete + operations with a comment like "Not real duplicates", whatever this + means. 
From the condition above we can see that 'skip_end_space' is + always false for these operations. The result is that trailing space + counts in key comparison and hence, empty strings ('', string length + zero, but not NULL) compare less that strings starting with control + characters and these in turn compare less than strings starting with + blanks. + + When estimating the number of records in a key range, we request an + exact search for the minimum key. This translates into a plain + SEARCH_FIND flag. Using this alone would lead to a 'skip_end_space' + compare. Empty strings would be expected above control characters. + Their keys would not be found because they are located below control + characters. + + This is the reason that we add the SEARCH_UPDATE flag here. It makes + the key estimation compare in the same way like key write operations + do. Only so we will find the keys where they have been inserted. + + Adding the flag unconditionally does not hurt as it is used in the + above mentioned condition only. So it can safely be used together + with other flags. 
+ */ + pos= _ma_search_pos(info, &key, + nextflag | SEARCH_SAVE_BUFF | SEARCH_UPDATE, + info->s->state.key_root[inx], final_page); + if (pos >= 0.0) + { + DBUG_PRINT("exit",("pos: %ld",(ulong) (pos*info->state->records))); + DBUG_RETURN((ulong) (pos*info->state->records+0.5)); + } + DBUG_RETURN(HA_POS_ERROR); +} + + +/** + Find offset for key on index page + + @notes + Modified version of _ma_search() + + @return + @retval 0.0 <= x <= 1.0 +*/ + +static double _ma_search_pos(MARIA_HA *info, MARIA_KEY *key, + uint32 nextflag, my_off_t pos, + ulonglong *final_page) +{ + int flag; + uint keynr, UNINIT_VAR(max_keynr); + my_bool after_key; + uchar *keypos; + double offset; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("_ma_search_pos"); + + if (pos == HA_OFFSET_ERROR) + DBUG_RETURN(0.0); + + if (_ma_fetch_keypage(&page, info, keyinfo, pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, + info->buff, 1)) + goto err; + *final_page= pos; + flag= (*keyinfo->bin_search)(key, &page, nextflag, &keypos, + info->lastkey_buff, &after_key); + keynr= _ma_keynr(&page, keypos, &max_keynr); + + if (flag) + { + if (flag == MARIA_FOUND_WRONG_KEY) + DBUG_RETURN(-1); /* error */ + /* + Didn't found match. keypos points at next (bigger) key + Try to find a smaller, better matching key. + Matches keynr + [0-1] + */ + if (! page.node) + offset= 0.0; + else if ((offset= _ma_search_pos(info, key, nextflag, + _ma_kpos(page.node,keypos), + final_page)) < 0) + DBUG_RETURN(offset); + } + else + { + /* + Found match. Keypos points at the start of the found key. + + For node pages, we are counting underlying trees and for key + pages we are counting keys. + + If this is a node then we have to search backwards to find the + first occurrence of the key. The row position in a node tree + is keynr (starting from 0) + offset for sub tree. If there is + no sub tree to search, then we are at start of next sub tree. 
+
+      If this is not a node, then the current key position is correct.
+    */
+    offset= (page.node) ? 1.0 : 0.0;
+    if ((nextflag & SEARCH_FIND) && page.node &&
+        ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME ||
+         (nextflag & (SEARCH_PREFIX | SEARCH_NO_FIND | SEARCH_LAST |
+                      SEARCH_PART_KEY))))
+    {
+      /*
+        There may be identical keys in the tree. Try to match one of those.
+        Matches keynr + [0-1]
+      */
+      if ((offset= _ma_search_pos(info, key, SEARCH_FIND,
+                                  _ma_kpos(page.node,keypos),
+                                  final_page)) < 0)
+        DBUG_RETURN(offset);                    /* Read error */
+    }
+  }
+  DBUG_PRINT("info",("keynr: %d offset: %g max_keynr: %d nod: %d flag: %d",
+                     keynr,offset,max_keynr,page.node,flag));
+  DBUG_RETURN((keynr + offset) / (max_keynr + MY_TEST(page.node))); /* Position as a fraction of this page */
+err:
+  DBUG_PRINT("exit",("Error: %d",my_errno));
+  DBUG_RETURN (-1.0);
+}
+
+
+/*
+  Get the key number of the current key and the max number of keys in the
+  page
+
+  keynr >= 0 && key_nr <= max_key
+
+  For fixed length keys without transaction ids both values are computed
+  from the key length; otherwise the page is walked key by key with
+  keyinfo->skip_key().
+*/
+
+static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key)
+{
+  uint page_flag, nod_flag, keynr, max_key;
+  uchar t_buff[MARIA_MAX_KEY_BUFF], *pos, *end;
+  const MARIA_KEYDEF *keyinfo= page->keyinfo;
+  MARIA_KEY key;
+
+  page_flag= page->flag;
+  nod_flag=  page->node;
+  pos= page->buff + page->info->s->keypage_header + nod_flag;
+  end= page->buff + page->size;
+
+  if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+      ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+  {
+    *ret_max_key= (uint) (end - pos)/(keyinfo->keylength+nod_flag);
+    return (uint) (keypos - pos)/(keyinfo->keylength+nod_flag);
+  }
+
+  max_key=keynr=0;
+  t_buff[0]=0;                                  /* Safety */
+  key.data= t_buff;
+  key.keyinfo= (MARIA_KEYDEF*) keyinfo;
+
+  while (pos < end)
+  {
+    if (!(pos= (*keyinfo->skip_key)(&key, page_flag, nod_flag, pos)))
+    {
+      DBUG_ASSERT(0);
+      return 0;                                 /* Error; NOTE(review): *ret_max_key is not set on this path */
+    }
+    max_key++;
+    if (pos == keypos)
+      keynr= max_key;
+  }
+  *ret_max_key=max_key;
+  return(keynr);
+}
diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c
new file mode 100644
index 00000000..006c8bef
--- /dev/null
+++ b/storage/maria/ma_recovery.c
@@ -0,0 +1,3939 @@
+/* Copyright (C) 2006, 2007 MySQL AB
+   Copyright (C) 2010, 2013, Monty Program Ab.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+/*
+  WL#3072 Maria recovery
+  First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+/* Here is the implementation of this module */
+
+#include "maria_def.h"
+#include "ma_recovery.h"
+#include "ma_blockrec.h"
+#include "ma_checkpoint.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+#include "ma_recovery_util.h"
+#include "hash.h"
+#include <my_check_opt.h>
+
+struct st_trn_for_recovery /* used only in the REDO phase */
+{
+  LSN group_start_lsn, undo_lsn, first_undo_lsn;
+  TrID long_trid; /* 0 means "no transaction registered for this short id" */
+};
+struct st_table_for_recovery /* used in the REDO and UNDO phase */
+{
+  MARIA_HA *info;
+};
+/* Variables used by all functions of this module. Ok as single-threaded */
+static struct st_trn_for_recovery *all_active_trans; /* indexed by short trid */
+static struct st_table_for_recovery *all_tables;
+static struct st_dirty_page *dirty_pages_pool;
+static LSN current_group_end_lsn;
+#ifndef DBUG_OFF
+/** Current group of REDOs is about this table and only this one */
+static MARIA_HA *current_group_table;
+#endif
+static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */
+/** @brief to avoid writing a checkpoint if recovery did nothing. */
+static my_bool checkpoint_useful;
+static my_bool in_redo_phase;
+static my_bool trns_created;
+static int aria_undo_aborted= 0;
+static ulong skipped_undo_phase; /**< count of UNDO records skipped in the UNDO phase */
+static ulonglong now; /**< for tracking execution time of phases */
+static void (*save_error_handler_hook)(uint, const char *,myf);
+static ulong recovery_warnings; /**< count of warnings */
+HASH tables_to_redo;                          /* For maria_read_log */
+ulong maria_recovery_force_crash_counter;
+TrID max_long_trid= 0; /**< max long trid seen by REDO phase */
+
+#define prototype_redo_exec_hook(R)                                          \
+  static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec)
+
+#define prototype_redo_exec_hook_dummy(R)                                    \
+  static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec        \
+                               __attribute__ ((unused)))
+
+#define prototype_undo_exec_hook(R)                                          \
+  static int exec_UNDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec, TRN *trn)
+
+prototype_redo_exec_hook(LONG_TRANSACTION_ID);
+prototype_redo_exec_hook_dummy(CHECKPOINT);
+prototype_redo_exec_hook(REDO_CREATE_TABLE);
+prototype_redo_exec_hook(REDO_RENAME_TABLE);
+prototype_redo_exec_hook(REDO_REPAIR_TABLE);
+prototype_redo_exec_hook(REDO_DROP_TABLE);
+prototype_redo_exec_hook(FILE_ID);
+prototype_redo_exec_hook(INCOMPLETE_LOG);
+prototype_redo_exec_hook_dummy(INCOMPLETE_GROUP);
+prototype_redo_exec_hook(UNDO_BULK_INSERT);
+prototype_redo_exec_hook(IMPORTED_TABLE);
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD);
+prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL);
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD); /* NOTE(review): repeats the declaration two lines up */
+prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD);
+prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL);
+prototype_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL);
+prototype_redo_exec_hook(REDO_FREE_BLOCKS);
+prototype_redo_exec_hook(REDO_DELETE_ALL);
+prototype_redo_exec_hook(REDO_INDEX);
+prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE);
+prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE);
+prototype_redo_exec_hook(REDO_BITMAP_NEW_PAGE);
+prototype_redo_exec_hook(UNDO_ROW_INSERT);
+prototype_redo_exec_hook(UNDO_ROW_DELETE);
+prototype_redo_exec_hook(UNDO_ROW_UPDATE);
+prototype_redo_exec_hook(UNDO_KEY_INSERT);
+prototype_redo_exec_hook(UNDO_KEY_DELETE);
+prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+prototype_redo_exec_hook(COMMIT);
+prototype_redo_exec_hook(CLR_END);
+prototype_redo_exec_hook(DEBUG_INFO);
+prototype_undo_exec_hook(UNDO_ROW_INSERT);
+prototype_undo_exec_hook(UNDO_ROW_DELETE);
+prototype_undo_exec_hook(UNDO_ROW_UPDATE);
+prototype_undo_exec_hook(UNDO_KEY_INSERT);
+prototype_undo_exec_hook(UNDO_KEY_DELETE);
+prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+prototype_undo_exec_hook(UNDO_BULK_INSERT);
+
+static int run_redo_phase(LSN lsn, LSN end_lsn,
+                          enum maria_apply_log_way apply);
+static uint end_of_redo_phase(my_bool prepare_for_undo_phase);
+static int run_undo_phase(LSN end_undo_lsn, uint uncommitted);
+static void display_record_position(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec,
+                                    uint number);
+static int display_and_apply_record(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec);
+static MARIA_HA *get_MARIA_HA_from_REDO_record(const
+                                               TRANSLOG_HEADER_BUFFER *rec);
+static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
+                                               TRANSLOG_HEADER_BUFFER *rec);
+static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon);
+static LSN parse_checkpoint_record(LSN lsn);
+static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn,
+                            LSN first_undo_lsn);
+static int new_table(uint16 sid, const char *name, LSN lsn_of_file_id);
+static int new_page(uint32 fileid, pgcache_page_no_t pageid, LSN rec_lsn,
+                    struct st_dirty_page *dirty_page);
+static int close_all_tables(void);
+static my_bool close_one_table(const char *name, TRANSLOG_ADDRESS addr);
+static void print_redo_phase_progress(TRANSLOG_ADDRESS addr);
+static void delete_all_transactions();
+
+/** @brief global [out] buffer for translog_read_record(); never shrinks
*/ +static struct +{ + /* + uchar* is more adapted (less casts) than char*, thus we don't use + LEX_STRING. + */ + uchar *str; + size_t length; +} log_record_buffer; +static void enlarge_buffer(const TRANSLOG_HEADER_BUFFER *rec) +{ + if (log_record_buffer.length < rec->record_length) + { + log_record_buffer.length= rec->record_length; + log_record_buffer.str= my_realloc(PSI_INSTRUMENT_ME, log_record_buffer.str, + rec->record_length, + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); + } +} +/** @brief Tells what kind of progress message was printed to the error log */ +static enum recovery_message_type +{ + REC_MSG_NONE= 0, REC_MSG_REDO, REC_MSG_UNDO, REC_MSG_FLUSH +} recovery_message_printed; + + +/* Hook to ensure we get nicer output if we get an error */ + +void maria_recover_error_handler_hook(uint error, const char *str, + myf flags) +{ + if (procent_printed) + { + procent_printed= 0; + fputc('\n', stderr); + fflush(stderr); + } + (*save_error_handler_hook)(error, str, flags); +} + +/* Define this if you want gdb to break in some interesting situations */ +#define ALERT_USER() + +static void print_preamble() +{ + ma_message_no_user(ME_NOTE, "starting recovery"); +} + + +static my_bool table_is_part_of_recovery_set(LEX_STRING *file_name) +{ + uint offset =0; + if (!tables_to_redo.records) + return 1; /* Default, recover table */ + + /* Skip base directory */ + if (file_name->str[0] == '.' && + (file_name->str[1] == '/' || file_name->str[1] == '\\')) + offset= 2; + /* Only recover if table is in hash */ + return my_hash_search(&tables_to_redo, (uchar*) file_name->str + offset, + file_name->length - offset) != 0; +} + +/** + @brief Recovers from the last checkpoint. + + Runs the REDO phase using special structures, then sets up the playground + of runtime: recreates transactions inside trnman, open tables with their + two-byte-id mapping; takes a checkpoint and runs the UNDO phase. Closes all + tables. 
+
+  @return Operation status
+    @retval 0      OK
+    @retval !=0    Error
+*/
+
+int maria_recovery_from_log(void)
+{
+  int res= 1;
+  FILE *trace_file;
+  uint warnings_count;
+#ifdef EXTRA_DEBUG
+  char name_buff[FN_REFLEN];
+#endif
+  DBUG_ENTER("maria_recovery_from_log");
+
+  DBUG_ASSERT(!maria_in_recovery);
+  maria_in_recovery= TRUE;
+
+#ifdef EXTRA_DEBUG
+  fn_format(name_buff, "aria_recovery.trace", maria_data_root, "", MYF(0));
+  trace_file= my_fopen(name_buff, O_WRONLY|O_APPEND|O_CREAT, MYF(MY_WME));
+#else
+  trace_file= NULL; /* no trace file, to be fast */
+#endif
+  tprint(trace_file, "TRACE of the last Aria recovery from mysqld\n");
+  DBUG_ASSERT(maria_pagecache->inited);
+  res= maria_apply_log(LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, 0, MARIA_LOG_APPLY,
+                       trace_file, TRUE, TRUE, &warnings_count);
+  if (!res)
+  {
+    if (warnings_count == 0 && recovery_found_crashed_tables == 0)
+      tprint(trace_file, "SUCCESS\n");
+    else
+      tprint(trace_file, "DOUBTFUL (%u warnings, check previous output)\n",
+             warnings_count);
+  }
+  if (trace_file)
+    my_fclose(trace_file, MYF(0));
+  maria_in_recovery= FALSE;
+  DBUG_RETURN(res);
+}
+
+
+/**
+  @brief Displays and/or applies the log
+
+  @param  from_lsn         LSN from which log reading/applying should start;
+                           LSN_IMPOSSIBLE means "use last checkpoint"
+  @param  end_redo_lsn     Apply until this. LSN_IMPOSSIBLE means until end.
+  @param  end_undo_lsn     Apply all undo >= end_undo_lsn. Set to LSN_MAX if
+                           no undo's should be applied.
+  @param  apply            how log records should be applied or not
+  @param  trace_file       trace file where progress/debug messages will go
+  @param  skip_DDLs_arg    Should DDL records (CREATE/RENAME/DROP/REPAIR)
+                           be skipped by the REDO phase or not
+  @param  take_checkpoints Should we take checkpoints or not.
+  @param[out] warnings_count Count of warnings will be put there
+
+  @todo This trace_file thing is primitive; soon we will make it similar to
+  ma_check_print_warning() etc, and a successful recovery does not need to
+  create a trace file.
But for debugging now it is useful.
+
+  @return Operation status
+    @retval 0      OK
+    @retval !=0    Error
+*/
+
+int maria_apply_log(LSN from_lsn, LSN end_redo_lsn, LSN end_undo_lsn,
+                    enum maria_apply_log_way apply,
+                    FILE *trace_file,
+                    my_bool skip_DDLs_arg,
+                    my_bool take_checkpoints, uint *warnings_count)
+{
+  int error= 0;
+  uint uncommitted_trans;
+  ulonglong old_now;
+  my_bool abort_message_printed= 0;
+  DBUG_ENTER("maria_apply_log");
+
+  DBUG_ASSERT(apply == MARIA_LOG_APPLY || end_undo_lsn == LSN_MAX);
+  DBUG_ASSERT(!maria_multi_threaded);
+  recovery_warnings= recovery_found_crashed_tables= 0;
+  skipped_lsn_err_count= 0;
+  maria_recovery_changed_data= 0;
+  /* checkpoints can happen only if TRNs have been built */
+  DBUG_ASSERT(end_undo_lsn != LSN_MAX || !take_checkpoints);
+  all_active_trans= (struct st_trn_for_recovery *)
+    my_malloc(PSI_INSTRUMENT_ME, (SHORT_TRID_MAX + 1) * sizeof(struct st_trn_for_recovery),
+              MYF(MY_ZEROFILL));
+  all_tables= (struct st_table_for_recovery *)
+    my_malloc(PSI_INSTRUMENT_ME, (SHARE_ID_MAX + 1) * sizeof(struct st_table_for_recovery),
+              MYF(MY_ZEROFILL));
+
+  save_error_handler_hook= error_handler_hook;
+  error_handler_hook= maria_recover_error_handler_hook;
+
+  if (!all_active_trans || !all_tables)
+    goto err;
+
+  if (take_checkpoints && ma_checkpoint_init(0))
+    goto err;
+
+  recovery_message_printed= REC_MSG_NONE;
+  checkpoint_useful= trns_created= FALSE;
+  aria_undo_aborted= 0;
+  tracef= trace_file;
+#ifdef INSTANT_FLUSH_OF_MESSAGES
+  /* enable this for instant flush of messages to trace file */
+  setbuf(tracef, NULL);
+#endif
+  skip_DDLs= skip_DDLs_arg;
+  skipped_undo_phase= 0;
+
+  trnman_init(max_trid_in_control_file);
+
+  if (from_lsn == LSN_IMPOSSIBLE)
+  {
+    if (last_checkpoint_lsn == LSN_IMPOSSIBLE)
+    {
+      from_lsn= translog_first_lsn_in_log(); /* no checkpoint: start from the first LSN of the log */
+      if (unlikely(from_lsn == LSN_ERROR))
+      {
+        trnman_destroy();
+        goto err;
+      }
+    }
+    else
+    {
+      from_lsn= parse_checkpoint_record(last_checkpoint_lsn);
+      if (from_lsn == LSN_ERROR)
+      {
+        trnman_destroy();
+        goto err;
+      }
+    }
+  }
+
+  now= microsecond_interval_timer();
+  in_redo_phase= TRUE;
+  if (run_redo_phase(from_lsn, end_redo_lsn, apply))
+  {
+    ma_message_no_user(0, "Redo phase failed");
+    trnman_destroy();
+    goto err;
+  }
+  trnman_destroy();
+
+  if (end_redo_lsn != LSN_IMPOSSIBLE &&
+      (end_undo_lsn == LSN_MAX || end_undo_lsn == LSN_IMPOSSIBLE))
+  {
+    abort_message_printed= 1;
+    if (!trace_file)
+      fputc('\n', stderr);
+    my_message(HA_ERR_INITIALIZATION,
+               "Maria recovery aborted as end_lsn/end of file was reached",
+               MYF(0));
+    goto err2; /* error stays 0 on this path: err2 only sets it if no abort message was printed */
+  }
+
+  if ((uncommitted_trans=
+       end_of_redo_phase(end_undo_lsn != LSN_MAX)) == (uint)-1)
+  {
+    ma_message_no_user(0, "End of redo phase failed");
+    goto err;
+  }
+  in_redo_phase= FALSE;
+
+  old_now= now;
+  now= microsecond_interval_timer();
+  if (recovery_message_printed == REC_MSG_REDO)
+  {
+    double phase_took= (now - old_now)/1000000.0;
+    /*
+      Detailed progress info goes to stderr, because ma_message_no_user()
+      cannot put several messages on one line.
+    */
+    procent_printed= 1;
+    fprintf(stderr, " (%.1f seconds); ", phase_took);
+    fflush(stderr);
+  }
+
+  /**
+     REDO phase does not fill blocks' rec_lsn, so a checkpoint now would be
+     wrong: if a future recovery used it, the REDO phase would always
+     start from the checkpoint and never from before, wrongly skipping REDOs
+     (tested). Another problem is that the REDO phase uses
+     PAGECACHE_PLAIN_PAGE, while Checkpoint only collects PAGECACHE_LSN_PAGE.
+
+     @todo fix this. pagecache_write() now can have a rec_lsn argument. And we
+     could make a function which goes through pages at end of REDO phase and
+     changes their type.
+  */
+#ifdef FIX_AND_ENABLE_LATER
+  if (take_checkpoints && checkpoint_useful)
+  {
+    /*
+      We take a checkpoint as it can save future recovery work if we crash
+      during the UNDO phase. But we don't flush pages, as UNDOs will change
+      them again probably.
+      If we wanted to take checkpoints in the middle of the REDO phase, at a
+      moment when we haven't reached the end of log so don't have exact data
+      about transactions, we could write a special checkpoint: containing only
+      the list of dirty pages, otherwise to be treated as if it was at the
+      same LSN as the last checkpoint.
+    */
+    if (ma_checkpoint_execute(CHECKPOINT_INDIRECT, FALSE))
+      goto err;
+  }
+#endif
+
+  if (end_undo_lsn != LSN_MAX)
+  {
+    if (run_undo_phase(end_undo_lsn, uncommitted_trans))
+    {
+      ma_message_no_user(0, "Undo phase failed");
+      goto err;
+    }
+    if (aria_undo_aborted)
+      ma_message_no_user(0, "Undo phase aborted in the middle on user request");
+    else if (end_redo_lsn != LSN_IMPOSSIBLE)
+      my_message(HA_ERR_INITIALIZATION,
+                 "Maria recovery aborted as end_lsn followed by end_undo was "
+                 "reached", MYF(0));
+  }
+  else if (uncommitted_trans > 0)
+  {
+    eprint(tracef, "***WARNING: %u uncommitted transactions; some tables may"
+           " be left inconsistent!***", uncommitted_trans);
+    recovery_warnings++;
+  }
+
+  if (skipped_undo_phase)
+  {
+    /*
+      We could want to print a list of tables for which UNDOs were skipped,
+      but not one line per skipped UNDO.
+    */
+    eprint(tracef, "***WARNING: %lu UNDO records skipped in UNDO phase; some"
+           " tables may be left inconsistent!***", skipped_undo_phase);
+    recovery_warnings++;
+  }
+
+  old_now= now;
+  now= microsecond_interval_timer();
+  if (recovery_message_printed == REC_MSG_UNDO)
+  {
+    double phase_took= (now - old_now)/1000000.0;
+    procent_printed= 1;
+    fprintf(stderr, " (%.1f seconds); ", phase_took);
+    fflush(stderr);
+  }
+
+  /*
+    We don't use maria_panic() because it would call maria_end(), and
+    Recovery does not want that (we want to keep some modules initialized
+    for runtime).
+  */
+  if (close_all_tables())
+  {
+    ma_message_no_user(0, "closing of tables failed");
+    goto err;
+  }
+
+  old_now= now;
+  now= microsecond_interval_timer();
+  if (recovery_message_printed == REC_MSG_FLUSH)
+  {
+    double phase_took= (now - old_now)/1000000.0;
+    procent_printed= 1;
+    fprintf(stderr, " (%.1f seconds); ", phase_took);
+    fflush(stderr);
+  }
+
+  if (max_long_trid > max_trid_in_control_file)
+  {
+    if (ma_control_file_write_and_force(last_checkpoint_lsn, last_logno,
+                                        max_long_trid, recovery_failures))
+      goto err;
+  }
+
+  if (take_checkpoints && checkpoint_useful)
+  {
+    /* No dirty pages, all tables are closed, no active transactions, save: */
+    if (ma_checkpoint_execute(CHECKPOINT_FULL, FALSE))
+      goto err;
+  }
+
+  goto end;
+err:
+  tprint(tracef, "\nRecovery of tables with transaction logs FAILED\n");
+err2:
+  if (trns_created)
+    delete_all_transactions();
+  if (!abort_message_printed)
+    error= 1;
+  if (close_all_tables())
+  {
+    ma_message_no_user(0, "closing of tables failed");
+  }
+end:
+  /* Common cleanup for success and failure: restore hook, free module state */
+  error_handler_hook= save_error_handler_hook;
+  my_hash_free(&all_dirty_pages);
+  bzero(&all_dirty_pages, sizeof(all_dirty_pages));
+  my_free(dirty_pages_pool);
+  dirty_pages_pool= NULL;
+  my_free(all_tables);
+  all_tables= NULL;
+  my_free(all_active_trans);
+  all_active_trans= NULL;
+  my_free(log_record_buffer.str);
+  log_record_buffer.str= NULL;
+  log_record_buffer.length= 0;
+  ma_checkpoint_end();
+  *warnings_count= recovery_warnings + recovery_found_crashed_tables;
+  if (recovery_message_printed != REC_MSG_NONE)
+  {
+    if (procent_printed)
+    {
+      procent_printed= 0;
+      fprintf(stderr, "\n");
+      fflush(stderr);
+    }
+    if (!error && !abort_message_printed)
+    {
+      ma_message_no_user(ME_NOTE, "recovery done");
+      maria_recovery_changed_data= 1;
+    }
+  }
+  else if (!error && max_trid_in_control_file != max_long_trid)
+  {
+    /*
+      maria_end() will set max trid in log file so that one can run
+      maria_chk on the tables
+    */
+    maria_recovery_changed_data= 1;
+  }
+
+  if (error && !abort_message_printed)
+  {
+    my_message(HA_ERR_INITIALIZATION,
+               "Aria recovery failed. Please run aria_chk -r on all Aria "
+               "tables (*.MAI) and delete all aria_log.######## files", MYF(0));
+  }
+  procent_printed= 0;
+  /*
+    We don't cleanly close tables if we hit some error (may corrupt them by
+    flushing some wrong blocks made from wrong REDOs). It also leaves their
+    open_count>0, which ensures that --aria-recover, if used, will try to
+    repair them.
+  */
+  DBUG_RETURN(error);
+}
+
+
+/* Print very basic info about the record's header to the trace */
+static void display_record_position(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec,
+                                    uint number)
+{
+  /*
+    if number==0, we're going over records which we had already seen and which
+    form a group, so we indent below the group's end record
+  */
+  tprint(tracef,
+         "%sRec#%u LSN " LSN_FMT " short_trid %u %s(num_type:%u) len %lu\n",
+         number ? "" : " ", number, LSN_IN_PARTS(rec->lsn),
+         rec->short_trid, log_desc->name, rec->type,
+         (ulong)rec->record_length);
+  if (rec->type == LOGREC_DEBUG_INFO)
+  {
+    /* Print some extra information */
+    (*log_desc->record_execute_in_redo_phase)(rec);
+  }
+}
+
+
+static int display_and_apply_record(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec)
+{ /* Runs the record's REDO-phase hook; returns its result, or 1 if the record type has no hook */
+  int error;
+  if (log_desc->record_execute_in_redo_phase == NULL)
+  {
+    /* die on all not-yet-handled records :) */
+    DBUG_ASSERT("one more hook to write" == 0);
+    return 1;
+  }
+  if (rec->type == LOGREC_DEBUG_INFO)
+  {
+    /* Query already printed by display_record_position() */
+    return 0;
+  }
+  if ((error= (*log_desc->record_execute_in_redo_phase)(rec)))
+    eprint(tracef, "Got error %d when executing record %s",
+           my_errno, log_desc->name);
+  return error;
+}
+
+
+prototype_redo_exec_hook(LONG_TRANSACTION_ID)
+{
+  uint16 sid= rec->short_trid;
+  TrID long_trid= all_active_trans[sid].long_trid;
+  /*
+    Any incomplete group should be of an old crash which already had a
+    recovery and thus has logged INCOMPLETE_GROUP which we
must have seen. + */ + DBUG_ASSERT(all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE); + if (long_trid != 0) + { + LSN ulsn= all_active_trans[sid].undo_lsn; + /* + If the first record of that transaction is after 'rec', it's probably + because that transaction was found in the checkpoint record, and then + it's ok, we can forget about that transaction (we'll meet it later + again in the REDO phase) and replace it with the one in 'rec'. + */ + if ((ulsn != LSN_IMPOSSIBLE) && + (cmp_translog_addr(ulsn, rec->lsn) < 0)) + { + char llbuf[22]; + llstr(long_trid, llbuf); + eprint(tracef, "Found an old transaction long_trid %s short_trid %u" + " with same short id as this new transaction, and has neither" + " committed nor rollback (undo_lsn: " LSN_FMT ")", + llbuf, sid, LSN_IN_PARTS(ulsn)); + goto err; + } + } + long_trid= uint6korr(rec->header); + new_transaction(sid, long_trid, LSN_IMPOSSIBLE, LSN_IMPOSSIBLE); + goto end; +err: + ALERT_USER(); + return 1; +end: + return 0; +} + + +static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn, + LSN first_undo_lsn) +{ + char llbuf[22]; + all_active_trans[sid].long_trid= long_id; + llstr(long_id, llbuf); + tprint(tracef, "Transaction long_trid %s short_trid %u starts," + " undo_lsn " LSN_FMT " first_undo_lsn " LSN_FMT "\n", + llbuf, sid, LSN_IN_PARTS(undo_lsn), LSN_IN_PARTS(first_undo_lsn)); + all_active_trans[sid].undo_lsn= undo_lsn; + all_active_trans[sid].first_undo_lsn= first_undo_lsn; + set_if_bigger(max_long_trid, long_id); +} + + +prototype_redo_exec_hook_dummy(CHECKPOINT) +{ + /* the only checkpoint we care about was found via control file, ignore */ + tprint(tracef, "CHECKPOINT found\n"); + return 0; +} + + +prototype_redo_exec_hook_dummy(INCOMPLETE_GROUP) +{ + /* abortion was already made */ + return 0; +} + + +prototype_redo_exec_hook(INCOMPLETE_LOG) +{ + MARIA_HA *info; + + /* We try to get table first, so that we get the table in in the trace log */ + info= get_MARIA_HA_from_REDO_record(rec); + + 
if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + return 0; + } + + if (!info) + { + /* no such table, don't need to warn */ + return 0; + } + + if (maria_is_crashed(info)) + return 0; + + if (info->s->state.is_of_horizon > rec->lsn) + { + /* + This table was repaired at a time after this log entry. + We can assume that all rows were inserted successfully and we don't + have to warn that the inserted data was not logged + */ + return 0; + } + + /* + Example of what can go wrong when replaying DDLs: + CREATE TABLE t (logged); INSERT INTO t VALUES(1) (logged); + ALTER TABLE t ... which does + CREATE a temporary table #sql... (logged) + INSERT data from t into #sql... (not logged) + RENAME #sql TO t (logged) + Removing tables by hand and replaying the log will leave in the + end an empty table "t": missing records. If after the RENAME an INSERT + into t was done, that row had number 1 in its page, executing the + REDO_INSERT_ROW_HEAD on the recreated empty t will fail (assertion + failure in _ma_apply_redo_insert_row_head_or_tail(): new data page is + created whereas rownr is not 0). + So when the server disables logging for ALTER TABLE or CREATE SELECT, it + logs LOGREC_INCOMPLETE_LOG to warn aria_read_log and then the user. + + Another issue is that replaying of DDLs is not correct enough to work if + there was a crash during a DDL (see comment in execution of + REDO_RENAME_TABLE ). + */ + + eprint(tracef, "***WARNING: Aria engine currently logs no records " + "about insertion of data by ALTER TABLE and CREATE SELECT, " + "as they are not necessary for recovery; " + "present applying of log records to table '%s' may well not work." 
+ "***", info->s->index_file_name.str); + + /* Prevent using the table for anything else than undo repair */ + _ma_mark_file_crashed(info->s); + recovery_warnings++; + return 0; +} + + +static my_bool create_database_if_not_exists(const char *name) +{ + char dirname[FN_REFLEN]; + size_t length; + MY_STAT stat_info; + DBUG_ENTER("create_database_if_not_exists"); + + dirname_part(dirname, name, &length); + if (!length) + { + /* Skip files without directores */ + DBUG_RETURN(0); + } + /* + Safety; Don't create files with hard path; + Should never happen with MariaDB + If hard path, then error will be detected when trying to create index file + */ + if (test_if_hard_path(dirname)) + DBUG_RETURN(0); + + if (my_stat(dirname,&stat_info,MYF(0))) + DBUG_RETURN(0); + + + tprint(tracef, "Creating not existing database '%s'\n", dirname); + if (my_mkdir(dirname, 0777, MYF(MY_WME))) + { + eprint(tracef, "***WARNING: Can't create not existing database '%s'", + dirname); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + + + + +prototype_redo_exec_hook(REDO_CREATE_TABLE) +{ + File dfile= -1, kfile= -1; + char *linkname_ptr, filename[FN_REFLEN], *name, *ptr, *ptr2, + *data_file_name, *index_file_name; + uchar *kfile_header; + myf create_flag; + uint flags; + int error= 1, create_mode= O_RDWR | O_TRUNC, i; + MARIA_HA *info= NULL; + uint kfile_size_before_extension, keystart; + DBUG_ENTER("exec_REDO_LOGREC_REDO_CREATE_TABLE"); + + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + DBUG_RETURN(0); + } + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + name= (char *)log_record_buffer.str; + /* + TRUNCATE TABLE and REPAIR USE_FRM call maria_create(), so below we can + find a REDO_CREATE_TABLE for a table which we have open, that's why we + need to look for any open instances and close them first. 
+ */ + if (close_one_table(name, rec->lsn)) + { + eprint(tracef, "Table '%s' got error %d on close", name, my_errno); + ALERT_USER(); + goto end; + } + /* we try hard to get create_rename_lsn, to avoid mistakes if possible */ + info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR, 0); + if (info) + { + MARIA_SHARE *share= info->s; + /* check that we're not already using it */ + if (share->reopen != 1) + { + eprint(tracef, "Table '%s is already open (reopen=%u)", + name, share->reopen); + ALERT_USER(); + goto end; + } + DBUG_ASSERT(share->now_transactional == share->base.born_transactional); + if (!share->base.born_transactional) + { + /* + could be that transactional table was later dropped, and a non-trans + one was renamed to its name, thus create_rename_lsn is 0 and should + not be trusted. + */ + tprint(tracef, "Table '%s' is not transactional, ignoring creation\n", + name); + ALERT_USER(); + error= 0; + goto end; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, "Table '%s' has create_rename_lsn " LSN_FMT " more " + "recent than record, ignoring creation\n", + name, LSN_IN_PARTS(share->state.create_rename_lsn)); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + eprint(tracef, "Table '%s' is crashed, can't recreate it", name); + ALERT_USER(); + goto end; + } + maria_close(info); + info= NULL; + } + else + { + /* one or two files absent, or header corrupted... */ + tprint(tracef, "Table '%s' can't be opened (Error: %d)\n", + name, my_errno); + } + /* if does not exist, or is older, overwrite it */ + ptr= name + strlen(name) + 1; + if ((flags= ptr[0] ? 
HA_DONT_TOUCH_DATA : 0)) + tprint(tracef, ", we will only touch index file"); + ptr++; + kfile_size_before_extension= uint2korr(ptr); + ptr+= 2; + keystart= uint2korr(ptr); + ptr+= 2; + kfile_header= (uchar *)ptr; + ptr+= kfile_size_before_extension; + /* set header lsns */ + ptr2= (char *) kfile_header + sizeof(info->s->state.header) + + MARIA_FILE_CREATE_RENAME_LSN_OFFSET; + for (i= 0; i<3; i++) + { + lsn_store(ptr2, rec->lsn); + ptr2+= LSN_STORE_SIZE; + } + data_file_name= ptr; + ptr+= strlen(data_file_name) + 1; + index_file_name= ptr; + ptr+= strlen(index_file_name) + 1; + /** @todo handle symlinks */ + if (data_file_name[0] || index_file_name[0]) + { + eprint(tracef, "Table '%s' DATA|INDEX DIRECTORY clauses are not handled", + name); + goto end; + } + if (create_database_if_not_exists(name)) + goto end; + fn_format(filename, name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH | MY_APPEND_EXT); + linkname_ptr= NULL; + create_flag= MY_DELETE_OLD; + tprint(tracef, "Table '%s' creating as '%s'\n", name, filename); + if ((kfile= mysql_file_create_with_symlink(key_file_kfile, linkname_ptr, + filename, 0, create_mode, + MYF(MY_WME|create_flag))) < 0) + { + eprint(tracef, "Failed to create index file"); + goto end; + } + if (my_pwrite(kfile, kfile_header, + kfile_size_before_extension, 0, MYF(MY_NABP|MY_WME)) || + mysql_file_chsize(kfile, keystart, 0, MYF(MY_WME))) + { + eprint(tracef, "Failed to write to index file"); + goto end; + } + if (!(flags & HA_DONT_TOUCH_DATA)) + { + fn_format(filename,name,"", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + linkname_ptr= NULL; + create_flag=MY_DELETE_OLD; + if (((dfile= + mysql_file_create_with_symlink(key_file_dfile, linkname_ptr, + filename, 0, create_mode, + MYF(MY_WME | create_flag))) < 0) || + mysql_file_close(dfile, MYF(MY_WME))) + { + eprint(tracef, "Failed to create data file"); + goto end; + } + /* + we now have an empty data file. 
To be able to + _ma_initialize_data_file() we need some pieces of the share to be + correctly filled. So we just open the table (fortunately, an empty + data file does not preclude this). + */ + if (((info= maria_open(name, O_RDONLY, 0, 0)) == NULL) || + _ma_initialize_data_file(info->s, info->dfile.file)) + { + eprint(tracef, "Failed to open new table or write to data file"); + goto end; + } + } + error= 0; +end: + if (kfile >= 0) + error|= mysql_file_close(kfile, MYF(MY_WME)); + if (info != NULL) + error|= maria_close(info); + DBUG_RETURN(error); +} + + +prototype_redo_exec_hook(REDO_RENAME_TABLE) +{ + char *old_name, *new_name; + int error= 1; + MARIA_HA *info= NULL; + my_bool from_table_is_crashed= 0; + DBUG_ENTER("exec_REDO_LOGREC_REDO_RENAME_TABLE"); + + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + DBUG_RETURN(0); + } + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + old_name= (char *)log_record_buffer.str; + new_name= old_name + strlen(old_name) + 1; + tprint(tracef, "Table '%s' to be renamed to '%s'; old-name table ", old_name, + new_name); + /* + Here is why we skip CREATE/DROP/RENAME when doing a recovery from + ha_maria (whereas we do when called from aria_read_log). Consider: + CREATE TABLE t; + RENAME TABLE t to u; + DROP TABLE u; + RENAME TABLE v to u; # crash between index rename and data rename. + And do a Recovery (not removing tables beforehand). + Recovery replays CREATE, then RENAME: the maria_open("t") works, + maria_open("u") does not (no data file) so table "u" is considered + inexistent and so maria_rename() is done which overwrites u's index file, + which is lost. Ok, the data file (v.MAD) is still available, but only a + REPAIR USE_FRM can rebuild the index, which is unsafe and downtime. 
+ So it is preferable to not execute RENAME, and leave the "mess" of files, + rather than possibly destroy a file. DBA will manually rename files. + A safe recovery method would probably require checking the existence of + the index file and of the data file separately (not via maria_open()), and + maybe also to store a create_rename_lsn in the data file too. + For now, all we risk is to leave the mess (half-renamed files) left by the + crash. We however sync files and directories at each file rename. The SQL + layer is anyway not crash-safe for DDLs (except the repartitioning-related + ones). + We replay DDLs in aria_read_log to be able to recreate tables from + scratch. It means that "aria_read_log -a" should not be used on a + database which just crashed during a DDL. And also ALTER TABLE does not + log insertions of records into the temporary table, so replaying may + fail (grep for INCOMPLETE_LOG in files). + */ + info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR, 0); + if (info) + { + MARIA_SHARE *share= info->s; + if (!share->base.born_transactional) + { + tprint(tracef, "is not transactional, ignoring renaming"); + ALERT_USER(); + error= 0; + goto end; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, "has create_rename_lsn " LSN_FMT " more recent than" + " record, ignoring renaming", + LSN_IN_PARTS(share->state.create_rename_lsn)); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + tprint(tracef, "is crashed, can't be used for rename ; new-name table "); + from_table_is_crashed= 1; + } + if (close_one_table(info->s->open_file_name.str, rec->lsn) || + maria_close(info)) + goto end; + info= NULL; + if (!from_table_is_crashed) + tprint(tracef, "is ok for renaming; new-name table "); + } + else /* one or two files absent, or header corrupted... 
*/ + { + tprint(tracef, ", can't be opened, probably does not exist"); + error= 0; + goto end; + } + /* + We must also check the create_rename_lsn of the 'new_name' table if it + exists: otherwise we may, with our rename which overwrites, destroy + another table. For example: + CREATE TABLE t; + RENAME t to u; + DROP TABLE u; + RENAME v to u; # v is an old table, its creation/insertions not in log + And start executing the log (without removing tables beforehand): creates + t, renames it to u (if not testing create_rename_lsn) thus overwriting + old-named v, drops u, and we are stuck, we have lost data. + */ + info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR, 0); + if (info) + { + MARIA_SHARE *share= info->s; + /* We should not have open instances on this table. */ + if (share->reopen != 1) + { + tprint(tracef, "is already open (reopen=%u)", share->reopen); + ALERT_USER(); + goto end; + } + if (!share->base.born_transactional) + { + tprint(tracef, "is not transactional, ignoring renaming"); + ALERT_USER(); + goto drop; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, "has create_rename_lsn " LSN_FMT " more recent than" + " record, ignoring renaming", + LSN_IN_PARTS(share->state.create_rename_lsn)); + /* + We have to drop the old_name table. Consider: + CREATE TABLE t; + CREATE TABLE v; + RENAME TABLE t to u; + DROP TABLE u; + RENAME TABLE v to u; + and apply the log without removing tables beforehand. t will be + created, v too; in REDO_RENAME u will be more recent, but we still + have to drop t otherwise it stays. + */ + goto drop; + } + if (maria_is_crashed(info)) + { + tprint(tracef, "is crashed, can't rename it"); + ALERT_USER(); + goto end; + } + if (maria_close(info)) + goto end; + info= NULL; + /* abnormal situation */ + tprint(tracef, "exists but is older than record, can't rename it"); + goto end; + } + else /* one or two files absent, or header corrupted... 
*/ + tprint(tracef, "can't be opened, probably does not exist"); + + if (from_table_is_crashed) + { + eprint(tracef, "Aborting rename as old table was crashed"); + ALERT_USER(); + goto end; + } + + tprint(tracef, ", renaming '%s'", old_name); + if (maria_rename(old_name, new_name)) + { + eprint(tracef, "Failed to rename table"); + goto end; + } + info= maria_open(new_name, O_RDONLY, 0, 0); + if (info == NULL) + { + eprint(tracef, "Failed to open renamed table"); + goto end; + } + if (_ma_update_state_lsns(info->s, rec->lsn, info->s->state.create_trid, + TRUE, TRUE)) + goto end; + if (maria_close(info)) + goto end; + info= NULL; + error= 0; + goto end; +drop: + tprint(tracef, ", only dropping '%s'", old_name); + if (maria_delete_table(old_name)) + { + eprint(tracef, "Failed to drop table"); + goto end; + } + error= 0; + goto end; +end: + tprint(tracef, "\n"); + if (info != NULL) + error|= maria_close(info); + DBUG_RETURN(error); +} + + +/* + The record may come from REPAIR, ALTER TABLE ENABLE KEYS, OPTIMIZE. +*/ +prototype_redo_exec_hook(REDO_REPAIR_TABLE) +{ + int error= 1; + MARIA_HA *info; + HA_CHECK param; + char *name; + my_bool quick_repair; + DBUG_ENTER("exec_REDO_LOGREC_REDO_REPAIR_TABLE"); + + /* We try to get table first, so that we get the table in in the trace log */ + info= get_MARIA_HA_from_REDO_record(rec); + + if (!info) + { + /* no such table, don't need to warn */ + DBUG_RETURN(0); + } + + if (maria_is_crashed(info)) + { + tprint(tracef, "we skip repairing crashed table\n"); + DBUG_RETURN(0); + } + + if (rec->lsn <= info->s->state.is_of_horizon) + { + DBUG_PRINT("info", ("Table is up to date, skipping redo")); + DBUG_RETURN(0); + } + + /* + Otherwise, the mapping is newer than the table, and our record is newer + than the mapping, so we can repair. 
+ */ + tprint(tracef, " repairing...\n"); + + maria_chk_init(&param); + param.isam_file_name= name= info->s->open_file_name.str; + param.testflag= uint8korr(rec->header + FILEID_STORE_SIZE); + param.tmpdir= maria_tmpdir; + param.max_trid= max_long_trid; + DBUG_ASSERT(maria_tmpdir); + + info->s->state.key_map= uint8korr(rec->header + FILEID_STORE_SIZE + 8); + quick_repair= MY_TEST(param.testflag & T_QUICK); + + if (param.testflag & T_REP_PARALLEL) + { + if (maria_repair_parallel(&param, info, name, quick_repair)) + goto end; + } + else if (param.testflag & T_REP_BY_SORT) + { + if (maria_repair_by_sort(&param, info, name, quick_repair)) + goto end; + } + else if (maria_repair(&param, info, name, quick_repair)) + goto end; + + if (_ma_update_state_lsns(info->s, rec->lsn, trnman_get_min_safe_trid(), + TRUE, !(param.testflag & T_NO_CREATE_RENAME_LSN))) + goto end; + error= 0; + +end: + DBUG_RETURN(error); +} + + +prototype_redo_exec_hook(REDO_DROP_TABLE) +{ + char *name; + int error= 1; + MARIA_HA *info; + if (skip_DDLs) + { + tprint(tracef, "we skip DDLs\n"); + return 0; + } + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + name= (char *)log_record_buffer.str; + tprint(tracef, "Table '%s'", name); + info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR, 0); + if (info) + { + MARIA_SHARE *share= info->s; + if (!share->base.born_transactional) + { + tprint(tracef, ", is not transactional, ignoring removal\n"); + ALERT_USER(); + error= 0; + goto end; + } + if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0) + { + tprint(tracef, ", has create_rename_lsn " LSN_FMT " more recent than" + " record, ignoring removal", + LSN_IN_PARTS(share->state.create_rename_lsn)); + error= 0; + goto end; + } + if (maria_is_crashed(info)) + { + tprint(tracef, ", is crashed, can't drop it"); + ALERT_USER(); + 
goto end; + } + if (close_one_table(info->s->open_file_name.str, rec->lsn) || + maria_close(info)) + goto end; + info= NULL; + /* if it is older, or its header is corrupted, drop it */ + tprint(tracef, ", dropping '%s'", name); + if (maria_delete_table(name)) + { + eprint(tracef, "Failed to drop table"); + goto end; + } + } + else /* one or two files absent, or header corrupted... */ + tprint(tracef,", can't be opened, probably does not exist"); + error= 0; +end: + tprint(tracef, "\n"); + if (info != NULL) + error|= maria_close(info); + return error; +} + + +prototype_redo_exec_hook(FILE_ID) +{ + uint16 sid; + int error= 1; + const char *name; + MARIA_HA *info; + DBUG_ENTER("exec_REDO_LOGREC_FILE_ID"); + + if (cmp_translog_addr(rec->lsn, checkpoint_start) < 0) + { + /* + If that mapping was still true at checkpoint time, it was found in + checkpoint record, no need to recreate it. If that mapping had ended at + checkpoint time (table was closed or repaired), a flush and force + happened and so mapping is not needed. + */ + tprint(tracef, "ignoring because before checkpoint\n"); + DBUG_RETURN(0); + } + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + sid= fileid_korr(log_record_buffer.str); + info= all_tables[sid].info; + if (info != NULL) + { + tprint(tracef, " Closing table '%s'\n", info->s->open_file_name.str); + prepare_table_for_close(info, rec->lsn); + + /* + Ensure that open count is 1 on close. This is needed as the + table may initially had an open_count > 0 when we initially + opened it as the server may have crashed without closing it + properly. As we now have applied all redo's for the table up to + now, we know the table is ok, so it's safe to reset the open + count to 0. 
+ */ + if (info->s->state.open_count != 0 && info->s->reopen == 1) + { + /* let ma_close() mark the table properly closed */ + info->s->state.open_count= 1; + info->s->global_changed= 1; + info->s->changed= 1; + } + if (maria_close(info)) + { + eprint(tracef, "Failed to close table"); + goto end; + } + all_tables[sid].info= NULL; + } + name= (char *)log_record_buffer.str + FILEID_STORE_SIZE; + if (new_table(sid, name, rec->lsn)) + goto end; + error= 0; +end: + DBUG_RETURN(error); +} + + +static int new_table(uint16 sid, const char *name, LSN lsn_of_file_id) +{ + /* + -1 (skip table): close table and return 0; + 1 (error): close table and return 1; + 0 (success): leave table open and return 0. + */ + int error= 1; + MARIA_HA *info; + MARIA_SHARE *share; + my_off_t dfile_len, kfile_len; + DBUG_ENTER("new_table"); + + checkpoint_useful= TRUE; + if ((name == NULL) || (name[0] == 0)) + { + /* + we didn't use DBUG_ASSERT() because such record corruption could + silently pass in the "info == NULL" test below. + */ + tprint(tracef, ", record is corrupted"); + eprint(tracef, "\n***WARNING: %s may be corrupted", name ? name : "NULL"); + info= NULL; + recovery_warnings++; + goto end; + } + tprint(tracef, "Table '%s', id %u", name, sid); + info= maria_open(name, O_RDWR, HA_OPEN_FOR_REPAIR, 0); + if (info == NULL) + { + tprint(tracef, ", is absent (must have been dropped later?)" + " or its header is so corrupted that we cannot open it;" + " we skip it"); + if (my_errno != ENOENT) + { + recovery_found_crashed_tables++; + eprint(tracef, "\n***WARNING: %s could not be opened: Error: %d", + name ? name : "NULL", (int) my_errno); + } + error= 0; + goto end; + } + share= info->s; + /* check that we're not already using it */ + if (share->reopen != 1) + { + tprint(tracef, ", is already open (reopen=%u)\n", share->reopen); + /* + It could be that we have in the log + FILE_ID(t1,10) ... (t1 was flushed) ... 
FILE_ID(t1,12); + */ + if (close_one_table(share->open_file_name.str, lsn_of_file_id)) + goto end; + /* + We should not try to get length of data/index files as the files + are not on disk yet. + */ + _ma_tmp_disable_logging_for_table(info, FALSE); + goto set_lsn_of_file_id; + } + if (!share->base.born_transactional) + { + /* + This can happen if one converts a transactional table to a + not transactional table + */ + tprint(tracef, ", is not transactional. Ignoring open request"); + eprint(tracef, "\n***WARNING: '%s' may be crashed", name); + error= -1; + recovery_warnings++; + goto end; + } + if (cmp_translog_addr(lsn_of_file_id, share->state.create_rename_lsn) <= 0) + { + /* + This can happen if the table was dropped and re-created since this + redo entry or if the table had a bulk insert directly after create, + in which case the create_rename_lsn changed. + */ + tprint(tracef, ", has create_rename_lsn " LSN_FMT " more recent than" + " LOGREC_FILE_ID's LSN " LSN_FMT ", ignoring open request", + LSN_IN_PARTS(share->state.create_rename_lsn), + LSN_IN_PARTS(lsn_of_file_id)); + recovery_warnings++; + error= -1; + goto end; + /* + Note that we tested that before testing corruption; a recent corrupted + table is not a blocker for the present log record. + */ + } + if (maria_is_crashed(info)) + { + tprint(tracef, "\n"); + eprint(tracef, "Table '%s' is crashed, skipping it. Please repair it with" + " aria_chk -r", share->open_file_name.str); + recovery_found_crashed_tables++; + error= -1; /* not fatal, try with other tables */ + goto end; + /* + Note that if a first recovery fails to apply a REDO, it marks the table + corrupted and stops the entire recovery. A second recovery will find the + table is marked corrupted and skip it (and thus possibly handle other + tables). 
+ */ + } + /* don't log any records for this work */ + _ma_tmp_disable_logging_for_table(info, FALSE); + /* execution of some REDO records relies on data_file_length */ + dfile_len= mysql_file_seek(info->dfile.file, 0, SEEK_END, MYF(MY_WME)); + kfile_len= mysql_file_seek(info->s->kfile.file, 0, SEEK_END, MYF(MY_WME)); + if ((dfile_len == MY_FILEPOS_ERROR) || + (kfile_len == MY_FILEPOS_ERROR)) + { + tprint(tracef, ", length unknown\n"); + eprint(tracef, "\n***WARNING: Can't read length of file '%s'", + share->open_file_name.str); + recovery_warnings++; + goto end; + } + if (share->state.state.data_file_length != dfile_len) + { + tprint(tracef, ", has wrong state.data_file_length " + "(fixing it from %llu to %llu)", + (ulonglong) share->state.state.data_file_length, (ulonglong) dfile_len); + share->state.state.data_file_length= dfile_len; + } + if (share->state.state.key_file_length != kfile_len) + { + tprint(tracef, ", has wrong state.key_file_length " + "(fixing it from %llu to %llu)", + (ulonglong) share->state.state.key_file_length, (ulonglong) kfile_len); + share->state.state.key_file_length= kfile_len; + } + if ((dfile_len % share->block_size) || (kfile_len % share->block_size)) + { + tprint(tracef, ", has too short last page"); + /* Recovery will fix this, no error */ + ALERT_USER(); + } + +set_lsn_of_file_id: + /* + This LSN serves in this situation; assume log is: + FILE_ID(6->"t2") REDO_INSERT(6) FILE_ID(6->"t1") CHECKPOINT(6->"t1") + then crash, checkpoint record is parsed and opens "t1" with id 6; assume + REDO phase starts from the REDO_INSERT above: it will wrongly try to + update a page of "t1". With this LSN below, REDO_INSERT can realize the + mapping is newer than itself, and not execute. + Same example is possible with UNDO_INSERT (update of the state). 
+ */ + info->s->lsn_of_file_id= lsn_of_file_id; + all_tables[sid].info= info; + /* + We don't set info->s->id, it would be useless (no logging in REDO phase); + if you change that, know that some records in REDO phase call + _ma_update_state_lsns() which resets info->s->id. + */ + tprint(tracef, ", opened"); + error= 0; +end: + tprint(tracef, "\n"); + if (error) + { + if (info != NULL) + { + /* let maria_close() mark the table properly closed */ + info->s->state.open_count= 1; + info->s->global_changed= 1; + info->s->changed= 1; + maria_close(info); + } + if (error == -1) + error= 0; + } + DBUG_RETURN(error); +} + +/* + NOTE + This is called for REDO_INSERT_ROW_HEAD and REDO_NEW_ROW_HEAD +*/ + +prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD) +{ + int error= 1; + uchar *buff= NULL; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + { + /* + Table was skipped at open time (because later dropped/renamed, not + transactional, or create_rename_lsn newer than LOGREC_FILE_ID), or + record was skipped due to skip_redo_lsn; it is not an error. + */ + return 0; + } + /* + Note that REDO is per page, we still consider it if its transaction + committed long ago and is unknown. + */ + /* + If REDO's LSN is > page's LSN (read from disk), we are going to modify the + page and change its LSN. The normal runtime code stores the UNDO's LSN + into the page. Here storing the REDO's LSN (rec->lsn) would work + (we are not writing to the log here, so don't have to "flush up to UNDO's + LSN"). But in a test scenario where we do updates at runtime, then remove + tables, apply the log and check that this results in the same table as at + runtime, putting the same LSN as runtime had done will decrease + differences. So we use the UNDO's LSN which is current_group_end_lsn. 
+ */ + enlarge_buffer(rec); + if (log_record_buffer.str == NULL) + { + eprint(tracef, "Failed to read allocate buffer for record"); + goto end; + } + if (translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + buff= log_record_buffer.str; + if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn, + HEAD_PAGE, + (rec->type == + LOGREC_REDO_NEW_ROW_HEAD), + buff + FILEID_STORE_SIZE, + buff + + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE, + rec->record_length - + (FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE))) + goto end; + error= 0; +end: + return error; +} + +/* + NOTE + This is called for REDO_INSERT_ROW_TAIL and REDO_NEW_ROW_TAIL +*/ + +prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL) +{ + int error= 1; + uchar *buff; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + buff= log_record_buffer.str; + if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn, + TAIL_PAGE, + (rec->type == + LOGREC_REDO_NEW_ROW_TAIL), + buff + FILEID_STORE_SIZE, + buff + + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE, + rec->record_length - + (FILEID_STORE_SIZE + + PAGE_STORE_SIZE + + DIRPOS_STORE_SIZE))) + goto end; + error= 0; + +end: + return error; +} + + +prototype_redo_exec_hook(REDO_INSERT_ROW_BLOBS) +{ + int error= 1; + uchar *buff; + uint number_of_blobs, number_of_ranges; + pgcache_page_no_t first_page, last_page; + char llbuf1[22], llbuf2[22]; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + if 
(log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + buff= log_record_buffer.str; + if (_ma_apply_redo_insert_row_blobs(info, current_group_end_lsn, + buff, rec->lsn, &number_of_blobs, + &number_of_ranges, + &first_page, &last_page)) + goto end; + llstr(first_page, llbuf1); + llstr(last_page, llbuf2); + tprint(tracef, " %u blobs %u ranges, first page %s last %s", + number_of_blobs, number_of_ranges, llbuf1, llbuf2); + + error= 0; + +end: + tprint(tracef, " \n"); + return error; +} + + +prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + if (_ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn, + HEAD_PAGE, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + if (_ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn, + TAIL_PAGE, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_FREE_BLOCKS) +{ + int error= 1; + uchar *buff; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + + buff= log_record_buffer.str; + if (_ma_apply_redo_free_blocks(info, current_group_end_lsn, rec->lsn, + buff)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL) 
+{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + + if (_ma_apply_redo_free_head_or_tail(info, current_group_end_lsn, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_DELETE_ALL) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + tprint(tracef, " deleting all %lu rows\n", + (ulong)info->s->state.state.records); + if (maria_delete_all_rows(info)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_INDEX) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + + if (_ma_apply_redo_index(info, current_group_end_lsn, + log_record_buffer.str + FILEID_STORE_SIZE, + rec->record_length - FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + +prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + + if (_ma_apply_redo_index_new_page(info, current_group_end_lsn, + log_record_buffer.str + FILEID_STORE_SIZE, + rec->record_length - FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || 
maria_is_crashed(info)) + return 0; + + if (_ma_apply_redo_index_free_page(info, current_group_end_lsn, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_BITMAP_NEW_PAGE) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL || maria_is_crashed(info)) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + goto end; + } + + if (cmp_translog_addr(rec->lsn, checkpoint_start) >= 0) + { + /* + Record is potentially after the bitmap flush made by Checkpoint, so has + to be replayed. It may overwrite a more recent state but that will be + corrected by all upcoming REDOs for data pages. + If the condition is false, we must not apply the record: it is unneeded + and nocive (may not be corrected as REDOs can be skipped due to + dirty-pages list). + */ + if (_ma_apply_redo_bitmap_new_page(info, current_group_end_lsn, + log_record_buffer.str + + FILEID_STORE_SIZE)) + goto end; + } + error= 0; +end: + return error; +} + + +static inline void set_undo_lsn_for_active_trans(uint16 short_trid, LSN lsn) +{ + if (all_active_trans[short_trid].long_trid == 0) + { + /* transaction unknown, so has committed or fully rolled back long ago */ + return; + } + all_active_trans[short_trid].undo_lsn= lsn; + if (all_active_trans[short_trid].first_undo_lsn == LSN_IMPOSSIBLE) + all_active_trans[short_trid].first_undo_lsn= lsn; +} + + +prototype_redo_exec_hook(UNDO_ROW_INSERT) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (info == NULL) + { + /* + Note that we set undo_lsn anyway. 
So that if the transaction is later + rolled back, this UNDO is tried for execution and we get a warning (as + it would then be abnormal that info==NULL). + */ + return 0; + } + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + tprint(tracef, " state has LSN " LSN_FMT " older than record, updating" + " row count\n", LSN_IN_PARTS(share->state.is_of_horizon)); + share->state.state.records++; + if (share->calc_checksum) + { + uchar buff[HA_CHECKSUM_STORE_SIZE]; + if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + HA_CHECKSUM_STORE_SIZE, buff, NULL) != + HA_CHECKSUM_STORE_SIZE) + { + eprint(tracef, "Failed to read record"); + return 1; + } + share->state.state.checksum+= ha_checksum_korr(buff); + } + info->s->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + } + tprint(tracef, " row count: %lu\n", (ulong)info->s->state.state.records); + /* Unpin all pages, stamp them with UNDO's LSN */ + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_ROW_DELETE) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (info == NULL) + return 0; + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + tprint(tracef, " state older than record\n"); + share->state.state.records--; + if (share->calc_checksum) + { + uchar buff[HA_CHECKSUM_STORE_SIZE]; + if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 + + PAGERANGE_STORE_SIZE, + HA_CHECKSUM_STORE_SIZE, buff, NULL) != + HA_CHECKSUM_STORE_SIZE) + { + eprint(tracef, "Failed to read record"); + return 1; + } + share->state.state.checksum+= ha_checksum_korr(buff); + } + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_ROWS | 
STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + } + tprint(tracef, " row count: %lu\n", (ulong)share->state.state.records); + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_ROW_UPDATE) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (info == NULL) + return 0; + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + if (share->calc_checksum) + { + uchar buff[HA_CHECKSUM_STORE_SIZE]; + if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, + HA_CHECKSUM_STORE_SIZE, buff, NULL) != + HA_CHECKSUM_STORE_SIZE) + { + eprint(tracef, "Failed to read record"); + return 1; + } + share->state.state.checksum+= ha_checksum_korr(buff); + } + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + } + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_KEY_INSERT) +{ + MARIA_HA *info; + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (!(info= get_MARIA_HA_from_UNDO_record(rec))) + return 0; + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + const uchar *ptr= rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE; + uint keynr= key_nr_korr(ptr); + if (share->base.auto_key == (keynr + 1)) /* it's auto-increment */ + { + const HA_KEYSEG *keyseg= info->s->keyinfo[keynr].seg; + ulonglong value; + char llbuf[22]; + uchar reversed[MARIA_MAX_KEY_BUFF], *to; + tprint(tracef, " state older than record\n"); + /* we read the record to find the auto_increment value */ + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + to= 
log_record_buffer.str + LSN_STORE_SIZE + FILEID_STORE_SIZE + + KEY_NR_STORE_SIZE; + if (keyseg->flag & HA_SWAP_KEY) + { + /* We put key from log record to "data record" packing format... */ + uchar *key_ptr= to; + uchar *key_end= key_ptr + keyseg->length; + to= reversed + keyseg->length; + do + { + *--to= *key_ptr++; + } while (key_ptr != key_end); + /* ... so that we can read it with: */ + } + value= ma_retrieve_auto_increment(to, keyseg->type); + set_if_bigger(share->state.auto_increment, value); + llstr(share->state.auto_increment, llbuf); + tprint(tracef, " auto-inc %s\n", llbuf); + } + } + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_KEY_DELETE) +{ + MARIA_HA *info; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (!(info= get_MARIA_HA_from_UNDO_record(rec))) + return 0; + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (info == NULL) + return 0; + share= info->s; + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + uint key_nr; + my_off_t page; + key_nr= key_nr_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE); + page= page_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE + + KEY_NR_STORE_SIZE); + share->state.key_root[key_nr]= (page == IMPOSSIBLE_PAGE_NO ? + HA_OFFSET_ERROR : + page * share->block_size); + } + _ma_unpin_all_pages(info, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(UNDO_BULK_INSERT) +{ + /* + If the repair finished it wrote and sync the state. If it didn't finish, + we are going to empty the table and that will fix the state. 
+ */ + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + return 0; +} + + +prototype_redo_exec_hook(IMPORTED_TABLE) +{ + char *name; + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + name= (char *)log_record_buffer.str; + tprint(tracef, "Table '%s' was imported (auto-zerofilled) in this Aria instance\n", name); + return 0; +} + + +prototype_redo_exec_hook(COMMIT) +{ + uint16 sid= rec->short_trid; + TrID long_trid= all_active_trans[sid].long_trid; + char llbuf[22]; + if (long_trid == 0) + { + tprint(tracef, "We don't know about transaction with short_trid %u;" + "it probably committed long ago, forget it\n", sid); + bzero(&all_active_trans[sid], sizeof(all_active_trans[sid])); + return 0; + } + llstr(long_trid, llbuf); + tprint(tracef, "Transaction long_trid %s short_trid %u committed\n", + llbuf, sid); + bzero(&all_active_trans[sid], sizeof(all_active_trans[sid])); +#ifdef MARIA_VERSIONING + /* + if real recovery: + transaction was committed, move it to some separate list for later + purging (but don't purge now! purging may have been started before, we + may find REDO_PURGE records soon). 
+ */ +#endif + return 0; +} + +prototype_redo_exec_hook(CLR_END) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + LSN previous_undo_lsn; + enum translog_record_type undone_record_type; + const LOG_DESC *log_desc; + my_bool row_entry= 0; + uchar *logpos; + DBUG_ENTER("exec_REDO_LOGREC_CLR_END"); + + previous_undo_lsn= lsn_korr(rec->header); + undone_record_type= + clr_type_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE); + log_desc= &log_record_type_descriptor[undone_record_type]; + + set_undo_lsn_for_active_trans(rec->short_trid, previous_undo_lsn); + if (info == NULL) + DBUG_RETURN(0); + share= info->s; + tprint(tracef, " CLR_END was about %s, undo_lsn " LSN_FMT "\n", + log_desc->name, LSN_IN_PARTS(previous_undo_lsn)); + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + logpos= (log_record_buffer.str + LSN_STORE_SIZE + FILEID_STORE_SIZE + + CLR_TYPE_STORE_SIZE); + + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + tprint(tracef, " state older than record\n"); + switch (undone_record_type) { + case LOGREC_UNDO_ROW_DELETE: + row_entry= 1; + share->state.state.records++; + break; + case LOGREC_UNDO_ROW_INSERT: + share->state.state.records--; + share->state.changed|= STATE_NOT_OPTIMIZED_ROWS; + row_entry= 1; + break; + case LOGREC_UNDO_ROW_UPDATE: + row_entry= 1; + break; + case LOGREC_UNDO_KEY_INSERT: + case LOGREC_UNDO_KEY_DELETE: + break; + case LOGREC_UNDO_KEY_INSERT_WITH_ROOT: + case LOGREC_UNDO_KEY_DELETE_WITH_ROOT: + { + uint key_nr; + my_off_t page; + key_nr= key_nr_korr(logpos); + page= page_korr(logpos + KEY_NR_STORE_SIZE); + share->state.key_root[key_nr]= (page == IMPOSSIBLE_PAGE_NO ? 
+ HA_OFFSET_ERROR : + page * share->block_size); + break; + } + case LOGREC_UNDO_BULK_INSERT: + break; + default: + DBUG_ASSERT(0); + } + if (row_entry && share->calc_checksum) + share->state.state.checksum+= ha_checksum_korr(logpos); + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + } + if (row_entry) + tprint(tracef, " row count: %lu\n", (ulong)share->state.state.records); + _ma_unpin_all_pages(info, rec->lsn); + DBUG_RETURN(0); +} + + +/** + Hock to print debug information (like MySQL query) +*/ + +prototype_redo_exec_hook(DEBUG_INFO) +{ + char *data; + enum translog_debug_info_type debug_info; + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record debug record"); + return 1; + } + debug_info= (enum translog_debug_info_type) log_record_buffer.str[0]; + data= (char*) log_record_buffer.str + 1; + switch (debug_info) { + case LOGREC_DEBUG_INFO_QUERY: + tprint(tracef, "Query: %.*s\n", (int) rec->record_length - 1, data); + break; + default: + DBUG_ASSERT(0); + } + return 0; +} + + +/** + In some cases we have to skip execution of an UNDO record during the UNDO + phase. 
+*/ + +static void skip_undo_record(LSN previous_undo_lsn, TRN *trn) +{ + trn->undo_lsn= previous_undo_lsn; + if (previous_undo_lsn == LSN_IMPOSSIBLE) /* has fully rolled back */ + trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn); + skipped_undo_phase++; +} + + +prototype_undo_exec_hook(UNDO_ROW_INSERT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + const uchar *record_ptr; + + if (info == NULL || maria_is_crashed(info)) + { + /* + Unlike for REDOs, if the table was skipped it is abnormal; we have a + transaction to rollback which used this table, as it is not rolled back + it was supposed to hold this table and so the table should still be + there. Skip it (user may have repaired the table with maria_chk because + it was so badly corrupted that a previous recovery failed) but warn. + */ + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED | + STATE_NOT_MOVABLE); + record_ptr= rec->header; + if (share->calc_checksum) + { + /* + We need to read more of the record to put the checksum into the record + buffer used by _ma_apply_undo_row_insert(). + If the table has no live checksum, rec->header will be enough. 
+ */ + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + record_ptr= log_record_buffer.str; + } + + info->trn= trn; + error= _ma_apply_undo_row_insert(info, previous_undo_lsn, + record_ptr + LSN_STORE_SIZE + + FILEID_STORE_SIZE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " row count: %lu\n", (ulong)info->s->state.state.records); + tprint(tracef, " undo_lsn now LSN " LSN_FMT "\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_ROW_DELETE) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_row_delete(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - + (LSN_STORE_SIZE + FILEID_STORE_SIZE)); + info->trn= 0; + tprint(tracef, " row count: %lu\n undo_lsn now LSN " LSN_FMT "\n", + (ulong)share->state.state.records, LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_ROW_UPDATE) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + 
skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_row_update(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - + (LSN_STORE_SIZE + FILEID_STORE_SIZE)); + info->trn= 0; + tprint(tracef, " undo_lsn now LSN " LSN_FMT "\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_KEY_INSERT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_key_insert(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - LSN_STORE_SIZE - + FILEID_STORE_SIZE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN " LSN_FMT "\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_KEY_DELETE) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + 
MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_key_delete(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - LSN_STORE_SIZE - + FILEID_STORE_SIZE, FALSE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN " LSN_FMT "\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL || maria_is_crashed(info)) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + eprint(tracef, "Failed to read record"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_key_delete(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - LSN_STORE_SIZE - + FILEID_STORE_SIZE, TRUE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN " LSN_FMT "\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + 
+prototype_undo_exec_hook(UNDO_BULK_INSERT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + /* Here we don't check for crashed as we can undo the bulk insert */ + if (info == NULL) + { + skip_undo_record(previous_undo_lsn, trn); + return 0; + } + + share= info->s; + share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED | + STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE); + + info->trn= trn; + error= _ma_apply_undo_bulk_insert(info, previous_undo_lsn); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN " LSN_FMT "\n", + LSN_IN_PARTS(trn->undo_lsn)); + return error; +} + + +static int run_redo_phase(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply) +{ + TRANSLOG_HEADER_BUFFER rec; + struct st_translog_scanner_data scanner; + int len; + uint i; + DBUG_ENTER("run_redo_phase"); + + /* install hooks for execution */ +#define install_redo_exec_hook(R) \ + log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \ + exec_REDO_LOGREC_ ## R; +#define install_redo_exec_hook_shared(R,S) \ + log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \ + exec_REDO_LOGREC_ ## S; +#define install_undo_exec_hook(R) \ + log_record_type_descriptor[LOGREC_ ## R].record_execute_in_undo_phase= \ + exec_UNDO_LOGREC_ ## R; + install_redo_exec_hook(LONG_TRANSACTION_ID); + install_redo_exec_hook(CHECKPOINT); + install_redo_exec_hook(REDO_CREATE_TABLE); + install_redo_exec_hook(REDO_RENAME_TABLE); + install_redo_exec_hook(REDO_REPAIR_TABLE); + install_redo_exec_hook(REDO_DROP_TABLE); + install_redo_exec_hook(FILE_ID); + install_redo_exec_hook(INCOMPLETE_LOG); + install_redo_exec_hook(INCOMPLETE_GROUP); + install_redo_exec_hook(REDO_INSERT_ROW_HEAD); + install_redo_exec_hook(REDO_INSERT_ROW_TAIL); + install_redo_exec_hook(REDO_INSERT_ROW_BLOBS); + 
install_redo_exec_hook(REDO_PURGE_ROW_HEAD); + install_redo_exec_hook(REDO_PURGE_ROW_TAIL); + install_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL); + install_redo_exec_hook(REDO_FREE_BLOCKS); + install_redo_exec_hook(REDO_DELETE_ALL); + install_redo_exec_hook(REDO_INDEX); + install_redo_exec_hook(REDO_INDEX_NEW_PAGE); + install_redo_exec_hook(REDO_INDEX_FREE_PAGE); + install_redo_exec_hook(REDO_BITMAP_NEW_PAGE); + install_redo_exec_hook(UNDO_ROW_INSERT); + install_redo_exec_hook(UNDO_ROW_DELETE); + install_redo_exec_hook(UNDO_ROW_UPDATE); + install_redo_exec_hook(UNDO_KEY_INSERT); + install_redo_exec_hook(UNDO_KEY_DELETE); + install_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); + install_redo_exec_hook(COMMIT); + install_redo_exec_hook(CLR_END); + install_undo_exec_hook(UNDO_ROW_INSERT); + install_undo_exec_hook(UNDO_ROW_DELETE); + install_undo_exec_hook(UNDO_ROW_UPDATE); + install_undo_exec_hook(UNDO_KEY_INSERT); + install_undo_exec_hook(UNDO_KEY_DELETE); + install_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); + /* REDO_NEW_ROW_HEAD shares entry with REDO_INSERT_ROW_HEAD */ + install_redo_exec_hook_shared(REDO_NEW_ROW_HEAD, REDO_INSERT_ROW_HEAD); + /* REDO_NEW_ROW_TAIL shares entry with REDO_INSERT_ROW_TAIL */ + install_redo_exec_hook_shared(REDO_NEW_ROW_TAIL, REDO_INSERT_ROW_TAIL); + install_redo_exec_hook(UNDO_BULK_INSERT); + install_undo_exec_hook(UNDO_BULK_INSERT); + install_redo_exec_hook(IMPORTED_TABLE); + install_redo_exec_hook(DEBUG_INFO); + + current_group_end_lsn= LSN_IMPOSSIBLE; +#ifndef DBUG_OFF + current_group_table= NULL; +#endif + + if (unlikely(lsn == LSN_IMPOSSIBLE || lsn == translog_get_horizon())) + { + tprint(tracef, "checkpoint address refers to the log end log or " + "log is empty, nothing to do.\n"); + DBUG_RETURN(0); + } + + len= translog_read_record_header(lsn, &rec); + + if (len == RECHEADER_READ_ERROR) + { + eprint(tracef, "Failed to read header of the first record."); + DBUG_RETURN(1); + } + if (translog_scanner_init(lsn, 1, &scanner, 1)) + { + 
tprint(tracef, "Scanner init failed\n"); + DBUG_RETURN(1); + } + for (i= 1;;i++) + { + uint16 sid= rec.short_trid; + const LOG_DESC *log_desc= &log_record_type_descriptor[rec.type]; + display_record_position(log_desc, &rec, i); + /* + A complete group is a set of log records with an "end mark" record + (e.g. a set of REDOs for an operation, terminated by an UNDO for this + operation); if there is no "end mark" record the group is incomplete and + won't be executed. + */ + if ((log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF) || + (log_desc->record_in_group == LOGREC_LAST_IN_GROUP)) + { + if (all_active_trans[sid].group_start_lsn != LSN_IMPOSSIBLE) + { + if (log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF) + { + /* + Can happen if the transaction got a table write error, then + unlocked tables thus wrote a COMMIT record. Or can be an + INCOMPLETE_GROUP record written by a previous recovery. + */ + tprint(tracef, "\nDiscarding incomplete group before this record\n"); + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + } + else + { + struct st_translog_scanner_data scanner2; + TRANSLOG_HEADER_BUFFER rec2; + /* + There is a complete group for this transaction, containing more + than this event. + */ + tprint(tracef, " ends a group:\n"); + len= + translog_read_record_header(all_active_trans[sid].group_start_lsn, + &rec2); + if (len < 0) /* EOF or error */ + { + tprint(tracef, "Cannot find record where it should be\n"); + goto err; + } + if (lsn_end != LSN_IMPOSSIBLE && rec2.lsn >= lsn_end) + { + tprint(tracef, + "lsn_redo_end reached at " LSN_FMT ". 
" + "Skipping rest of redo entries\n", + LSN_IN_PARTS(rec2.lsn)); + translog_destroy_scanner(&scanner); + translog_free_record_header(&rec); + DBUG_RETURN(0); + } + + if (translog_scanner_init(rec2.lsn, 1, &scanner2, 1)) + { + tprint(tracef, "Scanner2 init failed\n"); + goto err; + } + current_group_end_lsn= rec.lsn; + do + { + if (rec2.short_trid == sid) /* it's in our group */ + { + const LOG_DESC *log_desc2= &log_record_type_descriptor[rec2.type]; + display_record_position(log_desc2, &rec2, 0); + if (apply == MARIA_LOG_CHECK) + { + translog_size_t read_len; + enlarge_buffer(&rec2); + read_len= + translog_read_record(rec2.lsn, 0, rec2.record_length, + log_record_buffer.str, NULL); + if (read_len != rec2.record_length) + { + tprint(tracef, "Cannot read record's body: read %u of" + " %u bytes\n", read_len, rec2.record_length); + translog_destroy_scanner(&scanner2); + translog_free_record_header(&rec2); + goto err; + } + } + if (apply == MARIA_LOG_APPLY && + display_and_apply_record(log_desc2, &rec2)) + { + translog_destroy_scanner(&scanner2); + translog_free_record_header(&rec2); + goto err; + } + } + translog_free_record_header(&rec2); + len= translog_read_next_record_header(&scanner2, &rec2); + if (len < 0) /* EOF or error */ + { + tprint(tracef, "Cannot find record where it should be\n"); + translog_destroy_scanner(&scanner2); + translog_free_record_header(&rec2); + goto err; + } + } + while (rec2.lsn < rec.lsn); + /* group finished */ + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + current_group_end_lsn= LSN_IMPOSSIBLE; /* for debugging */ + display_record_position(log_desc, &rec, 0); + translog_destroy_scanner(&scanner2); + translog_free_record_header(&rec2); + } + } + if (apply == MARIA_LOG_APPLY && + display_and_apply_record(log_desc, &rec)) + goto err; +#ifndef DBUG_OFF + current_group_table= NULL; +#endif + } + else /* record does not end group */ + { + /* just record the fact, can't know if can execute yet */ + if 
(all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE) + { + /* group not yet started */ + all_active_trans[sid].group_start_lsn= rec.lsn; + } + } + translog_free_record_header(&rec); + len= translog_read_next_record_header(&scanner, &rec); + if (len < 0) + { + switch (len) + { + case RECHEADER_READ_EOF: + tprint(tracef, "*** End of log ***\n"); + break; + case RECHEADER_READ_ERROR: + tprint(tracef, "Error reading log\n"); + goto err; + } + break; + } + } + translog_destroy_scanner(&scanner); + translog_free_record_header(&rec); + if (recovery_message_printed == REC_MSG_REDO) + { + fprintf(stderr, " 100%%"); + fflush(stderr); + procent_printed= 1; /* Will be follwed by time */ + } + DBUG_RETURN(0); + +err: + translog_destroy_scanner(&scanner); + translog_free_record_header(&rec); + DBUG_RETURN(1); +} + + +/** + @brief Informs about any aborted groups or uncommitted transactions, + prepares for the UNDO phase if needed. + + @note Observe that it may init trnman. +*/ +static uint end_of_redo_phase(my_bool prepare_for_undo_phase) +{ + uint sid, uncommitted= 0; + char llbuf[22]; + LSN addr; + + my_hash_free(&all_dirty_pages); + /* + hash_free() can be called multiple times probably, but be safe if that + changes + */ + bzero(&all_dirty_pages, sizeof(all_dirty_pages)); + my_free(dirty_pages_pool); + dirty_pages_pool= NULL; + + llstr(max_long_trid, llbuf); + tprint(tracef, "Maximum transaction long id seen: %s\n", llbuf); + llstr(max_trid_in_control_file, llbuf); + tprint(tracef, "Maximum transaction long id seen in control file: %s\n", + llbuf); + /* + If logs were deleted, or lost, trid in control file is needed to set + trnman's generator: + */ + set_if_bigger(max_long_trid, max_trid_in_control_file); + if (prepare_for_undo_phase && trnman_init(max_long_trid)) + return -1; + + trns_created= TRUE; + + for (sid= 0; sid <= SHORT_TRID_MAX; sid++) + { + TrID long_trid= all_active_trans[sid].long_trid; + LSN gslsn= all_active_trans[sid].group_start_lsn; + TRN *trn; + if 
(gslsn != LSN_IMPOSSIBLE) + { + tprint(tracef, "Group at LSN " LSN_FMT " short_trid %u incomplete\n", + LSN_IN_PARTS(gslsn), sid); + all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE; + } + if (all_active_trans[sid].undo_lsn != LSN_IMPOSSIBLE) + { + llstr(long_trid, llbuf); + tprint(tracef, "Transaction long_trid %s short_trid %u uncommitted\n", + llbuf, sid); + /* + dummy_transaction_object serves only for DDLs, where there is never a + rollback or incomplete group. And unknown transactions (which have + long_trid==0) should have undo_lsn==LSN_IMPOSSIBLE. + */ + if (long_trid ==0) + { + eprint(tracef, "Transaction with long_trid 0 should not roll back"); + ALERT_USER(); + return -1; + } + if (prepare_for_undo_phase) + { + if ((trn= trnman_recreate_trn_from_recovery(sid, long_trid)) == NULL) + return -1; + trn->undo_lsn= all_active_trans[sid].undo_lsn; + trn->first_undo_lsn= all_active_trans[sid].first_undo_lsn | + TRANSACTION_LOGGED_LONG_ID; /* because trn is known in log */ + if (gslsn != LSN_IMPOSSIBLE) + { + /* + UNDO phase will log some records. So, a future recovery may see: + REDO(from incomplete group) - REDO(from rollback) - CLR_END + and thus execute the first REDO (finding it in "a complete + group"). To prevent that: + */ + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS]; + LSN lsn; + if (translog_write_record(&lsn, LOGREC_INCOMPLETE_GROUP, + trn, NULL, 0, + TRANSLOG_INTERNAL_PARTS, log_array, + NULL, NULL)) + return -1; + } + } + uncommitted++; + } +#ifdef MARIA_VERSIONING + /* + If real recovery: if transaction was committed, move it to some separate + list for soon purging. 
+ */ +#endif + } + + my_free(all_active_trans); + all_active_trans= NULL; + + /* + The UNDO phase uses some normal run-time code of ROLLBACK: generates log + records, etc; prepare tables for that + */ + addr= translog_get_horizon(); + for (sid= 0; sid <= SHARE_ID_MAX; sid++) + { + MARIA_HA *info= all_tables[sid].info; + if (info != NULL) + { + prepare_table_for_close(info, addr); + /* + But we don't close it; we leave it available for the UNDO phase; + it's likely that the UNDO phase will need it. + */ + if (prepare_for_undo_phase) + translog_assign_id_to_share_from_recovery(info->s, sid); + } + } + return uncommitted; +} + + +static int run_undo_phase(LSN end_undo_lsn, uint uncommitted) +{ + LSN last_undo __attribute__((unused)); + DBUG_ENTER("run_undo_phase"); + + if (uncommitted > 0) + { + checkpoint_useful= TRUE; + if (tracef != stdout) + { + if (recovery_message_printed == REC_MSG_NONE) + print_preamble(); + fprintf(stderr, "transactions to roll back:"); + recovery_message_printed= REC_MSG_UNDO; + } + tprint(tracef, "%u transactions will be rolled back\n", uncommitted); + for( ; ; ) + { + char llbuf[22]; + TRN *trn; + if (recovery_message_printed == REC_MSG_UNDO) + { + fprintf(stderr, " %u", uncommitted); + fflush(stderr); + } + if ((uncommitted--) == 0) + { + if (aria_undo_aborted <= 0) + { + aria_undo_aborted= 0; + break; + } + } + if (aria_undo_aborted) + { + tprint(tracef, + "lsn_undo_end found. 
Skipping rest of undo entries\n"); + break; + } + + trn= trnman_get_any_trn(); + DBUG_ASSERT(trn != NULL); + llstr(trn->trid, llbuf); + tprint(tracef, "Rolling back transaction of long id %s\n", llbuf); + last_undo= trn->undo_lsn + 1; + + /* Execute all undo entries */ + while (trn->undo_lsn) + { + TRANSLOG_HEADER_BUFFER rec; + LOG_DESC *log_desc; + DBUG_ASSERT(trn->undo_lsn < last_undo); + last_undo= trn->undo_lsn; + + if (translog_read_record_header(trn->undo_lsn, &rec) == + RECHEADER_READ_ERROR) + DBUG_RETURN(1); + log_desc= &log_record_type_descriptor[rec.type]; + display_record_position(log_desc, &rec, 0); + if (log_desc->record_execute_in_undo_phase(&rec, trn)) + { + eprint(tracef, "Got error %d when executing undo %s", my_errno, + log_desc->name); + translog_free_record_header(&rec); + DBUG_RETURN(1); + } + translog_free_record_header(&rec); + + if (last_undo == end_undo_lsn) + { + aria_undo_aborted= trn->undo_lsn ? 1 : -1; + break; + } + } + + /* Force a crash to test recovery of recovery */ + if (maria_recovery_force_crash_counter) + { + DBUG_ASSERT(--maria_recovery_force_crash_counter > 0); + } + + trn->undo_lsn= 0; /* Avoid abort in trnman_rollbac_trn */ + if (trnman_rollback_trn(trn)) + DBUG_RETURN(1); + /* We could want to span a few threads (4?) instead of 1 */ + /* In the future, we want to have this phase *online* */ + } + } + DBUG_RETURN(0); +} + + +/** + In case of error in recovery, deletes all transactions from the transaction + manager so that this module does not assert. + + @note no checkpoint should be taken as those transactions matter for the + next recovery (they still haven't been properly dealt with). 
+*/ + +static void delete_all_transactions() +{ + for( ; ; ) + { + TRN *trn= trnman_get_any_trn(); + if (trn == NULL) + break; + trn->undo_lsn= trn->first_undo_lsn= LSN_IMPOSSIBLE; + trnman_rollback_trn(trn); /* ignore error */ + } +} + + +/** + @brief re-enables transactionality, updates is_of_horizon + + @param info table + @param horizon address to set is_of_horizon +*/ + +static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon) +{ + MARIA_SHARE *share= info->s; + /* + In a fully-forward REDO phase (no checkpoint record), + state is now at least as new as the LSN of the current record. It may be + newer, in case we are seeing a LOGREC_FILE_ID which tells us to close a + table, but that table was later modified further in the log. + But if we parsed a checkpoint record, it may be this way in the log: + FILE_ID(6->t2)... FILE_ID(6->t1)... CHECKPOINT(6->t1) + Checkpoint parsing opened t1 with id 6; first FILE_ID above is going to + make t1 close; the first condition below is however false (when checkpoint + was taken it increased is_of_horizon) and so it works. For safety we + add the second condition. + */ + if (cmp_translog_addr(share->state.is_of_horizon, horizon) < 0 && + cmp_translog_addr(share->lsn_of_file_id, horizon) < 0) + { + share->state.is_of_horizon= horizon; + _ma_state_info_write_sub(share->kfile.file, &share->state, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET); + } + + /* + Ensure that info->state is up to date as + _ma_renable_logging_for_table() is depending on this + */ + *info->state= info->s->state.state; + + /* + This leaves PAGECACHE_PLAIN_PAGE pages into the cache, while the table is + going to switch back to transactional. So the table will be a mix of + pages, which is ok as long as we don't take any checkpoints until all + tables get closed at the end of the UNDO phase. 
+ */ + _ma_reenable_logging_for_table(info, FALSE); + info->trn= NULL; /* safety */ +} + + +static MARIA_HA *get_MARIA_HA_from_REDO_record(const + TRANSLOG_HEADER_BUFFER *rec) +{ + uint16 sid; + pgcache_page_no_t UNINIT_VAR(page); + MARIA_HA *info; + MARIA_SHARE *share; + char llbuf[22]; + my_bool index_page_redo_entry= FALSE, page_redo_entry= FALSE; + + print_redo_phase_progress(rec->lsn); + sid= fileid_korr(rec->header); + switch (rec->type) { + /* not all REDO records have a page: */ + case LOGREC_REDO_INDEX_NEW_PAGE: + case LOGREC_REDO_INDEX: + case LOGREC_REDO_INDEX_FREE_PAGE: + index_page_redo_entry= 1; + /* fall through*/ + case LOGREC_REDO_INSERT_ROW_HEAD: + case LOGREC_REDO_INSERT_ROW_TAIL: + case LOGREC_REDO_PURGE_ROW_HEAD: + case LOGREC_REDO_PURGE_ROW_TAIL: + case LOGREC_REDO_NEW_ROW_HEAD: + case LOGREC_REDO_NEW_ROW_TAIL: + case LOGREC_REDO_FREE_HEAD_OR_TAIL: + page_redo_entry= TRUE; + page= page_korr(rec->header + FILEID_STORE_SIZE); + llstr(page, llbuf); + break; + case LOGREC_REDO_FREE_BLOCKS: + /* + We are checking against the dirty pages in _ma_apply_redo_free_blocks() + */ + break; + default: + break; + } + tprint(tracef, " For table of short id %u", sid); + info= all_tables[sid].info; +#ifndef DBUG_OFF + DBUG_ASSERT(current_group_table == NULL || current_group_table == info); + current_group_table= info; +#endif + if (info == NULL) + { + tprint(tracef, ", table skipped, so skipping record\n"); + return NULL; + } + share= info->s; + tprint(tracef, ", '%s'", share->open_file_name.str); + DBUG_ASSERT(in_redo_phase); + if (!table_is_part_of_recovery_set(&share->open_file_name)) + { + tprint(tracef, ", skipped by user\n"); + return NULL; + } + + if (cmp_translog_addr(rec->lsn, share->lsn_of_file_id) <= 0) + { + /* + This can happen only if processing a record before the checkpoint + record. 
+ id->name mapping is newer than REDO record: for sure the table subject + of the REDO has been flushed and forced (id re-assignment implies this); + REDO can be ignored (and must be, as we don't know what this subject + table was). + */ + DBUG_ASSERT(cmp_translog_addr(rec->lsn, checkpoint_start) < 0); + tprint(tracef, ", table's LOGREC_FILE_ID has LSN " LSN_FMT " more recent" + " than record, skipping record", + LSN_IN_PARTS(share->lsn_of_file_id)); + return NULL; + } + if (cmp_translog_addr(rec->lsn, share->state.skip_redo_lsn) <= 0) + { + /* probably a bulk insert repair */ + tprint(tracef, ", has skip_redo_lsn " LSN_FMT " more recent than" + " record, skipping record\n", + LSN_IN_PARTS(share->state.skip_redo_lsn)); + return NULL; + } + /* detect if an open instance of a dropped table (internal bug) */ + DBUG_ASSERT(share->last_version != 0); + if (page_redo_entry) + { + /* + Consult dirty pages list. + REDO_INSERT_ROW_BLOBS will consult list by itself, as it covers several + pages. + */ + if (_ma_redo_not_needed_for_page(sid, rec->lsn, page, + index_page_redo_entry)) + return NULL; + } + /* + So we are going to read the page, and if its LSN is older than the + record's we will modify the page + */ + tprint(tracef, ", applying record\n"); + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */ + return info; +} + + +static MARIA_HA *get_MARIA_HA_from_UNDO_record(const + TRANSLOG_HEADER_BUFFER *rec) +{ + uint16 sid; + MARIA_HA *info; + MARIA_SHARE *share; + + sid= fileid_korr(rec->header + LSN_STORE_SIZE); + tprint(tracef, " For table of short id %u", sid); + info= all_tables[sid].info; +#ifndef DBUG_OFF + DBUG_ASSERT(!in_redo_phase || + current_group_table == NULL || current_group_table == info); + current_group_table= info; +#endif + if (info == NULL) + { + tprint(tracef, ", table skipped, so skipping record\n"); + return NULL; + } + share= info->s; + tprint(tracef, ", '%s'", share->open_file_name.str); + + if 
(!table_is_part_of_recovery_set(&share->open_file_name)) + { + tprint(tracef, ", skipped by user\n"); + return NULL; + } + + if (cmp_translog_addr(rec->lsn, share->lsn_of_file_id) <= 0) + { + tprint(tracef, ", table's LOGREC_FILE_ID has LSN " LSN_FMT " more recent" + " than record, skipping record", + LSN_IN_PARTS(share->lsn_of_file_id)); + return NULL; + } + if (in_redo_phase && + cmp_translog_addr(rec->lsn, share->state.skip_redo_lsn) <= 0) + { + /* probably a bulk insert repair */ + tprint(tracef, ", has skip_redo_lsn " LSN_FMT " more recent than" + " record, skipping record\n", + LSN_IN_PARTS(share->state.skip_redo_lsn)); + return NULL; + } + DBUG_ASSERT(share->last_version != 0); + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */ + if (in_redo_phase) + tprint(tracef, ", remembering undo\n"); + else + tprint(tracef, ", applying record\n"); + return info; +} + + +/** + @brief Parses checkpoint record. + + Builds from it the dirty_pages list (a hash), opens tables and maps them to + their 2-byte IDs, recreates transactions (not real TRNs though). 
+ + @return LSN from where in the log the REDO phase should start + @retval LSN_ERROR error + @retval other ok +*/ + +static LSN parse_checkpoint_record(LSN lsn) +{ + ulong i; + ulonglong nb_dirty_pages; + TRANSLOG_HEADER_BUFFER rec; + TRANSLOG_ADDRESS start_address; + int len; + uint nb_active_transactions, nb_committed_transactions, nb_tables; + uchar *ptr; + LSN minimum_rec_lsn_of_active_transactions, minimum_rec_lsn_of_dirty_pages; + struct st_dirty_page *next_dirty_page_in_pool; + + tprint(tracef, "Loading data from checkpoint record at LSN " LSN_FMT "\n", + LSN_IN_PARTS(lsn)); + if ((len= translog_read_record_header(lsn, &rec)) == RECHEADER_READ_ERROR || + rec.type != LOGREC_CHECKPOINT) + { + eprint(tracef, "Cannot find checkpoint record at LSN " LSN_FMT, + LSN_IN_PARTS(lsn)); + return LSN_ERROR; + } + + enlarge_buffer(&rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec.lsn, 0, rec.record_length, + log_record_buffer.str, NULL) != + rec.record_length) + { + eprint(tracef, "Failed to read record"); + return LSN_ERROR; + } + + ptr= log_record_buffer.str; + start_address= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + tprint(tracef, "Checkpoint record has start_horizon at " LSN_FMT "\n", + LSN_IN_PARTS(start_address)); + + /* transactions */ + nb_active_transactions= uint2korr(ptr); + ptr+= 2; + tprint(tracef, "%u active transactions\n", nb_active_transactions); + minimum_rec_lsn_of_active_transactions= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + max_long_trid= transid_korr(ptr); + ptr+= TRANSID_SIZE; + + /* + how much brain juice and discussions there was to come to writing this + line. It may make start_address slightly decrease (only by the time it + takes to write one or a few rows, roughly). 
+ */ + tprint(tracef, "Checkpoint record has min_rec_lsn of active transactions" + " at " LSN_FMT "\n", + LSN_IN_PARTS(minimum_rec_lsn_of_active_transactions)); + set_if_smaller(start_address, minimum_rec_lsn_of_active_transactions); + + for (i= 0; i < nb_active_transactions; i++) + { + uint16 sid= uint2korr(ptr); + TrID long_id; + LSN undo_lsn, first_undo_lsn; + ptr+= 2; + long_id= uint6korr(ptr); + ptr+= 6; + DBUG_ASSERT(sid > 0 && long_id > 0); + undo_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + first_undo_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + new_transaction(sid, long_id, undo_lsn, first_undo_lsn); + } + nb_committed_transactions= uint4korr(ptr); + ptr+= 4; + tprint(tracef, "%lu committed transactions\n", + (ulong)nb_committed_transactions); + /* no purging => committed transactions are not important */ + ptr+= (6 + LSN_STORE_SIZE) * nb_committed_transactions; + + /* tables */ + nb_tables= uint4korr(ptr); + ptr+= 4; + tprint(tracef, "%u open tables\n", nb_tables); + for (i= 0; i< nb_tables; i++) + { + char name[FN_REFLEN]; + LSN first_log_write_lsn; + size_t name_len; + uint16 sid= uint2korr(ptr); + ptr+= 2; + DBUG_ASSERT(sid > 0); + first_log_write_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + name_len= strlen((char *)ptr) + 1; + strmake_buf(name, (char *)ptr); + ptr+= name_len; + if (new_table(sid, name, first_log_write_lsn)) + return LSN_ERROR; + } + + /* dirty pages */ + nb_dirty_pages= uint8korr(ptr); + + /* Ensure casts later will not lose significant bits. 
*/ + DBUG_ASSERT((nb_dirty_pages <= SIZE_T_MAX/sizeof(struct st_dirty_page)) && + (nb_dirty_pages <= ULONG_MAX)); + + ptr+= 8; + tprint(tracef, "%lu dirty pages\n", (ulong) nb_dirty_pages); + if (my_hash_init(PSI_INSTRUMENT_ME, &all_dirty_pages, &my_charset_bin, + (ulong)nb_dirty_pages, offsetof(struct st_dirty_page, file_and_page_id), + sizeof(((struct st_dirty_page *)NULL)->file_and_page_id), + NULL, NULL, 0)) + return LSN_ERROR; + dirty_pages_pool= + (struct st_dirty_page *)my_malloc(PSI_INSTRUMENT_ME, (size_t)nb_dirty_pages * + sizeof(struct st_dirty_page), + MYF(MY_WME)); + if (unlikely(dirty_pages_pool == NULL)) + return LSN_ERROR; + next_dirty_page_in_pool= dirty_pages_pool; + minimum_rec_lsn_of_dirty_pages= LSN_MAX; + if (maria_recovery_verbose) + tprint(tracef, "Table_id Is_index Page_id Rec_lsn\n"); + for (i= 0; i < nb_dirty_pages ; i++) + { + pgcache_page_no_t page_id; + LSN rec_lsn; + uint32 is_index; + uint16 table_id= uint2korr(ptr); + ptr+= 2; + is_index= ptr[0]; + ptr++; + page_id= page_korr(ptr); + ptr+= PAGE_STORE_SIZE; + rec_lsn= lsn_korr(ptr); + ptr+= LSN_STORE_SIZE; + if (new_page((is_index << 16) | table_id, + page_id, rec_lsn, next_dirty_page_in_pool++)) + return LSN_ERROR; + if (maria_recovery_verbose) + tprint(tracef, "%8u %8u %12lu " LSN_FMT "\n", (uint) table_id, + (uint) is_index, (ulong) page_id, LSN_IN_PARTS(rec_lsn)); + set_if_smaller(minimum_rec_lsn_of_dirty_pages, rec_lsn); + } + /* after that, there will be no insert/delete into the hash */ + /* + sanity check on record (did we screw up with all those "ptr+=", did the + checkpoint write code and checkpoint read code go out of sync?). + */ + if (ptr != (log_record_buffer.str + log_record_buffer.length)) + { + eprint(tracef, "checkpoint record corrupted\n"); + return LSN_ERROR; + } + + /* + start_address is now from where the dirty pages list can be ignored. + Find LSN higher or equal to this TRANSLOG_ADDRESS, suitable for + translog_read_record() functions. 
+ */ + start_address= checkpoint_start= + translog_next_LSN(start_address, LSN_IMPOSSIBLE); + tprint(tracef, "Checkpoint record start_horizon now adjusted to" + " LSN " LSN_FMT "\n", LSN_IN_PARTS(start_address)); + if (checkpoint_start == LSN_IMPOSSIBLE) + { + /* + There must be a problem, as our checkpoint record exists and is >= the + address which is stored in its first bytes, which is >= start_address. + */ + return LSN_ERROR; + } + /* now, where the REDO phase should start reading log: */ + tprint(tracef, "Checkpoint has min_rec_lsn of dirty pages at" + " LSN " LSN_FMT "\n", LSN_IN_PARTS(minimum_rec_lsn_of_dirty_pages)); + set_if_smaller(start_address, minimum_rec_lsn_of_dirty_pages); + DBUG_PRINT("info", + ("checkpoint_start: " LSN_FMT " start_address: " LSN_FMT, + LSN_IN_PARTS(checkpoint_start), LSN_IN_PARTS(start_address))); + return start_address; +} + + +static int new_page(uint32 fileid, pgcache_page_no_t pageid, LSN rec_lsn, + struct st_dirty_page *dirty_page) +{ + /* serves as hash key */ + dirty_page->file_and_page_id= (((uint64)fileid) << 40) | pageid; + dirty_page->rec_lsn= rec_lsn; + return my_hash_insert(&all_dirty_pages, (uchar *)dirty_page); +} + + +static int close_all_tables(void) +{ + int error= 0; + uint count= 0; + LIST *list_element, *next_open; + MARIA_HA *info; + TRANSLOG_ADDRESS addr; + DBUG_ENTER("close_all_tables"); + + mysql_mutex_lock(&THR_LOCK_maria); + if (maria_open_list == NULL) + goto end; + tprint(tracef, "Closing all tables\n"); + if (tracef != stdout) + { + if (recovery_message_printed == REC_MSG_NONE) + print_preamble(); + for (count= 0, list_element= maria_open_list ; + list_element ; count++, (list_element= list_element->next)) + ; + fprintf(stderr, "tables to flush:"); + recovery_message_printed= REC_MSG_FLUSH; + } + /* + Since the end of end_of_redo_phase(), we may have written new records + (if UNDO phase ran) and thus the state is newer than at + end_of_redo_phase(), we need to bump is_of_horizon again. 
+ */ + addr= translog_get_horizon(); + for (list_element= maria_open_list ; ; list_element= next_open) + { + if (recovery_message_printed == REC_MSG_FLUSH) + { + fprintf(stderr, " %u", count--); + fflush(stderr); + } + if (list_element == NULL) + break; + next_open= list_element->next; + info= (MARIA_HA*)list_element->data; + mysql_mutex_unlock(&THR_LOCK_maria); /* ok, UNDO phase not online yet */ + /* + Tables which we see here are exactly those which were open at time of + crash. They might have open_count>0 as Checkpoint maybe flushed their + state while they were used. As Recovery corrected them, don't alarm the + user, don't ask for a table check: + */ + if (info->s->state.open_count != 0) + { + /* let maria_close() mark the table properly closed */ + info->s->state.open_count= 1; + info->s->global_changed= 1; + info->s->changed= 1; + } + prepare_table_for_close(info, addr); + error|= maria_close(info); + mysql_mutex_lock(&THR_LOCK_maria); + + /* Force a crash to test recovery of recovery */ + if (maria_recovery_force_crash_counter) + { + DBUG_ASSERT(--maria_recovery_force_crash_counter > 0); + } + } +end: + if (recovery_message_printed == REC_MSG_FLUSH) + { + fputc('\n', stderr); + fflush(stderr); + } + mysql_mutex_unlock(&THR_LOCK_maria); + DBUG_RETURN(error); +} + + +/** + @brief Close all table instances with a certain name which are present in + all_tables. 
+ + @param name Name of table + @param addr Log address passed to prepare_table_for_close() +*/ + +static my_bool close_one_table(const char *name, TRANSLOG_ADDRESS addr) +{ + my_bool res= 0; + /* There are no other threads using the tables, so we don't need any locks */ + struct st_table_for_recovery *internal_table, *end; + for (internal_table= all_tables, end= internal_table + SHARE_ID_MAX + 1; + internal_table < end ; + internal_table++) + { + MARIA_HA *info= internal_table->info; + if ((info != NULL) && !strcmp(info->s->open_file_name.str, name)) + { + prepare_table_for_close(info, addr); + if (maria_close(info)) + res= 1; + internal_table->info= NULL; + } + } + return res; +} + + +/** + Temporarily disables logging for this table. + + If that makes the log incomplete, writes a LOGREC_INCOMPLETE_LOG to the log + to warn log readers. + + @param info table + @param log_incomplete if that disabling makes the log incomplete + + @note for example in the REDO phase we disable logging but that does not + make the log incomplete. 
+*/ + +void _ma_tmp_disable_logging_for_table(MARIA_HA *info, + my_bool log_incomplete) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_tmp_disable_logging_for_table"); + + /* + We have to ensure that bitmap is flushed, as it's checking + that share->now_transactional is set + */ + if (share->now_transactional && share->data_file_type == BLOCK_RECORD) + _ma_bitmap_flush_all(share); + + if (log_incomplete) + { + uchar log_data[FILEID_STORE_SIZE]; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + LSN lsn; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + translog_write_record(&lsn, LOGREC_INCOMPLETE_LOG, + &dummy_transaction_object, info, + (translog_size_t) sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL); + } + + /* if we disabled before writing the record, record wouldn't reach log */ + share->now_transactional= FALSE; + + /* + Reset state pointers. This is needed as in ALTER table we may do + commit followed by _ma_renable_logging_for_table and then + info->state may point to a state that was deleted by + _ma_trnman_end_trans_hook() + */ + share->state.no_logging= *info->state; + info->state= &share->state.no_logging; + info->switched_transactional= TRUE; + + /* + Some code in ma_blockrec.c assumes a trn even if !now_transactional but in + this case it only reads trn->rec_lsn, which has to be LSN_IMPOSSIBLE and + should be now. info->trn may be NULL in maria_chk. 
+ */ + if (info->trn == NULL) + { + info->trn= &dummy_transaction_object; + info->trn_next= 0; + info->trn_prev= 0; + } + + DBUG_ASSERT(info->trn->rec_lsn == LSN_IMPOSSIBLE); + share->page_type= PAGECACHE_PLAIN_PAGE; + /* Functions below will pick up now_transactional and change callbacks */ + _ma_set_data_pagecache_callbacks(&info->dfile, share); + _ma_set_index_pagecache_callbacks(&share->kfile, share); + _ma_bitmap_set_pagecache_callbacks(&share->bitmap.file, share); + DBUG_VOID_RETURN; +} + + +/** + Re-enables logging for a table which had it temporarily disabled. + + Only the thread which disabled logging is allowed to reenable it. Indeed, + re-enabling logging affects all open instances, one must have exclusive + access to the table to do that. In practice, the one which disables has + such access. + + @param info table + @param flush_pages if function needs to flush pages first +*/ + +my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_reenable_logging_for_table"); + + if (share->now_transactional == share->base.born_transactional || + !info->switched_transactional) + { + info->switched_transactional= FALSE; + DBUG_RETURN(0); + } + info->switched_transactional= FALSE; + + if ((share->now_transactional= share->base.born_transactional)) + { + share->page_type= PAGECACHE_LSN_PAGE; + + /* + Copy state information that where updated while the table was used + in not transactional mode + */ + _ma_copy_nontrans_state_information(info); + _ma_reset_history(info->s); + + /* Reset state to point to state.common, as on open() */ + info->state= &share->state.common; + *info->state= share->state.state; + + if (flush_pages) + { + /* Ensure that recover is not executing any redo before this */ + if (!maria_in_recovery) + { + if (share->id != 0) + { + mysql_mutex_lock(&share->intern_lock); + translog_deassign_id_from_share(share); + mysql_mutex_unlock(&share->intern_lock); + } + 
share->state.is_of_horizon= share->state.create_rename_lsn= + share->state.skip_redo_lsn= translog_get_horizon(); + } + /* + We are going to change callbacks; if a page is flushed at this moment + this can cause race conditions, that's one reason to flush pages + now. Other reasons: a checkpoint could be running and miss pages; the + pages have type PAGECACHE_PLAIN_PAGE which should not remain. As + there are no REDOs for pages, them, bitmaps and the state also have to + be flushed and synced. + */ + if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE) || + _ma_state_info_write(share, + MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET | + MA_STATE_INFO_WRITE_LOCK) || + _ma_sync_table_files(info)) + DBUG_RETURN(1); + } + else if (!maria_in_recovery) + { + /* + Except in Recovery, we mustn't leave dirty pages (see comments above). + Note that this does not verify that the state was flushed, but hey. + */ + pagecache_file_no_dirty_page(share->pagecache, &info->dfile); + pagecache_file_no_dirty_page(share->pagecache, &share->kfile); + } + _ma_set_data_pagecache_callbacks(&info->dfile, share); + _ma_set_index_pagecache_callbacks(&share->kfile, share); + _ma_bitmap_set_pagecache_callbacks(&share->bitmap.file, share); + /* + info->trn was not changed in the disable/enable combo, so that it's + still usable in this kind of combination: + external_lock; + start_bulk_insert; # table is empty, disables logging + end_bulk_insert; # enables logging + start_bulk_insert; # table is not empty, logging stays + # so rows insertion needs the real trn. + as happens during row-based replication on the slave. 
+ */ + } + DBUG_RETURN(0); +} + + +static void print_redo_phase_progress(TRANSLOG_ADDRESS addr) +{ + static uint end_logno= FILENO_IMPOSSIBLE, percentage_printed= 0; + static ulong end_offset; + static ulonglong initial_remainder= ~(ulonglong) 0; + + uint cur_logno; + ulong cur_offset; + ulonglong local_remainder; + uint percentage_done; + + if (tracef == stdout) + return; + if (recovery_message_printed == REC_MSG_NONE) + { + print_preamble(); + fprintf(stderr, "recovered pages: 0%%"); + fflush(stderr); + procent_printed= 1; + recovery_message_printed= REC_MSG_REDO; + } + if (end_logno == FILENO_IMPOSSIBLE) + { + LSN end_addr= translog_get_horizon(); + end_logno= LSN_FILE_NO(end_addr); + end_offset= LSN_OFFSET(end_addr); + } + cur_logno= LSN_FILE_NO(addr); + cur_offset= LSN_OFFSET(addr); + local_remainder= (cur_logno == end_logno) ? (end_offset - cur_offset) : + (((longlong)log_file_size) - cur_offset + + MY_MAX(end_logno - cur_logno - 1, 0) * ((longlong)log_file_size) + + end_offset); + if (initial_remainder == (ulonglong)(-1)) + initial_remainder= local_remainder; + percentage_done= (uint) ((initial_remainder - local_remainder) * 100ULL / + initial_remainder); + if ((percentage_done - percentage_printed) >= 10) + { + percentage_printed= percentage_done; + fprintf(stderr, " %u%%", percentage_done); + fflush(stderr); + procent_printed= 1; + } +} + + +#ifdef MARIA_EXTERNAL_LOCKING +#error Marias Checkpoint and Recovery are really not ready for it +#endif + +/* +Recovery of the state : how it works +===================================== + +Here we ignore Checkpoints for a start. + +The state (MARIA_HA::MARIA_SHARE::MARIA_STATE_INFO) is updated in +memory frequently (at least at every row write/update/delete) but goes +to disk at few moments: maria_close() when closing the last open +instance, and a few rare places like CHECK/REPAIR/ALTER +(non-transactional tables also do it at maria_lock_database() but we +needn't cover them here). 
+ +In case of crash, state on disk is likely to be older than what it was +in memory, the REDO phase needs to recreate the state as it was in +memory at the time of crash. When we say Recovery here we will always +mean "REDO phase". + +For example MARIA_STATUS_INFO::records (count of records). It is updated at +the end of every row write/update/delete/delete_all. When Recovery sees the +sign of such row operation (UNDO or REDO), it may need to update the records' +count if that count does not reflect that operation (is older). How to know +the age of the state compared to the log record: every time the state +goes to disk at runtime, its member "is_of_horizon" is updated to the +current end-of-log horizon. So Recovery just needs to compare is_of_horizon +and the record's LSN to know if it should modify "records". + +Other operations like ALTER TABLE DISABLE KEYS update the state but +don't write log records, thus the REDO phase cannot repeat their +effect on the state in case of crash. But we make them sync the state +as soon as they have finished. This reduces the window for a problem. + +It looks like only one thread at a time updates the state in memory or +on disk. We assume that the upper level (normally MySQL) has protection +against issuing HA_EXTRA_(FORCE_REOPEN|PREPARE_FOR_RENAME) so that these +are not issued while there are any running transactions on the given table. +If this is not done, we may write a corrupted state to disk. + +With checkpoints +================ + +Checkpoint module needs to read the state in memory and write it to +disk. This may happen while some other thread is modifying the state +in memory or on disk. Checkpoint thus may be reading changing data, it +needs a mutex to not have it corrupted, and concurrent modifiers of +the state need that mutex too for the same reason. +"records" is modified for every row write/update/delete, we don't want +to add a mutex lock/unlock there. 
So we re-use the mutex lock/unlock +which is already present in these moments, namely the log's mutex which is +taken when UNDO_ROW_INSERT|UPDATE|DELETE is written: we update "records" in +under-log-mutex hooks when writing these records (thus "records" is +not updated at the end of maria_write/update/delete() anymore). +Thus Checkpoint takes the log's lock and can read "records" from +memory and write it to disk and release log's lock. +We however want to avoid having the disk write under the log's +lock. So it has to be under another mutex, natural choice is +intern_lock (as Checkpoint needs it anyway to read MARIA_SHARE::kfile, +and as maria_close() takes it too). All state writes to disk are +changed to be protected with intern_lock. +So Checkpoint takes intern_lock, log's lock, reads "records" from +memory, releases log's lock, updates is_of_horizon and writes "records" to +disk, releases intern_lock. +In practice, not only "records" needs to be written but the full +state. So, Checkpoint reads the full state from memory. Some other +thread may at this moment be modifying in memory some pieces of the +state which are not protected by the log's lock (see ma_extra.c +HA_EXTRA_NO_KEYS), and Checkpoint would be reading a corrupted state +from memory; to guard against that we extend the intern_lock-zone to +changes done to the state in memory by HA_EXTRA_NO_KEYS et al, and +also any change made in memory to create_rename_lsn/state_is_of_horizon. +Last, we don't want in Checkpoint to do + log lock; read state from memory; release log lock; +for each table, it may hold the log's lock too much in total. +So, we instead do + log lock; read N states from memory; release log lock; +Thus, the sequence above happens outside of any intern_lock. +But this re-introduces the problem that some other thread may be changing the +state in memory and on disk under intern_lock, without log's lock, like +HA_EXTRA_NO_KEYS, while we read the N states. 
However, when Checkpoint later +comes to handling the table under intern_lock, which is serialized with +HA_EXTRA_NO_KEYS, it can see that is_of_horizon is higher than when the state +was read from memory under log's lock, and thus can decide to not flush the +obsolete state it has, knowing that the other thread flushed a more recent +state already. If on the other hand is_of_horizon is not higher, the read +state is current and can be flushed. So we have a per-table sequence: + lock intern_lock; test if is_of_horizon is higher than when we read the state + under log's lock; if no then flush the read state to disk. +*/ + +/* some comments and pseudo-code which we keep for later */ +#if 0 + /* + MikaelR suggests: support checkpoints during REDO phase too: do checkpoint + after a certain number of log records have been executed. This helps + against repeated crashes. Those checkpoints could not be user-requested + (as engine is not communicating during the REDO phase), so they would be + automatic: this changes the original assumption that we don't write to the + log while in the REDO phase, but why not. How often should we checkpoint? + */ + + /* + We want to have two steps: + engine->recover_with_max_memory(); + next_engine->recover_with_max_memory(); + engine->init_with_normal_memory(); + next_engine->init_with_normal_memory(); + So: in recover_with_max_memory() allocate a giant page cache, do REDO + phase, then all page cache is flushed and emptied and freed (only retain + small structures like TM): take full checkpoint, which is useful if + next engine crashes in its recovery the next second. + Destroy all shares (maria_close()), then at init_with_normal_memory() we + do this: + */ + + /**** UNDO PHASE *****/ + + /* + Launch one or more threads to do the background rollback. Don't wait for + them to complete their rollback (background rollback; for debugging, we + can have an option which waits). 
Set a counter (total_of_rollback_threads) + to the number of threads to launch. + + Note that InnoDB's rollback-in-background works as long as InnoDB is the + last engine to recover, otherwise MySQL will refuse new connections until + the last engine has recovered so it's not "background" from the user's + point of view. InnoDB is near top of sys_table_types so all others + (e.g. BDB) recover after it... So it's really "online rollback" only if + InnoDB is the only engine. + */ + + /* wake up delete/update handler */ + /* tell the TM that it can now accept new transactions */ + + /* + mark that checkpoint requests are now allowed. + */ +#endif diff --git a/storage/maria/ma_recovery.h b/storage/maria/ma_recovery.h new file mode 100644 index 00000000..4373ef52 --- /dev/null +++ b/storage/maria/ma_recovery.h @@ -0,0 +1,39 @@ +/* Copyright (C) 2006,2007 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + WL#3072 Maria recovery + First version written by Guilhem Bichot on 2006-04-27. +*/ + +/* This is the interface of this module. 
*/ + +/* Performs recovery of the engine at start */ + +C_MODE_START +enum maria_apply_log_way +{ MARIA_LOG_APPLY, MARIA_LOG_DISPLAY_HEADER, MARIA_LOG_CHECK }; +int maria_recovery_from_log(void); +int maria_apply_log(LSN lsn, LSN lsn_end, LSN lsn_undo_end, + enum maria_apply_log_way apply, + FILE *trace_file, + my_bool skip_DDLs, my_bool take_checkpoints, + uint *warnings_count); +/* Table of tables to recover */ +extern HASH tables_to_redo; +extern ulong maria_recovery_force_crash_counter; +extern ulong recovery_found_crashed_tables; +extern uint skipped_lsn_err_count; +C_MODE_END diff --git a/storage/maria/ma_recovery_util.c b/storage/maria/ma_recovery_util.c new file mode 100644 index 00000000..fe43d812 --- /dev/null +++ b/storage/maria/ma_recovery_util.c @@ -0,0 +1,149 @@ +/* Copyright (C) 2006,2007,2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Q: Why isn't ma_recovery_util.c simply moved to ma_recovery.c ? + + A: ma_recovery.c, because it invokes objects from ma_check.c (like + maria_chk_init()) causes the following problem: + if a source file a.c of a program invokes a function defined in + ma_recovery.c, then a.o depends on ma_recovery.o which depends on + ma_check.o: linker thus brings in ma_check.o. 
That brings in the + dependencies of ma_check.o which are definitions of _ma_check_print_info() + etc; if a.o does not define them then the ones of ha_maria.o are used + i.e. ha_maria.o is linked into the program, and this brings in dependencies + of ha_maria.o on mysqld.o into the program's linking which thus fails, as + the program is not linked with mysqld.o. + Thus, while several functions defined in ma_recovery.c could be useful to + other files, they cannot be used by them. + So we are going to gradually move a great share of ma_recovery.c's exported + functions into the present file, to isolate the problematic components and + avoid the problem. +*/ + +#include "maria_def.h" + +HASH all_dirty_pages; +struct st_dirty_page /* used only in the REDO phase */ +{ + uint64 file_and_page_id; + LSN rec_lsn; +}; +/* + LSN after which dirty pages list does not apply. Can be slightly before + when ma_checkpoint_execute() started. +*/ +LSN checkpoint_start= LSN_IMPOSSIBLE; + +/** @todo looks like duplicate of recovery_message_printed */ +my_bool procent_printed; +FILE *tracef; /**< trace file for debugging */ + +ulong recovery_found_crashed_tables; +uint skipped_lsn_err_count; + +/** @brief Prints to a trace file if it is not NULL */ +void tprint(FILE *trace_file __attribute__ ((unused)), + const char *format __attribute__ ((unused)), ...) +{ + va_list args; +#ifndef DBUG_OFF + { + char buff[1024]; + size_t length; + va_start(args, format); + length= my_vsnprintf(buff, sizeof(buff)-1, format, args); + if (length && buff[length-1] == '\n') + buff[length-1]= 0; /* Don't print end \n */ + DBUG_PRINT("info", ("%s", buff)); + va_end(args); + } +#endif + va_start(args, format); + if (trace_file != NULL) + vfprintf(trace_file, format, args); + va_end(args); +} + + +void eprint(FILE *trace_file __attribute__ ((unused)), + const char *format __attribute__ ((unused)), ...) 
+{ + va_list args; + va_start(args, format); + DBUG_PRINT("error", ("%s", format)); + if (!trace_file) + trace_file= stderr; + + if (procent_printed) + { + procent_printed= 0; + /* In silent mode, print on another line than the 0% 10% 20% line */ + fputc('\n', stderr); + fflush(stderr); + } + vfprintf(trace_file , format, args); + fputc('\n', trace_file); + if (trace_file != stderr) + { + va_start(args, format); + my_printv_error(HA_ERR_INITIALIZATION, format, MYF(0), args); + } + va_end(args); + fflush(trace_file); +} + + +/** + Tells if the dirty pages list found in checkpoint record allows to ignore a + REDO for a certain page. + + @param shortid short id of the table + @param lsn REDO record's LSN + @param page page number + @param index TRUE if index page, FALSE if data page +*/ + +my_bool _ma_redo_not_needed_for_page(uint16 shortid, LSN lsn, + pgcache_page_no_t page, + my_bool index) +{ + if (cmp_translog_addr(lsn, checkpoint_start) < 0) + { + /* + 64-bit key is formed like this: + Most significant byte: 0 if data page, 1 if index page + Next 2 bytes: table's short id + Next 5 bytes: page number + */ + char llbuf[22]; + uint64 file_and_page_id= + (((uint64)((index << 16) | shortid)) << 40) | page; + struct st_dirty_page *dirty_page= (struct st_dirty_page *) + my_hash_search(&all_dirty_pages, + (uchar *)&file_and_page_id, sizeof(file_and_page_id)); + DBUG_PRINT("info", ("page %lld in dirty pages list: %d", + (ulonglong) page, + dirty_page != NULL)); + if ((dirty_page == NULL) || + cmp_translog_addr(lsn, dirty_page->rec_lsn) < 0) + { + tprint(tracef, ", ignoring page %s because of dirty_pages list\n", + llstr((ulonglong) page, llbuf)); + return TRUE; + } + } + return FALSE; +} diff --git a/storage/maria/ma_recovery_util.h b/storage/maria/ma_recovery_util.h new file mode 100644 index 00000000..39c16bc5 --- /dev/null +++ b/storage/maria/ma_recovery_util.h @@ -0,0 +1,42 @@ +/* Copyright (C) 2006,2007,2008 MySQL AB + + This program is free software; you can 
redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +struct st_dirty_page /* used only in the REDO phase */ +{ + uint64 file_and_page_id; + LSN rec_lsn; +}; +extern HASH all_dirty_pages; +/* + LSN after which dirty pages list does not apply. Can be slightly before + when ma_checkpoint_execute() started. +*/ +extern LSN checkpoint_start; +extern my_bool procent_printed; +extern FILE *tracef; + + +my_bool _ma_redo_not_needed_for_page(uint16 shortid, LSN lsn, + pgcache_page_no_t page, + my_bool index); +#ifdef WAITING_FOR_BUGFIX_TO_VSPRINTF +void tprint(FILE *trace_file, const char *format, ...) + ATTRIBUTE_FORMAT(printf, 2, 3); +void eprint(FILE *trace_file, const char *format, ...) + ATTRIBUTE_FORMAT(printf, 2, 3); +#else +void tprint(FILE *trace_file, const char *format, ...); +void eprint(FILE *trace_file, const char *format, ...); +#endif diff --git a/storage/maria/ma_rename.c b/storage/maria/ma_rename.c new file mode 100644 index 00000000..ade6e52f --- /dev/null +++ b/storage/maria/ma_rename.c @@ -0,0 +1,165 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Rename a table +*/ + +#include "ma_fulltext.h" +#include "trnman_public.h" + +/** + @brief renames a table + + @param old_name current name of table + @param new_name table should be renamed to this name + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int maria_rename(const char *old_name, const char *new_name) +{ + char from[FN_REFLEN],to[FN_REFLEN]; + int data_file_rename_error= 0, index_file_rename_error= 0; +#ifdef USE_RAID + uint raid_type=0,raid_chunks=0; +#endif + MARIA_HA *info; + MARIA_SHARE *share; + myf sync_dir= 0; + my_bool ddl_recovery= 0; + DBUG_ENTER("maria_rename"); + +#ifdef EXTRA_DEBUG + _ma_check_table_is_closed(old_name,"rename old_table"); + _ma_check_table_is_closed(new_name,"rename new table2"); +#endif + /** @todo LOCK take X-lock on table */ + if (!(info= maria_open(old_name, O_RDWR, HA_OPEN_FOR_REPAIR, 0))) + { + int error= my_errno; + /* + Check if we are in recovery from a rename that failed in the middle + and we are now renaming things back. 
+ */ + if (error == ENOENT) + { + char *index_file= from; + char *data_file= to; + fn_format(index_file, old_name, "", MARIA_NAME_IEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + fn_format(data_file, old_name, "", MARIA_NAME_DEXT, + MY_UNPACK_FILENAME | MY_APPEND_EXT); + if (!access(data_file, F_OK) && access(index_file, F_OK)) + { + ddl_recovery= 1; + goto forced_rename; + } + } + DBUG_RETURN(error); + } + share= info->s; +#ifdef USE_RAID + raid_type = share->base.raid_type; + raid_chunks = share->base.raid_chunks; +#endif + + /* + the renaming of an internal table to the final table (like in ALTER TABLE) + is the moment when this table receives its correct create_rename_lsn and + this is important; make sure transactionality has been re-enabled. + */ + DBUG_ASSERT(share->now_transactional == share->base.born_transactional); + if (share->now_transactional && !share->temporary && !maria_in_recovery) + { + LSN lsn; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + size_t old_name_len= strlen(old_name)+1, new_name_len= strlen(new_name)+1; + sync_dir= MY_SYNC_DIR; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar*)old_name; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= old_name_len; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (uchar*)new_name; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= new_name_len; + /* + For this record to be of any use for Recovery, we need the upper + MySQL layer to be crash-safe, which it is not now (that would require + work using the ddl_log of sql/sql_table.cc); when it is, we should + reconsider the moment of writing this log record (before or after op, + under THR_LOCK_maria or not...), how to use it in Recovery. + For now it can serve to apply logs to a backup so we sync it. 
+ */ + if (unlikely(translog_write_record(&lsn, LOGREC_REDO_RENAME_TABLE, + &dummy_transaction_object, NULL, + (translog_size_t)(old_name_len + new_name_len), + sizeof(log_array)/sizeof(log_array[0]), + log_array, NULL, NULL) || + translog_flush(lsn))) + { + maria_close(info); + DBUG_RETURN(1); + } + /* + store LSN into file, needed for Recovery to not be confused if a + RENAME happened (applying REDOs to the wrong table). + */ + if (_ma_update_state_lsns(share, lsn, share->state.create_trid, TRUE, + TRUE)) + { + maria_close(info); + DBUG_RETURN(1); + } + } + + _ma_reset_state(info); + maria_close(info); + +forced_rename: + /* + This code is written so that it should be possible to re-run a + failed rename (even if there is a server crash in between the + renames) and complete it. + */ + fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + if (mysql_file_rename_with_symlink(key_file_kfile, from, to, + MYF(MY_WME | sync_dir))) + index_file_rename_error= my_errno; + fn_format(from,old_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + fn_format(to,new_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT); + if (mysql_file_rename_with_symlink(key_file_dfile, from, to, + MYF(MY_WME | sync_dir))) + data_file_rename_error= my_errno; + if (data_file_rename_error && data_file_rename_error != ENOENT && + !ddl_recovery) + { + /* + Now we have a renamed index file and a non-renamed data file, try to + undo a successful rename of the index file. + */ + if (!index_file_rename_error) + { + fn_format(from, old_name, "", MARIA_NAME_IEXT, + MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT)); + fn_format(to, new_name, "", MARIA_NAME_IEXT, + MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT)); + mysql_file_rename_with_symlink(key_file_kfile, to, from, + MYF(MY_WME | sync_dir)); + } + } + DBUG_RETURN(data_file_rename_error ? 
data_file_rename_error: + index_file_rename_error); +} diff --git a/storage/maria/ma_rfirst.c b/storage/maria/ma_rfirst.c new file mode 100644 index 00000000..44d19485 --- /dev/null +++ b/storage/maria/ma_rfirst.c @@ -0,0 +1,26 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" + + /* Read first row through a specific key */ + +int maria_rfirst(MARIA_HA *info, uchar *buf, int inx) +{ + DBUG_ENTER("maria_rfirst"); + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->update|= HA_STATE_PREV_FOUND; + DBUG_RETURN(maria_rnext(info,buf,inx)); +} /* maria_rfirst */ diff --git a/storage/maria/ma_rkey.c b/storage/maria/ma_rkey.c new file mode 100644 index 00000000..8cd82e1c --- /dev/null +++ b/storage/maria/ma_rkey.c @@ -0,0 +1,266 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Read record based on a key */ + +#include "maria_def.h" +#include "ma_rt_index.h" + +/** + Read a record using key + + @note + Ordinary search_flag is 0 ; Give error if no record with key +*/ + +int maria_rkey(MARIA_HA *info, uchar *buf, int inx, const uchar *key_data, + key_part_map keypart_map, enum ha_rkey_function search_flag) +{ + uchar *key_buff; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + HA_KEYSEG *last_used_keyseg; + uint32 nextflag; + MARIA_KEY key; + check_result_t check= CHECK_POS; + DBUG_ENTER("maria_rkey"); + DBUG_PRINT("enter", ("base:%p buf:%p inx: %d search_flag: %d", + info, buf, inx, search_flag)); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(my_errno); + + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->last_key_func= search_flag; + keyinfo= info->last_key.keyinfo; + + key_buff= info->lastkey_buff2; + + if (info->once_flags & USE_PACKED_KEYS) + { + info->once_flags&= ~USE_PACKED_KEYS; /* Reset flag */ + /* + key is already packed!; This happens when we are using a MERGE TABLE + In this key 'key_part_map' is the length of the key ! + */ + if (key_buff != key_data) + bmove(key_buff, key_data, keypart_map); + key.data= key_buff; + key.keyinfo= keyinfo; + key.data_length= keypart_map; + key.ref_length= 0; + key.flag= 0; + + last_used_keyseg= keyinfo->seg + info->last_used_keyseg; + } + else + { + DBUG_ASSERT(keypart_map); + /* Save the packed key for later use in the second buffer of lastkey. */ + _ma_pack_key(info, &key, inx, key_buff, key_data, + keypart_map, &last_used_keyseg); + /* Save packed_key_length for use by the MERGE engine. 
*/ + info->pack_key_length= key.data_length; + info->last_used_keyseg= (uint16) (last_used_keyseg - + keyinfo->seg); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, &key);); + } + + if (fast_ma_readinfo(info)) + goto err; + if (share->lock_key_trees) + mysql_rwlock_rdlock(&keyinfo->root_lock); + + nextflag= maria_read_vec[search_flag] | key.flag; + if (search_flag != HA_READ_KEY_EXACT) + { + /* Assume we will get a read next/previous call after this one */ + nextflag|= SEARCH_SAVE_BUFF; + } + switch (keyinfo->key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + if (maria_rtree_find_first(info, &key, nextflag) < 0) + { + _ma_set_fatal_error(info, HA_ERR_CRASHED); + info->cur_row.lastpos= HA_OFFSET_ERROR; + } + break; +#endif + case HA_KEY_ALG_BTREE: + default: + if (!_ma_search(info, &key, nextflag, info->s->state.key_root[inx])) + { + MARIA_KEY lastkey; + /* + Found a key, but it might not be usable. We cannot use rows that + are inserted by other threads after we got our table lock + ("concurrent inserts"). The record may not even be present yet. + Keys are inserted into the index(es) before the record is + inserted into the data file. + + If index condition is present, it must be either satisfied or + not satisfied with an out-of-range condition. + */ + if ((*share->row_is_visible)(info) && + ((check= ma_check_index_cond(info, inx, buf)) != CHECK_NEG)) + break; + + /* The key references a concurrently inserted record. */ + if (search_flag == HA_READ_KEY_EXACT && + last_used_keyseg == keyinfo->seg + keyinfo->keysegs) + { + /* Simply ignore the key if it matches exactly. (Bug #29838) */ + my_errno= HA_ERR_KEY_NOT_FOUND; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + } + + lastkey.keyinfo= keyinfo; + lastkey.data= info->lastkey_buff; + do + { + uint not_used[2]; + /* + Skip rows that are inserted by other threads since we got + a lock. 
Note that this can only happen if we are not + searching after a full length exact key, because the keys + are sorted according to position. + */ + lastkey.data_length= info->last_key.data_length; + lastkey.ref_length= info->last_key.ref_length; + lastkey.flag= info->last_key.flag; + if (_ma_search_next(info, &lastkey, maria_readnext_vec[search_flag], + info->s->state.key_root[inx])) + break; /* purecov: inspected */ + + /* + If we are at the last key on the key page, allow writers to + access the index. + */ + if (info->int_keypos >= info->int_maxpos && + ma_yield_and_check_if_killed(info, inx)) + { + DBUG_ASSERT(info->cur_row.lastpos == HA_OFFSET_ERROR); + break; + } + + /* + Check that the found key does still match the search. + _ma_search_next() delivers the next key regardless of its + value. + */ + if (!(nextflag & (SEARCH_BIGGER | SEARCH_SMALLER)) && + ha_key_cmp(keyinfo->seg, info->last_key.data, key.data, + key.data_length, SEARCH_FIND, not_used)) + { + /* purecov: begin inspected */ + my_errno= HA_ERR_KEY_NOT_FOUND; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + /* purecov: end */ + } + + } while (!(*share->row_is_visible)(info) || + ((check= ma_check_index_cond(info, inx, buf)) == 0)); + } + else + { + DBUG_ASSERT(info->cur_row.lastpos); + } + } + if (share->lock_key_trees) + mysql_rwlock_unlock(&keyinfo->root_lock); + + if (info->cur_row.lastpos == HA_OFFSET_ERROR) + { + if (check == CHECK_OUT_OF_RANGE) + { + /* We don't want HA_ERR_END_OF_FILE in this particular case */ + my_errno= HA_ERR_KEY_NOT_FOUND; + } + fast_ma_writeinfo(info); + goto err; + } + + /* Calculate length of the found key; Used by maria_rnext_same */ + if ((keyinfo->flag & HA_VAR_LENGTH_KEY)) + info->last_rkey_length= _ma_keylength_part(keyinfo, info->lastkey_buff, + last_used_keyseg); + else + info->last_rkey_length= key.data_length; + + /* Check if we don't want to have record back, only error message */ + if (!buf) + { + fast_ma_writeinfo(info); + DBUG_RETURN(0); + } + if 
(!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + + info->cur_row.lastpos= HA_OFFSET_ERROR; /* Didn't find row */ + +err: + /* Store last used key as a base for read next */ + memcpy(info->last_key.data, key_buff, key.data_length); + info->last_key.data_length= key.data_length; + info->last_key.ref_length= info->s->base.rec_reflength; + info->last_key.flag= 0; + /* Create key with rowid 0 */ + bzero((char*) info->last_key.data + info->last_key.data_length, + info->s->base.rec_reflength); + + if (search_flag == HA_READ_AFTER_KEY) + info->update|=HA_STATE_NEXT_FOUND; /* Previous gives last row */ + DBUG_RETURN(my_errno); +} /* maria_rkey */ + + +/* + Yield to possible other writers during an index scan. + Check also if we got killed by the user and if yes, set my_errno to + HA_ERR_ABORTED_BY_USER + + return 0 ok + return 1 Query has been requested to be killed +*/ + +my_bool ma_yield_and_check_if_killed(MARIA_HA *info, int inx) +{ + MARIA_SHARE *share; + if (ma_killed(info)) + { + /* purecov: begin tested */ + /* Mark that we don't have an active row */ + info->cur_row.lastpos= HA_OFFSET_ERROR; + /* Set error that we were aborted by kill from application */ + my_errno= HA_ERR_ABORTED_BY_USER; + return 1; + /* purecov: end */ + } + + if ((share= info->s)->lock_key_trees) + { + /* Give writers a chance to access index */ + mysql_rwlock_unlock(&share->keyinfo[inx].root_lock); + mysql_rwlock_rdlock(&share->keyinfo[inx].root_lock); + } + return 0; +} + diff --git a/storage/maria/ma_rlast.c b/storage/maria/ma_rlast.c new file mode 100644 index 00000000..2a74024d --- /dev/null +++ b/storage/maria/ma_rlast.c @@ -0,0 +1,26 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" + + /* Read last row through a specific key. */ + +int maria_rlast(MARIA_HA *info, uchar *buf, int inx) +{ + DBUG_ENTER("maria_rlast"); + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->update|= HA_STATE_NEXT_FOUND; + DBUG_RETURN(maria_rprev(info,buf,inx)); +} /* maria_rlast */ diff --git a/storage/maria/ma_rnext.c b/storage/maria/ma_rnext.c new file mode 100644 index 00000000..6fd6f891 --- /dev/null +++ b/storage/maria/ma_rnext.c @@ -0,0 +1,156 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" + +#include "ma_rt_index.h" + + /* + Read next row with the same key as previous read + One may have done a write, update or delete of the previous row. + NOTE! Even if one changes the previous row, the next read is done + based on the position of the last used key! 
+ */ + +int maria_rnext(MARIA_HA *info, uchar *buf, int inx) +{ + int error,changed; + uint flag; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + check_result_t check= CHECK_POS; + uint update_mask= HA_STATE_NEXT_FOUND; + DBUG_ENTER("maria_rnext"); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(my_errno); + flag=SEARCH_BIGGER; /* Read next */ + if (info->cur_row.lastpos == HA_OFFSET_ERROR && + info->update & HA_STATE_PREV_FOUND) + flag=0; /* Read first */ + + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + keyinfo= share->keyinfo + inx; + if (share->lock_key_trees) + mysql_rwlock_rdlock(&keyinfo->root_lock); + changed= _ma_test_if_changed(info); + if (!flag) + { + switch (keyinfo->key_alg){ +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + error=maria_rtree_get_first(info, inx, + info->last_key.data_length + + info->last_key.ref_length); + + break; +#endif + case HA_KEY_ALG_BTREE: + default: + error= _ma_search_first(info, keyinfo, share->state.key_root[inx]); + break; + } + /* + "search first" failed. This means we have no pivot for + "search next", or in other words MI_INFO::lastkey is + likely uninitialized. + + Normally SQL layer would never request "search next" if + "search first" failed. But HANDLER may do anything. + + As mi_rnext() without preceding mi_rkey()/mi_rfirst() + equals to mi_rfirst(), we must restore original state + as if failing mi_rfirst() was not called. 
+ */ + if (error) + update_mask|= HA_STATE_PREV_FOUND; + } + else + { + switch (keyinfo->key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + /* + Note that rtree doesn't support that the table + may be changed since last call, so we do need + to skip rows inserted by other threads like in btree + */ + error= maria_rtree_get_next(info, inx, info->last_key.data_length + + info->last_key.ref_length); + break; +#endif + case HA_KEY_ALG_BTREE: + default: + if (!changed) + error= _ma_search_next(info, &info->last_key, + flag | info->last_key.flag, + share->state.key_root[inx]); + else + error= _ma_search(info, &info->last_key, flag | info->last_key.flag, + share->state.key_root[inx]); + } + } + + if (!error) + { + while (!(*share->row_is_visible)(info) || + ((check= ma_check_index_cond(info, inx, buf)) == CHECK_NEG)) + { + /* + If we are at the last key on the key page, allow writers to + access the index. + */ + if (info->int_keypos >= info->int_maxpos && + ma_yield_and_check_if_killed(info, inx)) + { + /* my_errno is set by ma_yield_and_check_if_killed() */ + error= 1; + break; + } + + /* Skip rows inserted by other threads since we got a lock */ + if ((error= _ma_search_next(info, &info->last_key, + SEARCH_BIGGER, + share->state.key_root[inx]))) + break; + } + } + if (share->lock_key_trees) + mysql_rwlock_unlock(&keyinfo->root_lock); + + /* Don't clear if database-changed */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->update|= update_mask; + + if (error || check != CHECK_POS) + { + fast_ma_writeinfo(info); + if (my_errno == HA_ERR_KEY_NOT_FOUND) + my_errno= HA_ERR_END_OF_FILE; + } + else if (!buf) + { + fast_ma_writeinfo(info); + DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? 
my_errno : 0); + } + else if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + DBUG_PRINT("error",("Got error: %d, errno: %d",error, my_errno)); + DBUG_RETURN(my_errno); +} /* maria_rnext */ diff --git a/storage/maria/ma_rnext_same.c b/storage/maria/ma_rnext_same.c new file mode 100644 index 00000000..19a950d3 --- /dev/null +++ b/storage/maria/ma_rnext_same.c @@ -0,0 +1,122 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "ma_rt_index.h" + +/* + Read next row with the same key as previous read, but abort if + the key changes. + One may have done a write, update or delete of the previous row. + + NOTE! Even if one changes the previous row, the next read is done + based on the position of the last used key! 
+*/ + +int maria_rnext_same(MARIA_HA *info, uchar *buf) +{ + int error; + uint inx,not_used[2]; + MARIA_KEYDEF *keyinfo; + check_result_t check= CHECK_POS; + DBUG_ENTER("maria_rnext_same"); + + if ((int) (inx= info->lastinx) < 0 || + info->cur_row.lastpos == HA_OFFSET_ERROR) + DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX); + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + + keyinfo= info->s->keyinfo+inx; + if (info->s->lock_key_trees) + mysql_rwlock_rdlock(&keyinfo->root_lock); + + switch (keyinfo->key_alg) { +#ifdef HAVE_RTREE_KEYS + case HA_KEY_ALG_RTREE: + if ((error=maria_rtree_find_next(info,inx, + maria_read_vec[info->last_key_func]))) + { + error=1; + my_errno=HA_ERR_END_OF_FILE; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + } + break; +#endif + case HA_KEY_ALG_BTREE: + default: + if (!(info->update & HA_STATE_RNEXT_SAME)) + { + /* First rnext_same; Store old key */ + memcpy(info->lastkey_buff2, info->last_key.data, + info->last_rkey_length); + } + for (;;) + { + if ((error= _ma_search_next(info, &info->last_key, + SEARCH_BIGGER, + info->s->state.key_root[inx]))) + break; + if (ha_key_cmp(keyinfo->seg, info->last_key.data, + info->lastkey_buff2, + info->last_rkey_length, SEARCH_FIND, + not_used)) + { + error=1; + my_errno=HA_ERR_END_OF_FILE; + info->cur_row.lastpos= HA_OFFSET_ERROR; + break; + } + /* + If we are at the last key on the key page, allow writers to + access the index. 
+ */ + if (info->int_keypos >= info->int_maxpos && + ma_yield_and_check_if_killed(info, inx)) + { + error= 1; + break; + } + /* Skip rows that are inserted by other threads since we got a lock */ + if ((info->s->row_is_visible)(info) && + ((check= ma_check_index_cond(info, inx, buf)) != CHECK_NEG)) + break; + } + } + if (info->s->lock_key_trees) + mysql_rwlock_unlock(&keyinfo->root_lock); + /* Don't clear if database-changed */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->update|= HA_STATE_NEXT_FOUND | HA_STATE_RNEXT_SAME; + + if (error || check != CHECK_POS) + { + fast_ma_writeinfo(info); + if (my_errno == HA_ERR_KEY_NOT_FOUND) + my_errno= HA_ERR_END_OF_FILE; + } + else if (!buf) + { + fast_ma_writeinfo(info); + DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0); + } + else if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + DBUG_RETURN(my_errno); +} /* maria_rnext_same */ diff --git a/storage/maria/ma_rprev.c b/storage/maria/ma_rprev.c new file mode 100644 index 00000000..15010d96 --- /dev/null +++ b/storage/maria/ma_rprev.c @@ -0,0 +1,106 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" + + /* + Read previous row with the same key as previous read + One may have done a write, update or delete of the previous row. + NOTE! Even if one changes the previous row, the next read is done + based on the position of the last used key! + */ + +int maria_rprev(MARIA_HA *info, uchar *buf, int inx) +{ + int error,changed; + register uint flag; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + check_result_t check= CHECK_POS; + DBUG_ENTER("maria_rprev"); + + if ((inx = _ma_check_index(info,inx)) < 0) + DBUG_RETURN(my_errno); + flag=SEARCH_SMALLER; /* Read previous */ + if (info->cur_row.lastpos == HA_OFFSET_ERROR && + info->update & HA_STATE_NEXT_FOUND) + flag=0; /* Read last */ + + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + keyinfo= share->keyinfo + inx; + changed= _ma_test_if_changed(info); + if (share->lock_key_trees) + mysql_rwlock_rdlock(&keyinfo->root_lock); + if (!flag) + error= _ma_search_last(info, keyinfo, share->state.key_root[inx]); + else if (!changed) + error= _ma_search_next(info, &info->last_key, + flag | info->last_key.flag, + share->state.key_root[inx]); + else + error= _ma_search(info, &info->last_key, flag | info->last_key.flag, + share->state.key_root[inx]); + + if (!error) + { + my_off_t cur_keypage= info->last_keypage; + while (!(*share->row_is_visible)(info) || + ((check= ma_check_index_cond(info, inx, buf)) == CHECK_NEG)) + { + /* + If we are at the last (i.e. first?) key on the key page, + allow writers to access the index. 
+ */ + if (info->last_keypage != cur_keypage) + { + cur_keypage= info->last_keypage; + if (ma_yield_and_check_if_killed(info, inx)) + { + error= 1; + break; + } + } + + /* Skip rows that are inserted by other threads since we got a lock */ + if ((error= _ma_search_next(info, &info->last_key, + SEARCH_SMALLER, + share->state.key_root[inx]))) + break; + } + } + if (share->lock_key_trees) + mysql_rwlock_unlock(&keyinfo->root_lock); + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + info->update|= HA_STATE_PREV_FOUND; + + if (error || check != CHECK_POS) + { + fast_ma_writeinfo(info); + if (my_errno == HA_ERR_KEY_NOT_FOUND) + my_errno= HA_ERR_END_OF_FILE; + } + else if (!buf) + { + fast_ma_writeinfo(info); + DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0); + } + else if (!(*info->read_record)(info, buf, info->cur_row.lastpos)) + { + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + DBUG_RETURN(my_errno); +} /* maria_rprev */ diff --git a/storage/maria/ma_rrnd.c b/storage/maria/ma_rrnd.c new file mode 100644 index 00000000..b2039e01 --- /dev/null +++ b/storage/maria/ma_rrnd.c @@ -0,0 +1,46 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Read a record with random-access. The position to the record must + get by MARIA_HA. 
The next record can be read with pos= MARIA_POS_ERROR */ + + +#include "maria_def.h" + +/* + Read a row based on position. + + RETURN + 0 Ok. + HA_ERR_RECORD_DELETED Record is deleted. + HA_ERR_END_OF_FILE EOF. +*/ + +int maria_rrnd(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos) +{ + int ret; + DBUG_ENTER("maria_rrnd"); + + DBUG_ASSERT(filepos != HA_OFFSET_ERROR); + + /* Init all but update-flag */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + + info->cur_row.lastpos= filepos; /* Remember for update */ + ret= (*info->s->read_record)(info, buf, filepos); + DBUG_RETURN(ret); +} diff --git a/storage/maria/ma_rsame.c b/storage/maria/ma_rsame.c new file mode 100644 index 00000000..ae9545b9 --- /dev/null +++ b/storage/maria/ma_rsame.c @@ -0,0 +1,78 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" + +/** + Find current row with read on position or read on key + + @notes + If inx >= 0 find record using key else re-read row on last position + + @warning + This function is not row version safe. 
+ This is not critical as this function is not used by MySQL + + @return + @retval 0 Ok + @retval HA_ERR_KEY_NOT_FOUND Row is deleted + @retval HA_ERR_END_OF_FILE End of file + @retval HA_ERR_WRONG_INDEX Wrong inx argument +*/ + + +int maria_rsame(MARIA_HA *info, uchar *record, int inx) +{ + DBUG_ENTER("maria_rsame"); + + if (inx >= 0 && _ma_check_index(info, inx) < 0) + { + DBUG_PRINT("error", ("wrong index usage")); + DBUG_RETURN(my_errno); + } + if (info->cur_row.lastpos == HA_OFFSET_ERROR || + info->update & HA_STATE_DELETED) + { + DBUG_PRINT("error", ("no current record")); + DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); /* No current record */ + } + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + + /* Read row from data file */ + if (fast_ma_readinfo(info)) + DBUG_RETURN(my_errno); + + if (inx >= 0) + { + MARIA_KEYDEF *keyinfo= info->last_key.keyinfo; + (*keyinfo->make_key)(info, &info->last_key, (uint) inx, + info->lastkey_buff, record, + info->cur_row.lastpos, + info->cur_row.trid); + if (info->s->lock_key_trees) + mysql_rwlock_rdlock(&keyinfo->root_lock); + _ma_search(info, &info->last_key, SEARCH_SAME, + info->s->state.key_root[inx]); + if (info->s->lock_key_trees) + mysql_rwlock_unlock(&keyinfo->root_lock); + } + + if (!(*info->read_record)(info, record, info->cur_row.lastpos)) + DBUG_RETURN(0); + if (my_errno == HA_ERR_RECORD_DELETED) + my_errno=HA_ERR_KEY_NOT_FOUND; + DBUG_PRINT("error", ("my_errno: %d", my_errno)); + DBUG_RETURN(my_errno); +} /* maria_rsame */ diff --git a/storage/maria/ma_rsamepos.c b/storage/maria/ma_rsamepos.c new file mode 100644 index 00000000..092bb50d --- /dev/null +++ b/storage/maria/ma_rsamepos.c @@ -0,0 +1,63 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* read record through position and fix key-position */ +/* As maria_rsame but supply a position */ + +#include "maria_def.h" + + +/* + Read row based on position + + @param inx If inx >= 0, position the given index on the found row + + @return + @retval 0 Ok + @retval HA_ERR_KEY_NOT_FOUND Row is deleted + @retval HA_ERR_END_OF_FILE End of file +*/ + +int maria_rsame_with_pos(MARIA_HA *info, uchar *record, int inx, + MARIA_RECORD_POS filepos) +{ + DBUG_ENTER("maria_rsame_with_pos"); + DBUG_PRINT("enter",("index: %d filepos: %ld", inx, (long) filepos)); + + if (inx < -1 || + (inx >= 0 && ! 
maria_is_key_active(info->s->state.key_map, inx))) + { + DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX); + } + + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + if ((*info->s->read_record)(info, record, filepos)) + { + if (my_errno == HA_ERR_RECORD_DELETED) + my_errno=HA_ERR_KEY_NOT_FOUND; + DBUG_RETURN(my_errno); + } + info->cur_row.lastpos= filepos; + info->lastinx= inx; + if (inx >= 0) + { + (*info->s->keyinfo[inx].make_key)(info, &info->last_key, (uint) inx, + info->lastkey_buff, + record, info->cur_row.lastpos, + info->cur_row.trid); + info->update|=HA_STATE_KEY_CHANGED; /* Don't use indexposition */ + } + DBUG_RETURN(0); +} /* maria_rsame_pos */ diff --git a/storage/maria/ma_rt_index.c b/storage/maria/ma_rt_index.c new file mode 100644 index 00000000..6fddc895 --- /dev/null +++ b/storage/maria/ma_rt_index.c @@ -0,0 +1,1378 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_key_recover.h" + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" +#include "ma_rt_key.h" +#include "ma_rt_mbr.h" + +#define REINSERT_BUFFER_INC 10 +#define PICK_BY_AREA +/*#define PICK_BY_PERIMETER*/ + +typedef struct st_page_level +{ + uint level; + my_off_t offs; +} stPageLevel; + +typedef struct st_page_list +{ + uint n_pages; + uint m_pages; + stPageLevel *pages; +} stPageList; + + +/* + Find next key in r-tree according to search_flag recursively + + NOTES + Used in maria_rtree_find_first() and maria_rtree_find_next() + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +static int maria_rtree_find_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uint32 search_flag, + uint nod_cmp_flag, my_off_t page_pos, + int level) +{ + MARIA_SHARE *share= info->s; + uint nod_flag; + int res; + uchar *page_buf, *k, *last; + int key_data_length; + uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level; + MARIA_PAGE page; + my_bool buff_alloced; + + alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced, + keyinfo->block_length); + if (!page_buf) + { + my_errno= HA_ERR_OUT_OF_MEM; + return(-1); + } + + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + + key_data_length= keyinfo->keylength - share->base.rec_reflength; + + if (info->maria_rtree_recursion_depth >= level) + { + k= page_buf + *saved_key; + } + else + { + k= rt_PAGE_FIRST_KEY(share, page_buf, nod_flag); + } + last= rt_PAGE_END(&page); + + for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag)) + { + if (nod_flag) + { + /* this is an internal node in the tree */ + if (!(res= 
maria_rtree_key_cmp(keyinfo->seg, + info->first_mbr_key, k, + info->last_rkey_length, nod_cmp_flag))) + { + switch ((res= maria_rtree_find_req(info, keyinfo, search_flag, + nod_cmp_flag, + _ma_kpos(nod_flag, k), + level + 1))) + { + case 0: /* found - exit from recursion */ + *saved_key= (uint) (k - page_buf); + goto ok; + case 1: /* not found - continue searching */ + info->maria_rtree_recursion_depth= level; + break; + default: /* error */ + case -1: + goto err; + } + } + } + else + { + /* this is a leaf */ + if (!maria_rtree_key_cmp(keyinfo->seg, info->first_mbr_key, + k, info->last_rkey_length, search_flag)) + { + uchar *after_key= rt_PAGE_NEXT_KEY(share, k, key_data_length, 0); + MARIA_KEY tmp_key; + + /* + We don't need to set all MARIA_KEY elements here as + _ma_row_pos_from_key() only uses a few of them. + */ + tmp_key.keyinfo= keyinfo; + tmp_key.data= k; + tmp_key.data_length= key_data_length; + + info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key); + info->last_key.data_length= key_data_length; + info->last_key.ref_length= share->base.rec_reflength; + info->last_key.flag= 0; + memcpy(info->last_key.data, k, + info->last_key.data_length + info->last_key.ref_length); + info->maria_rtree_recursion_depth= level; + *saved_key= (uint) (last - page_buf); + + if (after_key < last) + { + uchar *keyread_buff= info->keyread_buff; + info->int_keypos= keyread_buff; + info->int_maxpos= keyread_buff + (last - after_key); + memcpy(keyread_buff, after_key, last - after_key); + info->keyread_buff_used= 0; + } + else + { + info->keyread_buff_used= 1; + } + + res= 0; + goto ok; + } + } + } + info->cur_row.lastpos= HA_OFFSET_ERROR; + my_errno= HA_ERR_KEY_NOT_FOUND; + res= 1; + +ok: + stack_alloc_free(page_buf, buff_alloced); + return res; + +err: + stack_alloc_free(page_buf, buff_alloced); + info->cur_row.lastpos= HA_OFFSET_ERROR; + return -1; +} + + +/* + Find first key in r-tree according to search_flag condition + + SYNOPSIS + maria_rtree_find_first() + info Handler to 
MARIA file + key Key to search for + search_flag Bitmap of flags how to do the search + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int maria_rtree_find_first(MARIA_HA *info, MARIA_KEY *key, uint32 search_flag) +{ + my_off_t root; + uint nod_cmp_flag; + MARIA_KEYDEF *keyinfo= key->keyinfo; + + /* + At the moment index can only properly handle the + MBR_INTERSECT, so we use it for all sorts of queries. + TODO: better search for CONTAINS/WITHIN. + */ + search_flag= nod_cmp_flag= MBR_INTERSECT; + if ((root= info->s->state.key_root[keyinfo->key_nr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + /* + Save searched key, include data pointer. + The data pointer is required if the search_flag contains MBR_DATA. + (minimum bounding rectangle) + */ + memcpy(info->first_mbr_key, key->data, key->data_length + key->ref_length); + info->last_rkey_length= key->data_length; + + info->maria_rtree_recursion_depth= -1; + info->keyread_buff_used= 1; + + /* + TODO better search for CONTAINS/WITHIN. + nod_cmp_flag= ((search_flag & (MBR_EQUAL | MBR_WITHIN)) ? + MBR_WITHIN : MBR_INTERSECT); + */ + return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root, + 0); +} + + +/* + Find next key in r-tree according to search_flag condition + + SYNOPSIS + maria_rtree_find_next() + info Handler to MARIA file + uint keynr Key number to use + search_flag Bitmap of flags how to do the search + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint32 search_flag) +{ + my_off_t root; + uint32 nod_cmp_flag; + MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr; + DBUG_ASSERT(info->last_key.keyinfo == keyinfo); + /* + At the moment index can only properly handle the + MBR_INTERSECT, so we use it for all sorts of queries. + TODO: better search for CONTAINS/WITHIN. 
+ */ + search_flag= nod_cmp_flag= MBR_INTERSECT; + + if (info->update & HA_STATE_DELETED) + return maria_rtree_find_first(info, &info->last_key, search_flag); + + if (!info->keyread_buff_used) + { + uchar *key= info->int_keypos; + + while (key < info->int_maxpos) + { + if (!maria_rtree_key_cmp(keyinfo->seg, + info->first_mbr_key, key, + info->last_rkey_length, search_flag)) + { + uchar *after_key= key + keyinfo->keylength; + MARIA_KEY tmp_key; + + /* + We don't need to set all MARIA_KEY elements here as + _ma_row_pos_from_key only uses a few of them. + */ + tmp_key.keyinfo= keyinfo; + tmp_key.data= key; + tmp_key.data_length= keyinfo->keylength - info->s->base.rec_reflength; + + info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key); + memcpy(info->last_key.data, key, info->last_key.data_length); + + if (after_key < info->int_maxpos) + info->int_keypos= after_key; + else + info->keyread_buff_used= 1; + return 0; + } + key+= keyinfo->keylength; + } + } + if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + /* + TODO better search for CONTAINS/WITHIN. + nod_cmp_flag= (((search_flag & (MBR_EQUAL | MBR_WITHIN)) ? 
+ MBR_WITHIN : MBR_INTERSECT)); + */ + return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root, + 0); +} + + +/* + Get next key in r-tree recursively + + NOTES + Used in maria_rtree_get_first() and maria_rtree_get_next() + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +static int maria_rtree_get_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uint key_length, my_off_t page_pos, int level) +{ + MARIA_SHARE *share= info->s; + uchar *page_buf, *last, *k; + uint nod_flag, key_data_length; + int res; + uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level; + my_bool buff_alloced; + MARIA_PAGE page; + + alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced, + keyinfo->block_length); + if (!page_buf) + { + my_errno= HA_ERR_OUT_OF_MEM; + return(-1); + } + + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + + key_data_length= keyinfo->keylength - share->base.rec_reflength; + + if (info->maria_rtree_recursion_depth >= level) + { + k= page.buff + *saved_key; + if (!nod_flag) + { + /* Only leaf pages contain data references. */ + /* Need to check next key with data reference. 
*/ + k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag); + } + } + else + { + k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag); + } + last= rt_PAGE_END(&page); + + for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag)) + { + if (nod_flag) + { + /* this is an internal node in the tree */ + switch ((res= maria_rtree_get_req(info, keyinfo, key_length, + _ma_kpos(nod_flag, k), level + 1))) + { + case 0: /* found - exit from recursion */ + *saved_key= (uint) (k - page.buff); + goto ok; + case 1: /* not found - continue searching */ + info->maria_rtree_recursion_depth= level; + break; + default: + case -1: /* error */ + goto err; + } + } + else + { + /* this is a leaf */ + uchar *after_key= rt_PAGE_NEXT_KEY(share, k, key_data_length, 0); + MARIA_KEY tmp_key; + + /* + We don't need to set all MARIA_KEY elements here as + _ma_row_pos_from_key() only uses a few of them. + */ + tmp_key.keyinfo= keyinfo; + tmp_key.data= k; + tmp_key.data_length= key_data_length; + + info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key); + info->last_key.data_length= key_data_length; + info->last_key.ref_length= share->base.rec_reflength; + + memcpy(info->last_key.data, k, + info->last_key.data_length + info->last_key.ref_length); + + info->maria_rtree_recursion_depth= level; + *saved_key= (uint) (k - page.buff); + + if (after_key < last) + { + uchar *keyread_buff= info->keyread_buff; + info->last_rtree_keypos= saved_key; + memcpy(keyread_buff, page.buff, page.size); + info->int_maxpos= keyread_buff + page.size; + info->keyread_buff_used= 0; + } + else + { + info->keyread_buff_used= 1; + } + + res= 0; + goto ok; + } + } + info->cur_row.lastpos= HA_OFFSET_ERROR; + my_errno= HA_ERR_KEY_NOT_FOUND; + res= 1; + +ok: + stack_alloc_free(page_buf, buff_alloced); + return res; + +err: + stack_alloc_free(page_buf, buff_alloced); + info->cur_row.lastpos= HA_OFFSET_ERROR; + return -1; +} + + +/* + Get first key in r-tree + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int 
maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length) +{ + my_off_t root; + MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr; + + if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + info->maria_rtree_recursion_depth= -1; + info->keyread_buff_used= 1; + + return maria_rtree_get_req(info, keyinfo, key_length, root, 0); +} + + +/* + Get next key in r-tree + + RETURN + -1 Error + 0 Found + 1 Not found +*/ + +int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length) +{ + my_off_t root; + MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr; + uchar *keyread_buff= info->keyread_buff; + + if (!info->keyread_buff_used) + { + uint key_data_length= keyinfo->keylength - info->s->base.rec_reflength; + /* rt_PAGE_NEXT_KEY(*info->last_rtree_keypos) */ + uchar *key= keyread_buff + *info->last_rtree_keypos + keyinfo->keylength; + /* rt_PAGE_NEXT_KEY(key) */ + uchar *after_key= key + keyinfo->keylength; + MARIA_KEY tmp_key; + + tmp_key.keyinfo= keyinfo; + tmp_key.data= key; + tmp_key.data_length= key_data_length; + tmp_key.ref_length= info->s->base.rec_reflength; + tmp_key.flag= 0; + + info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key); + _ma_copy_key(&info->last_key, &tmp_key); + + *info->last_rtree_keypos= (uint) (key - keyread_buff); + if (after_key >= info->int_maxpos) + { + info->keyread_buff_used= 1; + } + + return 0; + } + else + { + if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + return -1; + } + + return maria_rtree_get_req(info, &keyinfo[keynr], key_length, root, 0); + } +} + + +/* + Choose non-leaf better key for insertion + + Returns a pointer inside the page_buf buffer. 
+*/ +#ifdef PICK_BY_PERIMETER +static const uchar *maria_rtree_pick_key(const MARIA_KEY *key, + const MARIA_PAGE *page) +{ + double increase; + double UNINIT_VAR(best_incr); + double perimeter; + double UNINIT_VAR(best_perimeter); + uchar *best_key= NULL; + const MARIA_HA *info= page->info; + + uchar *k= rt_PAGE_FIRST_KEY(info->s, page->buf, page->node); + uchar *last= rt_PAGE_END(info, page); + + for (; k < last; k= rt_PAGE_NEXT_KEY(k, key->data_length, nod_flag)) + { + if ((increase= maria_rtree_perimeter_increase(keyinfo->seg, k, key, + &perimeter)) == -1) + return NULL; + if ((increase < best_incr)|| + (increase == best_incr && perimeter < best_perimeter)) + { + best_key= k; + best_perimeter= perimeter; + best_incr= increase; + } + } + return best_key; +} + +#endif /*PICK_BY_PERIMETER*/ + +#ifdef PICK_BY_AREA +static const uchar *maria_rtree_pick_key(const MARIA_KEY *key, + const MARIA_PAGE *page) +{ + const MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + double increase; + double best_incr= DBL_MAX; + double area; + double UNINIT_VAR(best_area); + const uchar *best_key= NULL; + const uchar *k= rt_PAGE_FIRST_KEY(share, page->buff, page->node); + const uchar *last= rt_PAGE_END(page); + + for (; k < last; + k= rt_PAGE_NEXT_KEY(share, k, key->data_length, page->node)) + { + /* The following is safe as -1.0 is an exact number */ + if ((increase= maria_rtree_area_increase(key->keyinfo->seg, k, key->data, + key->data_length + + key->ref_length, + &area)) == -1.0) + return NULL; + /* The following should be safe, even if we compare doubles */ + if (!best_key || increase < best_incr || + ((increase == best_incr) && (area < best_area))) + { + best_key= k; + best_area= area; + best_incr= increase; + } + } + return best_key; +} + +#endif /*PICK_BY_AREA*/ + +/* + Go down and insert key into tree + + RETURN + -1 Error + 0 Child was not split + 1 Child was split +*/ + +static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEY *key, + my_off_t page_pos, 
my_off_t *new_page, + int ins_level, int level) +{ + uint nod_flag; + uint key_length= key->data_length; + int res; + my_bool buff_alloced; + uchar *page_buf, *k; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("maria_rtree_insert_req"); + + alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced, + keyinfo->block_length + keyinfo->max_store_length); + if (!page_buf) + { + my_errno= HA_ERR_OUT_OF_MEM; + DBUG_RETURN(-1); /* purecov: inspected */ + } + + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + DBUG_PRINT("rtree", ("page: %lu level: %d ins_level: %d nod_flag: %u", + (ulong) page.pos, level, ins_level, nod_flag)); + + if ((ins_level == -1 && nod_flag) || /* key: go down to leaf */ + (ins_level > -1 && ins_level > level)) /* branch: go down to ins_level */ + { + if (!(k= (uchar *)maria_rtree_pick_key(key, &page))) + goto err; + /* k is now a pointer inside the page_buf buffer */ + switch ((res= maria_rtree_insert_req(info, key, + _ma_kpos(nod_flag, k), new_page, + ins_level, level + 1))) + { + case 0: /* child was not split, most common case */ + { + maria_rtree_combine_rect(keyinfo->seg, k, key->data, k, key_length); + if (share->now_transactional && + _ma_log_change(&page, k, key_length, + KEY_OP_DEBUG_RTREE_COMBINE)) + goto err; + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + goto ok; + } + case 1: /* child was split */ + { + /* Set new_key to point to a free buffer area */ + uchar *new_key_buff= page_buf + keyinfo->block_length + nod_flag; + MARIA_KEY new_key; + MARIA_KEY k_key; + + DBUG_ASSERT(nod_flag); + k_key.keyinfo= new_key.keyinfo= keyinfo; + new_key.data= new_key_buff; + k_key.data= k; + k_key.data_length= new_key.data_length= key->data_length; + k_key.ref_length= new_key.ref_length= key->ref_length; + k_key.flag= 
new_key.flag= 0; /* Safety */ + + /* set proper MBR for key */ + if (maria_rtree_set_key_mbr(info, &k_key, _ma_kpos(nod_flag, k))) + goto err; + if (share->now_transactional && + _ma_log_change(&page, k, key_length, + KEY_OP_DEBUG_RTREE_SPLIT)) + goto err; + /* add new key for new page */ + _ma_kpointer(info, new_key_buff - nod_flag, *new_page); + if (maria_rtree_set_key_mbr(info, &new_key, *new_page)) + goto err; + res= maria_rtree_add_key(&new_key, &page, new_page); + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + goto ok; + } + default: + case -1: /* error */ + { + goto err; + } + } + } + else + { + res= maria_rtree_add_key(key, &page, new_page); + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + +ok: + stack_alloc_free(page_buf, buff_alloced); + DBUG_RETURN(res); + +err: + res= -1; /* purecov: inspected */ + goto ok; /* purecov: inspected */ +} + + +/** + Insert key into the tree + + @param info table + @param key KEY to insert + @param ins_level at which level key insertion should start + @param root put new key_root there + + @return Operation result + @retval -1 Error + @retval 0 Root was not split + @retval 1 Root was split +*/ + +int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key, int ins_level, + my_off_t *root) +{ + my_off_t old_root; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + int res; + my_off_t new_page; + enum pagecache_page_lock write_lock; + DBUG_ENTER("maria_rtree_insert_level"); + + if ((old_root= share->state.key_root[keyinfo->key_nr]) == HA_OFFSET_ERROR) + { + MARIA_PINNED_PAGE tmp_page_link, *page_link; + MARIA_PAGE page; + + page_link= &tmp_page_link; + if ((old_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) + DBUG_RETURN(-1); + write_lock= page_link->write_lock; + info->keyread_buff_used= 1; + bzero(info->buff, 
share->block_size); + _ma_store_keynr(share, info->buff, keyinfo->key_nr); + _ma_store_page_used(share, info->buff, share->keypage_header); + _ma_page_setup(&page, info, keyinfo, old_root, info->buff); + + if (share->now_transactional && _ma_log_new(&page, 1)) + DBUG_RETURN(1); + + res= maria_rtree_add_key(key, &page, NULL); + if (_ma_write_keypage(&page, write_lock, DFLT_INIT_HITS)) + DBUG_RETURN(1); + *root= old_root; + DBUG_RETURN(res); + } + + switch ((res= maria_rtree_insert_req(info, key, old_root, &new_page, + ins_level, 0))) + { + case 0: /* root was not split */ + { + break; + } + case 1: /* root was split, grow a new root; very rare */ + { + uchar *new_root_buf, *new_key_buff; + my_bool new_root_buf_alloced; + my_off_t new_root; + uint nod_flag= share->base.key_reflength; + MARIA_PINNED_PAGE tmp_page_link, *page_link; + MARIA_KEY new_key; + MARIA_PAGE page; + page_link= &tmp_page_link; + + DBUG_PRINT("rtree", ("root was split, grow a new root")); + + alloc_on_stack(*info->stack_end_ptr, new_root_buf, new_root_buf_alloced, + keyinfo->block_length + keyinfo->max_store_length); + if (!new_root_buf) + { + my_errno= HA_ERR_OUT_OF_MEM; + DBUG_RETURN(-1); /* purecov: inspected */ + } + + bzero(new_root_buf, keyinfo->block_length); + _ma_store_keypage_flag(share, new_root_buf, KEYPAGE_FLAG_ISNOD); + _ma_store_keynr(share, new_root_buf, keyinfo->key_nr); + _ma_store_page_used(share, new_root_buf, share->keypage_header); + if ((new_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) + goto err; + write_lock= page_link->write_lock; + + _ma_page_setup(&page, info, keyinfo, new_root, new_root_buf); + + if (share->now_transactional && _ma_log_new(&page, 1)) + goto err; + + /* Point to some free space */ + new_key_buff= new_root_buf + keyinfo->block_length + nod_flag; + new_key.keyinfo= keyinfo; + new_key.data= new_key_buff; + new_key.data_length= key->data_length; + new_key.ref_length= key->ref_length; + new_key.flag= 0; + + _ma_kpointer(info, 
new_key_buff - nod_flag, old_root); + if (maria_rtree_set_key_mbr(info, &new_key, old_root)) + goto err; + if (maria_rtree_add_key(&new_key, &page, NULL) == -1) + goto err; + _ma_kpointer(info, new_key_buff - nod_flag, new_page); + if (maria_rtree_set_key_mbr(info, &new_key, new_page)) + goto err; + if (maria_rtree_add_key(&new_key, &page, NULL) == -1) + goto err; + if (_ma_write_keypage(&page, write_lock, DFLT_INIT_HITS)) + goto err; + *root= new_root; + DBUG_PRINT("rtree", ("new root page: %lu level: %d nod_flag: %u", + (ulong) new_root, 0, page.node)); + + stack_alloc_free(new_root_buf, new_root_buf_alloced); + break; +err: + stack_alloc_free(new_root_buf, new_root_buf_alloced); + DBUG_RETURN(-1); /* purecov: inspected */ + } + default: + case -1: /* error */ + { + DBUG_ASSERT(0); + break; + } + } + DBUG_RETURN(res); +} + + +/* + Insert key into the tree - interface function + + RETURN + 1 Error + 0 OK +*/ + +my_bool maria_rtree_insert(MARIA_HA *info, MARIA_KEY *key) +{ + int res; + MARIA_SHARE *share= info->s; + my_off_t *root, new_root; + LSN lsn= LSN_IMPOSSIBLE; + DBUG_ENTER("maria_rtree_insert"); + + if (!key) + DBUG_RETURN(1); /* _ma_sp_make_key failed */ + + root= &share->state.key_root[key->keyinfo->key_nr]; + new_root= *root; + + if ((res= (maria_rtree_insert_level(info, key, -1, &new_root) == -1))) + goto err; + if (share->now_transactional) + res= _ma_write_undo_key_insert(info, key, root, new_root, &lsn); + else + { + *root= new_root; + _ma_fast_unlock_key_del(info); + } + _ma_unpin_all_pages_and_finalize_row(info, lsn); +err: + DBUG_RETURN(res != 0); +} + + +/* + Fill reinsert page buffer + + RETURN + 1 Error + 0 OK +*/ + +static my_bool maria_rtree_fill_reinsert_list(stPageList *ReinsertList, + my_off_t page, int level) +{ + DBUG_ENTER("maria_rtree_fill_reinsert_list"); + DBUG_PRINT("rtree", ("page: %lu level: %d", (ulong) page, level)); + if (ReinsertList->n_pages == ReinsertList->m_pages) + { + ReinsertList->m_pages += REINSERT_BUFFER_INC; + if 
(!(ReinsertList->pages= (stPageLevel*)my_realloc(PSI_INSTRUMENT_ME, (uchar*)ReinsertList->pages, + ReinsertList->m_pages * sizeof(stPageLevel), MYF(MY_ALLOW_ZERO_PTR)))) + goto err; + } + /* save page to ReinsertList */ + ReinsertList->pages[ReinsertList->n_pages].offs= page; + ReinsertList->pages[ReinsertList->n_pages].level= level; + ReinsertList->n_pages++; + DBUG_RETURN(0); + +err: + DBUG_RETURN(1); /* purecov: inspected */ +} + + +/* + Go down and delete key from the tree + + RETURN + -1 Error + 0 Deleted + 1 Not found + 2 Empty leaf +*/ + +static int maria_rtree_delete_req(MARIA_HA *info, const MARIA_KEY *key, + my_off_t page_pos, uint *page_size, + stPageList *ReinsertList, int level) +{ + ulong i; + uint nod_flag; + int res; + my_bool buff_alloced; + uchar *page_buf, *last, *k; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("maria_rtree_delete_req"); + + alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced, + keyinfo->block_length); + if (!page_buf) + { + my_errno= HA_ERR_OUT_OF_MEM; + DBUG_RETURN(-1); + } + + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + DBUG_PRINT("rtree", ("page: %lu level: %d nod_flag: %u", + (ulong) page_pos, level, nod_flag)); + + k= rt_PAGE_FIRST_KEY(share, page_buf, nod_flag); + last= rt_PAGE_END(&page); + + for (i= 0; + k < last; + k= rt_PAGE_NEXT_KEY(share, k, key->data_length, nod_flag), i++) + { + if (nod_flag) + { + /* not leaf */ + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key->data_length, + MBR_WITHIN)) + { + switch ((res= maria_rtree_delete_req(info, key, + _ma_kpos(nod_flag, k), + page_size, ReinsertList, + level + 1))) + { + case 0: /* deleted */ + { + /* test page filling */ + if (*page_size + key->data_length >= + rt_PAGE_MIN_SIZE(keyinfo->block_length)) + { + /* OK */ + /* Calculate a new key value (MBR) for the shrinked block. 
*/ + MARIA_KEY tmp_key; + tmp_key.keyinfo= keyinfo; + tmp_key.data= k; + tmp_key.data_length= key->data_length; + tmp_key.ref_length= key->ref_length; + tmp_key.flag= 0; /* Safety */ + + if (maria_rtree_set_key_mbr(info, &tmp_key, + _ma_kpos(nod_flag, k))) + goto err; + if (share->now_transactional && + _ma_log_change(&page, k, key->data_length, + KEY_OP_DEBUG_RTREE_SET_KEY)) + goto err; + page_mark_changed(info, &page) + if (_ma_write_keypage(&page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + else + { + /* + Too small: delete key & add it descendant to reinsert list. + Store position and level of the block so that it can be + accessed later for inserting the remaining keys. + */ + DBUG_PRINT("rtree", ("too small. move block to reinsert list")); + if (maria_rtree_fill_reinsert_list(ReinsertList, + _ma_kpos(nod_flag, k), + level + 1)) + goto err; + /* + Delete the key that references the block. This makes the + block disappear from the index. Hence we need to insert + its remaining keys later. Note: if the block is a branch + block, we do not only remove this block, but the whole + subtree. So we need to re-insert its keys on the same + level later to reintegrate the subtrees. 
+ */ + if (maria_rtree_delete_key(&page, k, key->data_length)) + goto err; + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + *page_size= page.size; + } + + goto ok; + } + case 1: /* not found - continue searching */ + { + break; + } + case 2: /* vacuous case: last key in the leaf */ + { + if (maria_rtree_delete_key(&page, k, key->data_length)) + goto err; + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + *page_size= page.size; + res= 0; + goto ok; + } + default: /* error */ + case -1: + { + goto err; + } + } + } + } + else + { + /* leaf */ + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key->data_length, + MBR_EQUAL | MBR_DATA)) + { + page_mark_changed(info, &page); + if (maria_rtree_delete_key(&page, k, key->data_length)) + goto err; + *page_size= page.size; + if (*page_size == info->s->keypage_header) + { + /* last key in the leaf */ + res= 2; + if (_ma_dispose(info, page.pos, 0)) + goto err; + } + else + { + res= 0; + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + goto ok; + } + } + } + res= 1; + +ok: + stack_alloc_free(page_buf, buff_alloced); + DBUG_RETURN(res); + +err: + stack_alloc_free(page_buf, buff_alloced); + DBUG_RETURN(-1); /* purecov: inspected */ +} + + +/* + Delete key - interface function + + RETURN + 1 Error + 0 Deleted +*/ + +my_bool maria_rtree_delete(MARIA_HA *info, MARIA_KEY *key) +{ + MARIA_SHARE *share= info->s; + my_off_t new_root= share->state.key_root[key->keyinfo->key_nr]; + int res; + LSN lsn= LSN_IMPOSSIBLE; + DBUG_ENTER("maria_rtree_delete"); + + if ((res= maria_rtree_real_delete(info, key, &new_root))) + goto err; + + if (share->now_transactional) + res= _ma_write_undo_key_delete(info, key, new_root, &lsn); + else + share->state.key_root[key->keyinfo->key_nr]= new_root; + +err: + _ma_fast_unlock_key_del(info); + 
_ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res != 0); +} + + +my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key, + my_off_t *root) +{ + uint page_size; + stPageList ReinsertList; + my_off_t old_root; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + uint key_data_length= key->data_length; + my_bool buff_alloced= 0; + uchar *page_buf= 0; + DBUG_ENTER("maria_rtree_real_delete"); + + if ((old_root= share->state.key_root[keyinfo->key_nr]) == + HA_OFFSET_ERROR) + { + my_errno= HA_ERR_END_OF_FILE; + DBUG_RETURN(1); /* purecov: inspected */ + } + DBUG_PRINT("rtree", ("starting deletion at root page: %lu", + (ulong) old_root)); + + ReinsertList.pages= NULL; + ReinsertList.n_pages= 0; + ReinsertList.m_pages= 0; + + switch (maria_rtree_delete_req(info, key, old_root, &page_size, + &ReinsertList, 0)) { + case 2: /* empty */ + { + *root= HA_OFFSET_ERROR; + break; + } + case 0: /* deleted */ + { + uint nod_flag; + ulong i; + MARIA_PAGE page; + MARIA_KEY tmp_key; + + tmp_key.keyinfo= key->keyinfo; + tmp_key.data_length= key->data_length; + tmp_key.ref_length= key->ref_length; + tmp_key.flag= 0; /* Safety */ + + if (ReinsertList.n_pages) + { + alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced, + keyinfo->block_length); + if (!page_buf) + { + my_errno= HA_ERR_OUT_OF_MEM; + goto err; + } + + for (i= 0; i < ReinsertList.n_pages; ++i) + { + uchar *k, *last; + if (_ma_fetch_keypage(&page, info, keyinfo, ReinsertList.pages[i].offs, + PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, page_buf, 0)) + goto err; + nod_flag= page.node; + DBUG_PRINT("rtree", ("reinserting keys from " + "page: %lu level: %d nod_flag: %u", + (ulong) ReinsertList.pages[i].offs, + ReinsertList.pages[i].level, nod_flag)); + + k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag); + last= rt_PAGE_END(&page); + for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, + nod_flag)) + { + int res; + tmp_key.data= k; + if ((res= maria_rtree_insert_level(info, 
&tmp_key, + ReinsertList.pages[i].level, + root)) == -1) + goto err; + if (res) + { + uint j; + DBUG_PRINT("rtree", ("root has been split, adjust levels")); + for (j= i; j < ReinsertList.n_pages; j++) + { + ReinsertList.pages[j].level++; + DBUG_PRINT("rtree", ("keys from page: %lu now level: %d", + (ulong) ReinsertList.pages[i].offs, + ReinsertList.pages[i].level)); + } + } + } + page_mark_changed(info, &page); + if (_ma_dispose(info, page.pos, 0)) + goto err; + } + } + + /* check for redundant root (not leaf, 1 child) and eliminate */ + if ((old_root= *root) == HA_OFFSET_ERROR) + goto err; + if (_ma_fetch_keypage(&page, info, keyinfo, old_root, + PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, info->buff, 0)) + goto err; + nod_flag= page.node; + if (nod_flag && (page.size == share->keypage_header + key_data_length + + nod_flag)) + { + *root= _ma_kpos(nod_flag, + rt_PAGE_FIRST_KEY(share, info->buff, nod_flag)); + page_mark_changed(info, &page); + if (_ma_dispose(info, page.pos, 0)) + goto err; + } + info->update= HA_STATE_DELETED; + break; + } + case 1: /* not found */ + { + my_errno= HA_ERR_KEY_NOT_FOUND; + goto err; + } + case -1: /* error */ + default: + goto err; /* purecov: inspected */ + } + my_free(ReinsertList.pages); + stack_alloc_free(page_buf, buff_alloced); + DBUG_RETURN(0); + +err: + my_free(ReinsertList.pages); + stack_alloc_free(page_buf, buff_alloced); + DBUG_RETURN(1); +} + + +/* + Estimate number of suitable keys in the tree + + RETURN + estimated value +*/ + +ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag) +{ + my_off_t root; + uint i= 0; + uint nod_flag, key_data_length; + uchar *page_buf, *k, *last; + double area= 0; + ha_rows res= 0; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + my_bool buff_alloced; + + if (flag & MBR_DISJOINT) + return HA_POS_ERROR; + + if ((root= share->state.key_root[key->keyinfo->key_nr]) == HA_OFFSET_ERROR) + return HA_POS_ERROR; + + 
alloc_on_stack(*info->stack_end_ptr, page_buf, buff_alloced, + keyinfo->block_length); + if (!page_buf) + return(HA_POS_ERROR); + + if (_ma_fetch_keypage(&page, info, keyinfo, root, + PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, page_buf, + 0)) + goto err; + nod_flag= page.node; + + key_data_length= key->data_length; + + k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag); + last= rt_PAGE_END(&page); + + for (; k < last; + k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag), i++) + { + if (nod_flag) + { + double k_area= maria_rtree_rect_volume(keyinfo->seg, k, key_data_length); + + /* The following should be safe, even if we compare doubles */ + if (k_area == 0) + { + if (flag & (MBR_CONTAIN | MBR_INTERSECT)) + { + area+= 1; + } + else if (flag & (MBR_WITHIN | MBR_EQUAL)) + { + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key_data_length, + MBR_WITHIN)) + area+= 1; + } + else + goto err; + } + else + { + if (flag & (MBR_CONTAIN | MBR_INTERSECT)) + { + area+= maria_rtree_overlapping_area(keyinfo->seg, key->data, k, + key_data_length) / k_area; + } + else if (flag & (MBR_WITHIN | MBR_EQUAL)) + { + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key_data_length, + MBR_WITHIN)) + area+= (maria_rtree_rect_volume(keyinfo->seg, key->data, + key_data_length) / k_area); + } + else + goto err; + } + } + else + { + if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key_data_length, + flag)) + ++res; + } + } + if (nod_flag) + { + if (i) + res= (ha_rows) (area / i * info->state->records); + else + res= HA_POS_ERROR; + } + + stack_alloc_free(page_buf, buff_alloced); + return res; + +err: + stack_alloc_free(page_buf, buff_alloced); + return HA_POS_ERROR; +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_rt_index.h b/storage/maria/ma_rt_index.h new file mode 100644 index 00000000..42df5cf9 --- /dev/null +++ b/storage/maria/ma_rt_index.h @@ -0,0 +1,46 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + 
#ifndef _rt_index_h
#define _rt_index_h

#ifdef HAVE_RTREE_KEYS

/*
  Helpers for walking the keys of an R-tree page.

  Every macro parameter is parenthesized in the expansion so that an
  argument which is itself an expression (for example '&share_var' or
  'a + b') is evaluated as one operand.
*/

/* First key on a page: skips the page header and the node pointer */
#define rt_PAGE_FIRST_KEY(share, page, nod_flag) \
  ((page) + (share)->keypage_header + (nod_flag))

/*
  Key following 'key'. On node pages a key is followed by a pointer of
  nod_flag bytes, on leaf pages by a reference of base.rec_reflength bytes.
*/
#define rt_PAGE_NEXT_KEY(share, key, key_length, nod_flag) \
  ((key) + (key_length) + \
   ((nod_flag) ? (nod_flag) : (share)->base.rec_reflength))

/* First byte after the used part of a page; 'page' is a MARIA_PAGE* */
#define rt_PAGE_END(page) ((page)->buff + (page)->size)

/* One third of the block, not counting the checksum */
#define rt_PAGE_MIN_SIZE(block_length) \
  ((uint) ((block_length) - KEYPAGE_CHECKSUM_SIZE) / 3)

my_bool maria_rtree_insert(MARIA_HA *info, MARIA_KEY *key);
my_bool maria_rtree_delete(MARIA_HA *info, MARIA_KEY *key);
int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key,
                             int ins_level, my_off_t *root);
my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
                                my_off_t *root);
int maria_rtree_find_first(MARIA_HA *info, MARIA_KEY *key, uint search_flag);
int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint32 search_flag);

int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length);
int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length);

ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag);

int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
                           my_off_t *new_page_offs);
#endif /*HAVE_RTREE_KEYS*/
#endif /* _rt_index_h */
b/storage/maria/ma_rt_key.c new file mode 100644 index 00000000..231bd9ba --- /dev/null +++ b/storage/maria/ma_rt_key.c @@ -0,0 +1,120 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_key_recover.h" + +#ifdef HAVE_RTREE_KEYS +#include "ma_rt_index.h" +#include "ma_rt_key.h" +#include "ma_rt_mbr.h" + +/* + Add key to the page + + RESULT VALUES + -1 Error + 0 Not split + 1 Split +*/ + +int maria_rtree_add_key(const MARIA_KEY *key, MARIA_PAGE *page, + my_off_t *new_page) +{ + MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + uint page_size= page->size; + uint nod_flag= page->node; + uchar *key_pos= rt_PAGE_END(page); + uint tot_key_length= key->data_length + key->ref_length + nod_flag; + DBUG_ENTER("maria_rtree_add_key"); + + if (page_size + tot_key_length <= + (uint)(key->keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE)) + { + /* split won't be necessary */ + if (nod_flag) + { + DBUG_ASSERT(_ma_kpos(nod_flag, key->data) < + info->state->key_file_length); + /* We don't store reference to row on nod pages for rtree index */ + tot_key_length-= key->ref_length; + } + /* save key */ + memcpy(key_pos, key->data - nod_flag, tot_key_length); + page->size+= tot_key_length; + page_store_size(share, page); + if (share->now_transactional && + 
_ma_log_add(page, (uint)(key_pos - page->buff), + key_pos, tot_key_length, tot_key_length, 0, + KEY_OP_DEBUG_LOG_ADD_1)) + DBUG_RETURN(-1); + DBUG_RETURN(0); + } + DBUG_RETURN(maria_rtree_split_page(key, page, new_page) ? -1 : 1); +} + + +/* + Delete key from the page + + Notes + key_length is only the data part of the key +*/ + +int maria_rtree_delete_key(MARIA_PAGE *page, uchar *key, uint key_length) +{ + MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + uint key_length_with_nod_flag; + uchar *key_start; + + key_start= key - page->node; + if (!page->node) + key_length+= share->base.rec_reflength; + + memmove(key_start, key + key_length, page->size - key_length - + (key - page->buff)); + key_length_with_nod_flag= key_length + page->node; + page->size-= key_length_with_nod_flag; + page_store_size(share, page); + if (share->now_transactional && + _ma_log_delete(page, key_start, 0, key_length_with_nod_flag, + 0, KEY_OP_DEBUG_LOG_DEL_CHANGE_RT)) + return -1; + return 0; +} + + +/* + Calculate and store key MBR into *key. +*/ + +int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEY *key, + my_off_t child_page) +{ + MARIA_PAGE page; + DBUG_ENTER("maria_rtree_set_key_mbr"); + if (_ma_fetch_keypage(&page, info, key->keyinfo, child_page, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->buff, 0)) + DBUG_RETURN(-1); + + DBUG_RETURN(maria_rtree_page_mbr(key->keyinfo->seg, + &page, key->data, key->data_length)); +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_rt_key.h b/storage/maria/ma_rt_key.h new file mode 100644 index 00000000..e1bd6edc --- /dev/null +++ b/storage/maria/ma_rt_key.h @@ -0,0 +1,31 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Written by Ramil Kalimullin, who has a shared copyright to this code */ + +#ifndef _rt_key_h +#define _rt_key_h + +#ifdef HAVE_RTREE_KEYS + +int maria_rtree_add_key(const MARIA_KEY *key, MARIA_PAGE *page, + my_off_t *new_page); +int maria_rtree_delete_key(MARIA_PAGE *page, uchar *key, uint key_length); +int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEY *key, + my_off_t child_page); + +#endif /*HAVE_RTREE_KEYS*/ +#endif /* _rt_key_h */ diff --git a/storage/maria/ma_rt_mbr.c b/storage/maria/ma_rt_mbr.c new file mode 100644 index 00000000..9f8946bc --- /dev/null +++ b/storage/maria/ma_rt_mbr.c @@ -0,0 +1,818 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" +#include "ma_rt_mbr.h" + +#define INTERSECT_CMP(amin, amax, bmin, bmax) ((amin > bmax) || (bmin > amax)) +#define CONTAIN_CMP(amin, amax, bmin, bmax) ((bmin > amin) || (bmax < amax)) +#define WITHIN_CMP(amin, amax, bmin, bmax) ((amin > bmin) || (amax < bmax)) +#define DISJOINT_CMP(amin, amax, bmin, bmax) ((amin <= bmax) && (bmin <= amax)) +#define EQUAL_CMP(amin, amax, bmin, bmax) ((amin != bmin) || (amax != bmax)) + +#define FCMP(A, B) ((int)(A) - (int)(B)) +#define p_inc(A, B, X) {A += X; B += X;} + +#define RT_CMP(nextflag) \ + if (nextflag & MBR_INTERSECT) \ + { \ + if (INTERSECT_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_CONTAIN) \ + { \ + if (CONTAIN_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_WITHIN) \ + { \ + if (WITHIN_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_EQUAL) \ + { \ + if (EQUAL_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + } \ + else if (nextflag & MBR_DISJOINT) \ + { \ + if (DISJOINT_CMP(amin, amax, bmin, bmax)) \ + return 1; \ + }\ + else /* if unknown comparison operator */ \ + { \ + DBUG_ASSERT(0); \ + } + +#define RT_CMP_KORR(type, korr_func, len, nextflag) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + RT_CMP(nextflag); \ +} + +#define RT_CMP_GET(type, get_func, len, nextflag) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + RT_CMP(nextflag); \ +} + +/* + Compares two keys a and b depending on nextflag + nextflag can contain these flags: + MBR_INTERSECT(a,b) a 
overlaps b + MBR_CONTAIN(a,b) a contains b + MBR_DISJOINT(a,b) a disjoint b + MBR_WITHIN(a,b) a within b + MBR_EQUAL(a,b) All coordinates of MBRs are equal + MBR_DATA(a,b) Data reference is the same + Returns 0 on success. +*/ + +int maria_rtree_key_cmp(HA_KEYSEG *keyseg, const uchar *b, const uchar *a, + uint key_length, uint32 nextflag) +{ + for (; (int) key_length > 0; keyseg += 2 ) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_CMP_KORR(int8, mi_sint1korr, 1, nextflag); + break; + case HA_KEYTYPE_BINARY: + RT_CMP_KORR(uint8, mi_uint1korr, 1, nextflag); + break; + case HA_KEYTYPE_SHORT_INT: + RT_CMP_KORR(int16, mi_sint2korr, 2, nextflag); + break; + case HA_KEYTYPE_USHORT_INT: + RT_CMP_KORR(uint16, mi_uint2korr, 2, nextflag); + break; + case HA_KEYTYPE_INT24: + RT_CMP_KORR(int32, mi_sint3korr, 3, nextflag); + break; + case HA_KEYTYPE_UINT24: + RT_CMP_KORR(uint32, mi_uint3korr, 3, nextflag); + break; + case HA_KEYTYPE_LONG_INT: + RT_CMP_KORR(int32, mi_sint4korr, 4, nextflag); + break; + case HA_KEYTYPE_ULONG_INT: + RT_CMP_KORR(uint32, mi_uint4korr, 4, nextflag); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_CMP_KORR(longlong, mi_sint8korr, 8, nextflag) + break; + case HA_KEYTYPE_ULONGLONG: + RT_CMP_KORR(ulonglong, mi_uint8korr, 8, nextflag) + break; +#endif + case HA_KEYTYPE_FLOAT: + /* The following should be safe, even if we compare doubles */ + RT_CMP_GET(float, mi_float4get, 4, nextflag); + break; + case HA_KEYTYPE_DOUBLE: + RT_CMP_GET(double, mi_float8get, 8, nextflag); + break; + case HA_KEYTYPE_END: + goto end; + default: + return 1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } + +end: + if (nextflag & MBR_DATA) + { + const uchar *end= a + keyseg->length; + do + { + if (*a++ != *b++) + return FCMP(a[-1], b[-1]); + } while (a != end); + } + return 0; +} + +#define RT_VOL_KORR(type, korr_func, len, cast) \ +{ \ 
+ type amin, amax; \ + amin= korr_func(a); \ + amax= korr_func(a+len); \ + res *= (cast(amax) - cast(amin)); \ +} + +#define RT_VOL_GET(type, get_func, len, cast) \ +{ \ + type amin, amax; \ + get_func(amin, a); \ + get_func(amax, a+len); \ + res *= (cast(amax) - cast(amin)); \ +} + +/* + Calculates rectangle volume +*/ +double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar *a, uint key_length) +{ + double res= 1; + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_VOL_KORR(int8, mi_sint1korr, 1, (double)); + break; + case HA_KEYTYPE_BINARY: + RT_VOL_KORR(uint8, mi_uint1korr, 1, (double)); + break; + case HA_KEYTYPE_SHORT_INT: + RT_VOL_KORR(int16, mi_sint2korr, 2, (double)); + break; + case HA_KEYTYPE_USHORT_INT: + RT_VOL_KORR(uint16, mi_uint2korr, 2, (double)); + break; + case HA_KEYTYPE_INT24: + RT_VOL_KORR(int32, mi_sint3korr, 3, (double)); + break; + case HA_KEYTYPE_UINT24: + RT_VOL_KORR(uint32, mi_uint3korr, 3, (double)); + break; + case HA_KEYTYPE_LONG_INT: + RT_VOL_KORR(int32, mi_sint4korr, 4, (double)); + break; + case HA_KEYTYPE_ULONG_INT: + RT_VOL_KORR(uint32, mi_uint4korr, 4, (double)); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_VOL_KORR(longlong, mi_sint8korr, 8, (double)); + break; + case HA_KEYTYPE_ULONGLONG: + RT_VOL_KORR(longlong, mi_sint8korr, 8, ulonglong2double); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_VOL_GET(float, mi_float4get, 4, (double)); + break; + case HA_KEYTYPE_DOUBLE: + RT_VOL_GET(double, mi_float8get, 8, (double)); + break; + case HA_KEYTYPE_END: + key_length= 0; + break; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + } + return res; +} + +#define RT_D_MBR_KORR(type, korr_func, len, cast) \ +{ \ + type amin, amax; \ + amin= korr_func(a); \ + amax= korr_func(a+len); \ + *res++= cast(amin); \ + *res++= cast(amax); \ +} + +#define 
RT_D_MBR_GET(type, get_func, len, cast) \ +{ \ + type amin, amax; \ + get_func(amin, a); \ + get_func(amax, a+len); \ + *res++= cast(amin); \ + *res++= cast(amax); \ +} + + +/* + Creates an MBR as an array of doubles. + Fills *res. +*/ + +int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a, + uint key_length, double *res) +{ + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_D_MBR_KORR(int8, mi_sint1korr, 1, (double)); + break; + case HA_KEYTYPE_BINARY: + RT_D_MBR_KORR(uint8, mi_uint1korr, 1, (double)); + break; + case HA_KEYTYPE_SHORT_INT: + RT_D_MBR_KORR(int16, mi_sint2korr, 2, (double)); + break; + case HA_KEYTYPE_USHORT_INT: + RT_D_MBR_KORR(uint16, mi_uint2korr, 2, (double)); + break; + case HA_KEYTYPE_INT24: + RT_D_MBR_KORR(int32, mi_sint3korr, 3, (double)); + break; + case HA_KEYTYPE_UINT24: + RT_D_MBR_KORR(uint32, mi_uint3korr, 3, (double)); + break; + case HA_KEYTYPE_LONG_INT: + RT_D_MBR_KORR(int32, mi_sint4korr, 4, (double)); + break; + case HA_KEYTYPE_ULONG_INT: + RT_D_MBR_KORR(uint32, mi_uint4korr, 4, (double)); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_D_MBR_KORR(longlong, mi_sint8korr, 8, (double)); + break; + case HA_KEYTYPE_ULONGLONG: + RT_D_MBR_KORR(longlong, mi_sint8korr, 8, ulonglong2double); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_D_MBR_GET(float, mi_float4get, 4, (double)); + break; + case HA_KEYTYPE_DOUBLE: + RT_D_MBR_GET(double, mi_float8get, 8, (double)); + break; + case HA_KEYTYPE_END: + key_length= 0; + break; + default: + return 1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + } + return 0; +} + +#define RT_COMB_KORR(type, korr_func, store_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + amin= MY_MIN(amin, bmin); \ + amax= MY_MAX(amax, bmax); \ + 
store_func(c, amin); \ + store_func(c+len, amax); \ +} + +#define RT_COMB_GET(type, get_func, store_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + amin= MY_MIN(amin, bmin); \ + amax= MY_MAX(amax, bmax); \ + store_func(c, amin); \ + store_func(c+len, amax); \ +} + +/* + Creates common minimal bounding rectungle + for two input rectagnles a and b + Result is written to c +*/ + +int maria_rtree_combine_rect(const HA_KEYSEG *keyseg, const uchar* a, + const uchar* b, uchar* c, + uint key_length) +{ + for ( ; (int) key_length > 0 ; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_COMB_KORR(int8, mi_sint1korr, mi_int1store, 1); + break; + case HA_KEYTYPE_BINARY: + RT_COMB_KORR(uint8, mi_uint1korr, mi_int1store, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_COMB_KORR(int16, mi_sint2korr, mi_int2store, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_COMB_KORR(uint16, mi_uint2korr, mi_int2store, 2); + break; + case HA_KEYTYPE_INT24: + RT_COMB_KORR(int32, mi_sint3korr, mi_int3store, 3); + break; + case HA_KEYTYPE_UINT24: + RT_COMB_KORR(uint32, mi_uint3korr, mi_int3store, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_COMB_KORR(int32, mi_sint4korr, mi_int4store, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_COMB_KORR(uint32, mi_uint4korr, mi_int4store, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_COMB_KORR(longlong, mi_sint8korr, mi_int8store, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_COMB_KORR(ulonglong, mi_uint8korr, mi_int8store, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_COMB_GET(float, mi_float4get, mi_float4store, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_COMB_GET(double, mi_float8get, mi_float8store, 8); + break; + case HA_KEYTYPE_END: + return 0; + default: + return 1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= 
keyseg_length; + b+= keyseg_length; + c+= keyseg_length; + } + return 0; +} + + +#define RT_OVL_AREA_KORR(type, korr_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + amin= MY_MAX(amin, bmin); \ + amax= MY_MIN(amax, bmax); \ + if (amin >= amax) \ + return 0; \ + res *= amax - amin; \ +} + +#define RT_OVL_AREA_GET(type, get_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + amin= MY_MAX(amin, bmin); \ + amax= MY_MIN(amax, bmax); \ + if (amin >= amax) \ + return 0; \ + res *= amax - amin; \ +} + +/* +Calculates overlapping area of two MBRs a & b +*/ +double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b, + uint key_length) +{ + double res= 1; + for (; (int) key_length > 0 ; keyseg += 2) + { + uint32 keyseg_length; + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_OVL_AREA_KORR(int8, mi_sint1korr, 1); + break; + case HA_KEYTYPE_BINARY: + RT_OVL_AREA_KORR(uint8, mi_uint1korr, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_OVL_AREA_KORR(int16, mi_sint2korr, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_OVL_AREA_KORR(uint16, mi_uint2korr, 2); + break; + case HA_KEYTYPE_INT24: + RT_OVL_AREA_KORR(int32, mi_sint3korr, 3); + break; + case HA_KEYTYPE_UINT24: + RT_OVL_AREA_KORR(uint32, mi_uint3korr, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_OVL_AREA_KORR(int32, mi_sint4korr, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_OVL_AREA_KORR(uint32, mi_uint4korr, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_OVL_AREA_GET(float, mi_float4get, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_OVL_AREA_GET(double, mi_float8get, 8); + break; 
+ case HA_KEYTYPE_END: + return res; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } + return res; +} + +#define RT_AREA_INC_KORR(type, korr_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + a_area *= (((double)amax) - ((double)amin)); \ + loc_ab_area *= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ +} + +#define RT_AREA_INC_GET(type, get_func, len)\ +{\ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + a_area *= (((double)amax) - ((double)amin)); \ + loc_ab_area *= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ +} + +/* + Calculates MBR_AREA(a+b) - MBR_AREA(a) + Fills *ab_area. + Note: when 'a' and 'b' objects are far from each other, + the area increase can be really big, so this function + can return 'inf' as a result. 
+*/ + +double maria_rtree_area_increase(const HA_KEYSEG *keyseg, const uchar *a, + const uchar *b, + uint key_length, double *ab_area) +{ + double a_area= 1.0; + double loc_ab_area= 1.0; + + *ab_area= 1.0; + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + + if (keyseg->null_bit) /* Handle NULL part */ + return -1; + + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_AREA_INC_KORR(int8, mi_sint1korr, 1); + break; + case HA_KEYTYPE_BINARY: + RT_AREA_INC_KORR(uint8, mi_uint1korr, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_AREA_INC_KORR(int16, mi_sint2korr, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_AREA_INC_KORR(uint16, mi_uint2korr, 2); + break; + case HA_KEYTYPE_INT24: + RT_AREA_INC_KORR(int32, mi_sint3korr, 3); + break; + case HA_KEYTYPE_UINT24: + RT_AREA_INC_KORR(int32, mi_uint3korr, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_AREA_INC_KORR(int32, mi_sint4korr, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_AREA_INC_KORR(uint32, mi_uint4korr, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_AREA_INC_KORR(longlong, mi_sint8korr, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_AREA_INC_KORR(longlong, mi_sint8korr, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_AREA_INC_GET(float, mi_float4get, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_AREA_INC_GET(double, mi_float8get, 8); + break; + case HA_KEYTYPE_END: + goto safe_end; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } +safe_end: + *ab_area= loc_ab_area; + return loc_ab_area - a_area; +} + +#define RT_PERIM_INC_KORR(type, korr_func, len) \ +{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(a); \ + bmin= korr_func(b); \ + amax= korr_func(a+len); \ + bmax= korr_func(b+len); \ + a_perim+= (((double)amax) - ((double)amin)); \ + *ab_perim+= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ +} + +#define RT_PERIM_INC_GET(type, 
get_func, len)\ +{\ + type amin, amax, bmin, bmax; \ + get_func(amin, a); \ + get_func(bmin, b); \ + get_func(amax, a+len); \ + get_func(bmax, b+len); \ + a_perim+= (((double)amax) - ((double)amin)); \ + *ab_perim+= ((double)MY_MAX(amax, bmax) - (double)MY_MIN(amin, bmin)); \ +} + +/* +Calculates MBR_PERIMETER(a+b) - MBR_PERIMETER(a) +*/ +double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b, + uint key_length, double *ab_perim) +{ + double a_perim= 0.0; + + *ab_perim= 0.0; + for (; (int)key_length > 0; keyseg += 2) + { + uint32 keyseg_length; + + if (keyseg->null_bit) /* Handle NULL part */ + return -1; + + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_PERIM_INC_KORR(int8, mi_sint1korr, 1); + break; + case HA_KEYTYPE_BINARY: + RT_PERIM_INC_KORR(uint8, mi_uint1korr, 1); + break; + case HA_KEYTYPE_SHORT_INT: + RT_PERIM_INC_KORR(int16, mi_sint2korr, 2); + break; + case HA_KEYTYPE_USHORT_INT: + RT_PERIM_INC_KORR(uint16, mi_uint2korr, 2); + break; + case HA_KEYTYPE_INT24: + RT_PERIM_INC_KORR(int32, mi_sint3korr, 3); + break; + case HA_KEYTYPE_UINT24: + RT_PERIM_INC_KORR(int32, mi_uint3korr, 3); + break; + case HA_KEYTYPE_LONG_INT: + RT_PERIM_INC_KORR(int32, mi_sint4korr, 4); + break; + case HA_KEYTYPE_ULONG_INT: + RT_PERIM_INC_KORR(uint32, mi_uint4korr, 4); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8); + break; + case HA_KEYTYPE_ULONGLONG: + RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_PERIM_INC_GET(float, mi_float4get, 4); + break; + case HA_KEYTYPE_DOUBLE: + RT_PERIM_INC_GET(double, mi_float8get, 8); + break; + case HA_KEYTYPE_END: + return *ab_perim - a_perim; + default: + return -1; + } + keyseg_length= keyseg->length * 2; + key_length-= keyseg_length; + a+= keyseg_length; + b+= keyseg_length; + } + return *ab_perim - a_perim; +} + + +#define RT_PAGE_MBR_KORR(share, type, korr_func, store_func, len, to) \ 
+{ \ + type amin, amax, bmin, bmax; \ + amin= korr_func(k + inc); \ + amax= korr_func(k + inc + len); \ + k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag); \ + for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag)) \ +{ \ + bmin= korr_func(k + inc); \ + bmax= korr_func(k + inc + len); \ + if (amin > bmin) \ + amin= bmin; \ + if (amax < bmax) \ + amax= bmax; \ +} \ + store_func(to, amin); \ + to+= len; \ + store_func(to, amax); \ + to += len; \ + inc += 2 * len; \ +} + +#define RT_PAGE_MBR_GET(share, type, get_func, store_func, len, to) \ +{ \ + type amin, amax, bmin, bmax; \ + get_func(amin, k + inc); \ + get_func(amax, k + inc + len); \ + k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag); \ + for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag)) \ +{ \ + get_func(bmin, k + inc); \ + get_func(bmax, k + inc + len); \ + if (amin > bmin) \ + amin= bmin; \ + if (amax < bmax) \ + amax= bmax; \ +} \ + store_func(to, amin); \ + to+= len; \ + store_func(to, amax); \ + to+= len; \ + inc += 2 * len; \ +} + +/* + Calculates key page total MBR= MBR(key1) + MBR(key2) + ... + Stores into *to. 
+*/ +int maria_rtree_page_mbr(const HA_KEYSEG *keyseg, + MARIA_PAGE *page, + uchar *to, uint key_length) +{ + MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + uint inc= 0; + uint k_len= key_length; + uint nod_flag= page->node; + const uchar *k; + const uchar *last= rt_PAGE_END(page); + + for (; (int)key_length > 0; keyseg += 2) + { + key_length -= keyseg->length * 2; + + /* Handle NULL part */ + if (keyseg->null_bit) + { + return 1; + } + + k= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag); + + switch ((enum ha_base_keytype) keyseg->type) { + case HA_KEYTYPE_INT8: + RT_PAGE_MBR_KORR(share, int8, mi_sint1korr, mi_int1store, 1, to); + break; + case HA_KEYTYPE_BINARY: + RT_PAGE_MBR_KORR(share, uint8, mi_uint1korr, mi_int1store, 1, to); + break; + case HA_KEYTYPE_SHORT_INT: + RT_PAGE_MBR_KORR(share, int16, mi_sint2korr, mi_int2store, 2, to); + break; + case HA_KEYTYPE_USHORT_INT: + RT_PAGE_MBR_KORR(share, uint16, mi_uint2korr, mi_int2store, 2, to); + break; + case HA_KEYTYPE_INT24: + RT_PAGE_MBR_KORR(share, int32, mi_sint3korr, mi_int3store, 3, to); + break; + case HA_KEYTYPE_UINT24: + RT_PAGE_MBR_KORR(share, uint32, mi_uint3korr, mi_int3store, 3, to); + break; + case HA_KEYTYPE_LONG_INT: + RT_PAGE_MBR_KORR(share, int32, mi_sint4korr, mi_int4store, 4, to); + break; + case HA_KEYTYPE_ULONG_INT: + RT_PAGE_MBR_KORR(share, uint32, mi_uint4korr, mi_int4store, 4, to); + break; +#ifdef HAVE_LONG_LONG + case HA_KEYTYPE_LONGLONG: + RT_PAGE_MBR_KORR(share, longlong, mi_sint8korr, mi_int8store, 8, to); + break; + case HA_KEYTYPE_ULONGLONG: + RT_PAGE_MBR_KORR(share, ulonglong, mi_uint8korr, mi_int8store, 8, to); + break; +#endif + case HA_KEYTYPE_FLOAT: + RT_PAGE_MBR_GET(share, float, mi_float4get, mi_float4store, 4, to); + break; + case HA_KEYTYPE_DOUBLE: + RT_PAGE_MBR_GET(share, double, mi_float8get, mi_float8store, 8, to); + break; + case HA_KEYTYPE_END: + return 0; + default: + return 1; + } + } + return 0; +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git 
a/storage/maria/ma_rt_mbr.h b/storage/maria/ma_rt_mbr.h new file mode 100644 index 00000000..5583947c --- /dev/null +++ b/storage/maria/ma_rt_mbr.h @@ -0,0 +1,40 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef _rt_mbr_h +#define _rt_mbr_h + +#ifdef HAVE_RTREE_KEYS + +int maria_rtree_key_cmp(HA_KEYSEG *keyseg, const uchar *a, const uchar *b, + uint key_length, uint32 nextflag); +int maria_rtree_combine_rect(const HA_KEYSEG *keyseg, + const uchar *, const uchar *, uchar*, + uint key_length); +double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar*, uint key_length); +int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a, + uint key_length, double *res); +double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar *a, uchar *b, + uint key_length); +double maria_rtree_area_increase(const HA_KEYSEG *keyseg, const uchar *a, + const uchar *b, + uint key_length, double *ab_area); +double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b, + uint key_length, double *ab_perim); +int maria_rtree_page_mbr(const HA_KEYSEG *keyseg, MARIA_PAGE *page, + uchar *key, uint key_length); +#endif /*HAVE_RTREE_KEYS*/ +#endif /* _rt_mbr_h */ diff --git a/storage/maria/ma_rt_split.c b/storage/maria/ma_rt_split.c new file mode 100644 index 00000000..a0acb9ce 
--- /dev/null +++ b/storage/maria/ma_rt_split.c @@ -0,0 +1,550 @@ +/* Copyright (C) 2006 MySQL AB & Alexey Botchkov & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_key_recover.h" + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" +#include "ma_rt_key.h" +#include "ma_rt_mbr.h" + +typedef struct +{ + double square; + int n_node; + const uchar *key; + double *coords; +} SplitStruct; + +inline static double *reserve_coords(double **d_buffer, int n_dim) +{ + double *coords= *d_buffer; + (*d_buffer)+= n_dim * 2; + return coords; +} + +static void mbr_join(double *a, const double *b, int n_dim) +{ + double *end= a + n_dim * 2; + do + { + if (a[0] > b[0]) + a[0]= b[0]; + + if (a[1] < b[1]) + a[1]= b[1]; + + a+= 2; + b+= 2; + } while (a != end); +} + +/* +Counts the square of mbr which is a join of a and b +*/ +static double mbr_join_square(const double *a, const double *b, int n_dim) +{ + const double *end= a + n_dim * 2; + double square= 1.0; + do + { + square *= + ((a[1] < b[1]) ? b[1] : a[1]) - ((a[0] > b[0]) ? 
b[0] : a[0]); + + a+= 2; + b+= 2; + } while (a != end); + + return square; +} + +static double count_square(const double *a, int n_dim) +{ + const double *end= a + n_dim * 2; + double square= 1.0; + do + { + square *= a[1] - a[0]; + a+= 2; + } while (a != end); + return square; +} + +inline static void copy_coords(double *dst, const double *src, int n_dim) +{ + memcpy(dst, src, sizeof(double) * (n_dim * 2)); +} + +/** + Select two nodes to collect group upon. + + Note that such function uses 'double' arithmetic so may behave differently + on different platforms/builds. There are others in this file. +*/ +static void pick_seeds(SplitStruct *node, int n_entries, + SplitStruct **seed_a, SplitStruct **seed_b, int n_dim) +{ + SplitStruct *cur1; + SplitStruct *lim1= node + (n_entries - 1); + SplitStruct *cur2; + SplitStruct *lim2= node + n_entries; + + double max_d= -DBL_MAX; + double d; + + for (cur1= node; cur1 < lim1; cur1++) + { + for (cur2=cur1 + 1; cur2 < lim2; cur2++) + { + + d= mbr_join_square(cur1->coords, cur2->coords, n_dim) - cur1->square - + cur2->square; + if (d > max_d) + { + max_d= d; + *seed_a= cur1; + *seed_b= cur2; + } + } + } +} + +/* +Select next node and group where to add +*/ +static void pick_next(SplitStruct *node, int n_entries, double *g1, double *g2, + SplitStruct **choice, int *n_group, int n_dim) +{ + SplitStruct *cur= node; + SplitStruct *end= node + n_entries; + + double max_diff= -DBL_MAX; + + for (; cur < end; cur++) + { + double diff; + double abs_diff; + + if (cur->n_node) + { + continue; + } + + diff= mbr_join_square(g1, cur->coords, n_dim) - + mbr_join_square(g2, cur->coords, n_dim); + + abs_diff= fabs(diff); + if (abs_diff > max_diff) + { + max_diff= abs_diff; + *n_group= 1 + (diff > 0); + *choice= cur; + } + } +} + +/* +Mark not-in-group entries as n_group +*/ +static void mark_all_entries(SplitStruct *node, int n_entries, int n_group) +{ + SplitStruct *cur= node; + SplitStruct *end= node + n_entries; + + for (; cur < end; cur++) + 
{ + if (cur->n_node) + { + continue; + } + cur->n_node= n_group; + } +} + +static int split_maria_rtree_node(SplitStruct *node, int n_entries, + int all_size, /* Total key's size */ + int key_size, + int min_size, /* Minimal group size */ + int size1, int size2 /* initial group sizes */, + double **d_buffer, int n_dim) +{ + SplitStruct *cur; + SplitStruct *UNINIT_VAR(a); + SplitStruct *UNINIT_VAR(b); + double *g1= reserve_coords(d_buffer, n_dim); + double *g2= reserve_coords(d_buffer, n_dim); + SplitStruct *UNINIT_VAR(next); + int UNINIT_VAR(next_node); + int i; + SplitStruct *end= node + n_entries; + + if (all_size < min_size * 2) + { + return 1; + } + + cur= node; + for (; cur < end; cur++) + { + cur->square= count_square(cur->coords, n_dim); + cur->n_node= 0; + } + + pick_seeds(node, n_entries, &a, &b, n_dim); + a->n_node= 1; + b->n_node= 2; + + + copy_coords(g1, a->coords, n_dim); + size1+= key_size; + copy_coords(g2, b->coords, n_dim); + size2+= key_size; + + + for (i=n_entries - 2; i>0; --i) + { + if (all_size - (size2 + key_size) < min_size) /* Can't write into group 2 */ + { + mark_all_entries(node, n_entries, 1); + break; + } + + if (all_size - (size1 + key_size) < min_size) /* Can't write into group 1 */ + { + mark_all_entries(node, n_entries, 2); + break; + } + + pick_next(node, n_entries, g1, g2, &next, &next_node, n_dim); + if (next_node == 1) + { + size1+= key_size; + mbr_join(g1, next->coords, n_dim); + } + else + { + size2+= key_size; + mbr_join(g2, next->coords, n_dim); + } + next->n_node= next_node; + } + + return 0; +} + + +/** + Logs key reorganization done in a split page (new page is logged elsewhere). 
+ + The effect of a split on the split page is three changes: + - some piece of the page move to different places inside this page (we are + not interested here in the pieces which move to the new page) + - the key is inserted into the page or not (could be in the new page) + - page is shrunk + All this is uniquely determined by a few parameters: + - the key (starting at 'key-nod_flag', for 'full_length' bytes + (maria_rtree_split_page() seems to depend on its parameters key&key_length + but in fact it reads more (to the left: nod_flag, and to the right: + full_length) + - the binary content of the page + - some variables in the share + - double arithmetic, which is unpredictable from machine to machine and + from build to build (see pick_seeds() above: it has a comparison between + double-s 'if (d > max_d)' so the comparison can go differently from machine + to machine or build to build, it has happened in real life). + If one day we use precision-math instead of double-math, in GIS, then the + last parameter would become constant accross machines and builds and we + could some cheap logging: just log the few parameters above. + Until then, we log the list of memcpy() operations (fortunately, we often do + not have to log the source bytes, as they can be found in the page before + applying the REDO; the only source bytes to log are the key), the key if it + was inserted into this page, and the shrinking. 
+ + @param info table + @param page page's offset in the file + @param buff content of the page (post-split) + @param key_with_nod_flag pointer to key-nod_flag + @param full_length length of (key + (nod_flag (if node) or rowid (if + leaf))) + @param log_internal_copy encoded list of mempcy() operations done on + split page, having their source in the page + @param log_internal_copy_length length of above list, in bytes + @param log_key_copy operation describing the key's copy, or NULL if the + inserted key was not put into the page (was put in + new page, so does not have to be logged here) + @param length_diff by how much the page has shrunk during split +*/ + +static my_bool _ma_log_rt_split(MARIA_PAGE *page, + const uchar *key_with_nod_flag, + uint full_length, + const uchar *log_internal_copy, + uint log_internal_copy_length, + const uchar *log_key_copy, + uint length_diff) +{ + MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 1 + 2 + 1 + 2 + 2 + 7], + *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6]; + uint translog_parts, extra_length= 0; + my_off_t page_pos; + DBUG_ENTER("_ma_log_rt_split"); + DBUG_PRINT("enter", ("page: %p", page)); + + DBUG_ASSERT(share->now_transactional); + page_pos= page->pos / share->block_size; + page_store(log_data + FILEID_STORE_SIZE, page_pos); + log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE; + log_pos[0]= KEY_OP_DEL_SUFFIX; + log_pos++; + DBUG_ASSERT((int)length_diff > 0); + int2store(log_pos, length_diff); + log_pos+= 2; + log_pos[0]= KEY_OP_MULTI_COPY; + log_pos++; + int2store(log_pos, full_length); + log_pos+= 2; + int2store(log_pos, log_internal_copy_length); + log_pos+= 2; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data) - 7; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_internal_copy; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 
log_internal_copy_length; + translog_parts= 2; + if (log_key_copy != NULL) /* need to store key into record */ + { + log_array[TRANSLOG_INTERNAL_PARTS + 2].str= log_key_copy; + log_array[TRANSLOG_INTERNAL_PARTS + 2].length= 1 + 2 + 1 + 2; + log_array[TRANSLOG_INTERNAL_PARTS + 3].str= key_with_nod_flag; + log_array[TRANSLOG_INTERNAL_PARTS + 3].length= full_length; + extra_length= 1 + 2 + 1 + 2 + full_length; + translog_parts+= 2; + } + + _ma_log_key_changes(page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + page->org_size= page->size; + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) ((log_pos - log_data) + + log_internal_copy_length + + extra_length), + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + +/** + 0 ok; the created page is put into page cache; the shortened one is not (up + to the caller to do it) + 1 or -1: error. + If new_page_offs==NULL, won't create new page (for redo phase). +*/ + +int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page, + my_off_t *new_page_offs) +{ + MARIA_HA *info= page->info; + MARIA_SHARE *share= info->s; + const my_bool transactional= share->now_transactional; + int n1, n2; /* Number of items in groups */ + SplitStruct *task; + SplitStruct *cur; + SplitStruct *stop; + double *coord_buf; + double *next_coord; + int n_dim; + uchar *source_cur, *cur1, *cur2; + uchar *new_page_buff= 0, *log_internal_copy, *log_internal_copy_ptr, + *log_key_copy= NULL; + int err_code= 0; + uint new_page_length; + uint nod_flag= page->node; + uint org_length= page->size; + uint full_length= key->data_length + (nod_flag ? 
nod_flag : + key->ref_length); + uint key_data_length= key->data_length; + int max_keys= ((org_length - share->keypage_header) / (full_length)); + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_KEYDEF *keyinfo= key->keyinfo; + my_bool new_page_buff_alloced= 0, coord_buf_alloced= 0; + DBUG_ENTER("maria_rtree_split_page"); + DBUG_PRINT("rtree", ("splitting block")); + + n_dim= keyinfo->keysegs / 2; + + alloc_on_stack(*info->stack_end_ptr, coord_buf, coord_buf_alloced, + (n_dim * 2 * sizeof(double) * (max_keys + 1 + 4) + + sizeof(SplitStruct) * (max_keys + 1))); + if (!coord_buf) + DBUG_RETURN(-1); + + task= (SplitStruct *)(coord_buf + n_dim * 2 * (max_keys + 1 + 4)); + + next_coord= coord_buf; + + stop= task + max_keys; + source_cur= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag); + + for (cur= task; + cur < stop; + cur++, source_cur= rt_PAGE_NEXT_KEY(share, source_cur, key_data_length, + nod_flag)) + { + cur->coords= reserve_coords(&next_coord, n_dim); + cur->key= source_cur; + maria_rtree_d_mbr(keyinfo->seg, source_cur, key_data_length, cur->coords); + } + + cur->coords= reserve_coords(&next_coord, n_dim); + maria_rtree_d_mbr(keyinfo->seg, key->data, key_data_length, cur->coords); + cur->key= key->data; + + + if (split_maria_rtree_node(task, max_keys + 1, + page->size + full_length + 2, + full_length, + rt_PAGE_MIN_SIZE(keyinfo->block_length), + 2, 2, &next_coord, n_dim)) + { + err_code= 1; + goto split_err; + } + + /* Allocate buffer for new page and piece of log record */ + alloc_on_stack(*info->stack_end_ptr, new_page_buff, new_page_buff_alloced, + (keyinfo->block_length + + (transactional ? 
max_keys * (2 + 2) + 1 + 2 + 1 + 2 : 0))); + if (!new_page_buff) + { + err_code= -1; + goto split_err; + } + + log_internal_copy= log_internal_copy_ptr= new_page_buff + + keyinfo->block_length; + bzero(new_page_buff, share->block_size); + + stop= task + (max_keys + 1); + cur1= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag); + cur2= rt_PAGE_FIRST_KEY(share, new_page_buff, nod_flag); + + n1= n2= 0; + for (cur= task; cur < stop; cur++) + { + uchar *to; + const uchar *cur_key= cur->key; + my_bool log_this_change; + DBUG_ASSERT(log_key_copy == NULL); + if (cur->n_node == 1) + { + to= cur1; + cur1= rt_PAGE_NEXT_KEY(share, cur1, key_data_length, nod_flag); + n1++; + log_this_change= transactional; + } + else + { + to= cur2; + cur2= rt_PAGE_NEXT_KEY(share, cur2, key_data_length, nod_flag); + n2++; + log_this_change= FALSE; + } + if (to != cur_key) + { + uchar *to_with_nod_flag= to - nod_flag; + const uchar *cur_key_with_nod_flag= cur_key - nod_flag; + memcpy(to_with_nod_flag, cur_key_with_nod_flag, full_length); + if (log_this_change) + { + size_t to_with_nod_flag_offs= to_with_nod_flag - page->buff; + if (likely(cur_key != key->data)) + { + /* this memcpy() is internal to the page (source in the page) */ + size_t cur_key_with_nod_flag_offs= cur_key_with_nod_flag - page->buff; + int2store(log_internal_copy_ptr, to_with_nod_flag_offs); + log_internal_copy_ptr+= 2; + int2store(log_internal_copy_ptr, cur_key_with_nod_flag_offs); + log_internal_copy_ptr+= 2; + } + else + { + /* last iteration, and this involves *key: source is external */ + log_key_copy= log_internal_copy_ptr; + log_key_copy[0]= KEY_OP_OFFSET; + int2store(log_key_copy + 1, to_with_nod_flag_offs); + log_key_copy[3]= KEY_OP_CHANGE; + int2store(log_key_copy + 4, full_length); + /* _ma_log_rt_split() will store *key, right after */ + } + } + } + } + { /* verify that above loop didn't touch header bytes */ + uint i; + for (i= 0; i < share->keypage_header; i++) + DBUG_ASSERT(new_page_buff[i]==0); + } + + if 
(nod_flag) + _ma_store_keypage_flag(share, new_page_buff, KEYPAGE_FLAG_ISNOD); + _ma_store_keynr(share, new_page_buff, keyinfo->key_nr); + new_page_length= share->keypage_header + n2 * full_length; + _ma_store_page_used(share, new_page_buff, new_page_length); + page->size= share->keypage_header + n1 * full_length; + page_store_size(share, page); + + if ((*new_page_offs= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) + err_code= -1; + else + { + MARIA_PAGE new_page; + _ma_page_setup(&new_page, info, keyinfo, *new_page_offs, new_page_buff); + + if (transactional && + ( /* log change to split page */ + _ma_log_rt_split(page, key->data - nod_flag, + full_length, log_internal_copy, + (uint)(log_internal_copy_ptr - log_internal_copy), + log_key_copy, (uint)(org_length - page->size)) || + /* and to new page */ + _ma_log_new(&new_page, 0))) + err_code= -1; + + if (_ma_write_keypage(&new_page, page_link->write_lock, + DFLT_INIT_HITS)) + err_code= -1; + } + DBUG_PRINT("rtree", ("split new block: %lu", (ulong) *new_page_offs)); + +split_err: + stack_alloc_free(new_page_buff, new_page_buff_alloced); + stack_alloc_free(coord_buf, coord_buf_alloced); + DBUG_RETURN(err_code); +} + +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_rt_test.c b/storage/maria/ma_rt_test.c new file mode 100644 index 00000000..3af7d938 --- /dev/null +++ b/storage/maria/ma_rt_test.c @@ -0,0 +1,696 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Testing of the basic functions of a MARIA rtree table */ +/* Written by Alex Barkov who has a shared copyright to this code */ + + +#include "maria_def.h" +#include "ma_control_file.h" +#include "ma_loghandler.h" +#include "ma_checkpoint.h" +#include "trnman.h" +#include <my_getopt.h> + +#ifdef HAVE_RTREE_KEYS + +#include "ma_rt_index.h" + +#define MAX_REC_LENGTH 1024 +#define ndims 2 +#define KEYALG HA_KEY_ALG_RTREE + +static int read_with_pos(MARIA_HA * file); +static void create_record(uchar *record,uint rownr); +static void create_record1(uchar *record,uint rownr); +static void print_record(uchar * record,my_off_t offs,const char * tail); +static int run_test(const char *filename); +static void get_options(int argc, char *argv[]); +static void usage(); + +static double rt_data[]= +{ + /*1*/ 0,10,0,10, + /*2*/ 5,15,0,10, + /*3*/ 0,10,5,15, + /*4*/ 10,20,10,20, + /*5*/ 0,10,0,10, + /*6*/ 5,15,0,10, + /*7*/ 0,10,5,15, + /*8*/ 10,20,10,20, + /*9*/ 0,10,0,10, + /*10*/ 5,15,0,10, + /*11*/ 0,10,5,15, + /*12*/ 10,20,10,20, + /*13*/ 0,10,0,10, + /*14*/ 5,15,0,10, + /*15*/ 0,10,5,15, + /*16*/ 10,20,10,20, + /*17*/ 5,15,0,10, + /*18*/ 0,10,5,15, + /*19*/ 10,20,10,20, + /*20*/ 0,10,0,10, + + /*1*/ 100,110,0,10, + /*2*/ 105,115,0,10, + /*3*/ 100,110,5,15, + /*4*/ 110,120,10,20, + /*5*/ 100,110,0,10, + /*6*/ 105,115,0,10, + /*7*/ 100,110,5,15, + /*8*/ 110,120,10,20, + /*9*/ 100,110,0,10, + /*10*/ 105,115,0,10, + /*11*/ 100,110,5,15, + /*12*/ 110,120,10,20, + /*13*/ 100,110,0,10, + /*14*/ 105,115,0,10, + /*15*/ 100,110,5,15, + /*16*/ 110,120,10,20, + /*17*/ 105,115,0,10, + /*18*/ 100,110,5,15, + /*19*/ 110,120,10,20, + /*20*/ 100,110,0,10, + -1 +}; + +static int testflag, checkpoint, create_flag; +static my_bool silent, transactional, 
die_in_middle_of_transaction, + opt_versioning; +static enum data_file_type record_type= DYNAMIC_RECORD; + +int main(int argc, char *argv[]) +{ + char buff[FN_REFLEN]; + MY_INIT(argv[0]); + maria_data_root= "."; + get_options(argc, argv); + /* Maria requires that we always have a page cache */ + if (maria_init() || + (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0, + maria_block_size, 0, MY_WME) == 0) || + ma_control_file_open(TRUE, TRUE, TRUE) || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0) || + (transactional && (trnman_init(0) || ma_checkpoint_init(0)))) + { + fprintf(stderr, "Error in initialization\n"); + exit(1); + } + + exit(run_test(fn_format(buff, "test1", maria_data_root, "", MYF(0)))); +} + + +static int run_test(const char *filename) +{ + MARIA_HA *file; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + MARIA_COLUMNDEF recinfo[20]; + MARIA_KEYDEF keyinfo[20]; + HA_KEYSEG keyseg[20]; + key_range range; + + int opt_unique=0; + int key_type=HA_KEYTYPE_DOUBLE; + int key_length=8; + int null_fields=0; + int nrecords=sizeof(rt_data)/(sizeof(double)*4);/* 40 */ + int uniques=0; + int i, max_i; + int error; + int row_count=0; + uchar record[MAX_REC_LENGTH]; + uchar read_record[MAX_REC_LENGTH]; + int upd= 10; + ha_rows hrows; + page_range pages; + + bzero(&uniquedef, sizeof(uniquedef)); + bzero(&create_info, sizeof(create_info)); + bzero(recinfo, sizeof(recinfo)); + bzero(keyinfo, sizeof(keyinfo)); + bzero(keyseg, sizeof(keyseg)); + + /* Define a column for NULLs and DEL markers*/ + + recinfo[0].type=FIELD_NORMAL; + recinfo[0].length=1; /* For NULL bits */ + + /* Define 2*ndims columns for coordinates*/ + + for (i=1; i<=2*ndims ;i++) + { + recinfo[i].type=FIELD_NORMAL; + recinfo[i].length=key_length; + } + + /* Define a key with 2*ndims segments */ + + 
keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=2*ndims; + keyinfo[0].flag=0; + keyinfo[0].key_alg=KEYALG; + + for (i=0; i<2*ndims; i++) + { + keyinfo[0].seg[i].type= key_type; + keyinfo[0].seg[i].flag=0; /* Things like HA_REVERSE_SORT */ + keyinfo[0].seg[i].start= (key_length*i)+1; + keyinfo[0].seg[i].length=key_length; + keyinfo[0].seg[i].null_bit= null_fields ? 2 : 0; + keyinfo[0].seg[i].null_pos=0; + keyinfo[0].seg[i].language=default_charset_info->number; + } + + if (!silent) + printf("- Creating isam-file\n"); + + create_info.max_rows=10000000; + create_info.transactional= transactional; + + if (maria_create(filename, + record_type, + 1, /* keys */ + keyinfo, + 1+2*ndims+opt_unique, /* columns */ + recinfo,uniques,&uniquedef,&create_info,create_flag)) + goto err; + + if (!silent) + printf("- Open isam-file\n"); + + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED,0))) + goto err; + maria_begin(file); + if (opt_versioning) + maria_versioning(file, 1); + if (testflag == 1) + goto end; + if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + if (!silent) + printf("- Writing key:s\n"); + + for (i=0; i<nrecords; i++ ) + { + create_record(record,i); + error=maria_write(file,record); + print_record(record,maria_position(file),"\n"); + if (!error) + { + row_count++; + } + else + { + fprintf(stderr, "maria_write: %d\n", error); + goto err; + } + } + + if (maria_scan_init(file)) + { + fprintf(stderr, "maria_scan_init failed\n"); + goto err; + } + if ((error=read_with_pos(file))) + goto err; + maria_scan_end(file); + + if (!silent) + printf("- Reading rows with key\n"); + + for (i=0 ; i < nrecords ; i++) + { + my_errno=0; + create_record(record,i); + + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rkey(file,read_record,0,record+1,HA_WHOLE_KEY,HA_READ_MBR_EQUAL); + + if (error && error!=HA_ERR_KEY_NOT_FOUND) + { + fprintf(stderr," maria_rkey: %3d errno: %3d\n",error,my_errno); + goto err; + } + if (error == 
HA_ERR_KEY_NOT_FOUND) + { + print_record(record,maria_position(file)," NOT FOUND\n"); + continue; + } + print_record(read_record,maria_position(file),"\n"); + } + + if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (testflag == 2) + goto end; + + if (!silent) + printf("- Deleting rows\n"); + if (maria_scan_init(file)) + { + fprintf(stderr, "maria_scan_init failed\n"); + goto err; + } + + for (i=0; i < nrecords/4; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_scan(file,read_record); + if (error) + { + fprintf(stderr, "pos: %2d maria_rrnd: %3d errno: %3d\n", i, error, + my_errno); + goto err; + } + print_record(read_record,maria_position(file),"\n"); + + error=maria_delete(file,read_record); + if (error) + { + fprintf(stderr, "pos: %2d maria_delete: %3d errno: %3d\n", i, error, + my_errno); + goto err; + } + } + maria_scan_end(file); + + if (testflag == 3) + goto end; + if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (!silent) + printf("- Updating rows with position\n"); + if (maria_scan_init(file)) + { + fprintf(stderr, "maria_scan_init failed\n"); + goto err; + } + + /* We are looking for nrecords-necords/2 non-deleted records */ + for (i=0, max_i= nrecords - nrecords/2; i < max_i ; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_scan(file,read_record); + if (error) + { + if (error==HA_ERR_RECORD_DELETED) + { + if (!silent) + printf("found deleted record\n"); + /* + In BLOCK_RECORD format, maria_scan() never returns deleted records, + while in DYNAMIC format it can. 
Don't count such record: + */ + max_i++; + continue; + } + fprintf(stderr, "pos: %2d maria_rrnd: %3d errno: %3d\n",i , error, + my_errno); + goto err; + } + print_record(read_record,maria_position(file),""); + create_record1(record,i+nrecords*upd); + if (!silent) + printf("\t-> "); + print_record(record,maria_position(file),"\n"); + error=maria_update(file,read_record,record); + if (error) + { + fprintf(stderr, "pos: %2d maria_update: %3d errno: %3d\n",i, error, + my_errno); + goto err; + } + } + + if (testflag == 4) + goto end; + if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (maria_scan_init(file)) + { + fprintf(stderr, "maria_scan_init failed\n"); + goto err; + } + if ((error=read_with_pos(file))) + goto err; + maria_scan_end(file); + + if (!silent) + printf("- Test maria_rkey then a sequence of maria_rnext_same\n"); + + create_record(record, nrecords*4/5); + print_record(record,0," search for\n"); + + if ((error=maria_rkey(file,read_record,0,record+1,HA_WHOLE_KEY, + HA_READ_MBR_INTERSECT))) + { + fprintf(stderr, "maria_rkey: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rkey\n"); + row_count=1; + + for (;;) + { + if ((error=maria_rnext_same(file,read_record))) + { + if (error==HA_ERR_END_OF_FILE) + break; + fprintf(stderr, "maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext_same\n"); + row_count++; + } + if (!silent) + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_rfirst then a sequence of maria_rnext\n"); + + error=maria_rfirst(file,read_record,0); + if (error) + { + fprintf(stderr, "maria_rfirst: %3d errno: %3d\n",error,my_errno); + goto err; + } + row_count=1; + print_record(read_record,maria_position(file)," maria_frirst\n"); + + for (i=0;i<nrecords;i++) + { + if ((error=maria_rnext(file,read_record,0))) + { + if (error==HA_ERR_END_OF_FILE) + break; + 
fprintf(stderr, "maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext\n"); + row_count++; + } + if (!silent) + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_records_in_range()\n"); + + create_record1(record, nrecords*4/5); + print_record(record,0,"\n"); + + range.key= record+1; + range.length= 1000; /* Big enough */ + range.flag= HA_READ_MBR_INTERSECT; + hrows= maria_records_in_range(file,0, &range, (key_range*) 0, &pages); + if (!silent) + printf(" %ld rows\n", (long) hrows); + +end: + maria_scan_end(file); + if (die_in_middle_of_transaction) + { + /* see similar code in ma_test2.c for comments */ + switch (die_in_middle_of_transaction) { + case 1: + _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE); + break; + case 2: + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + case 3: + break; + case 4: + _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE, + FLUSH_RELEASE); + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + } + if (!silent) + printf("Dying on request without maria_commit()/maria_close()\n"); + exit(0); + } + if (maria_commit(file)) + goto err; + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return 0; + +err: + fprintf(stderr, "got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ +} + + + +static int read_with_pos (MARIA_HA * file) +{ + int error; + int i; + uchar read_record[MAX_REC_LENGTH]; + + if (!silent) + printf("- Reading rows with position\n"); + for (i=0;;i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_scan(file,read_record); + if (error) + { + if (error==HA_ERR_END_OF_FILE) + break; + if (error==HA_ERR_RECORD_DELETED) + continue; + fprintf(stderr, "pos: %2d maria_rrnd: %3d errno: %3d\n", i, error, + my_errno); + return error; + } + 
print_record(read_record,maria_position(file),"\n"); + } + return 0; +} + + +#ifdef NOT_USED +static void bprint_record(char * record, + my_off_t offs __attribute__((unused)), + const char * tail) +{ + int i; + char * pos; + if (silent) + return; + i=(unsigned char)record[0]; + printf("%02X ",i); + + for( pos=record+1, i=0; i<32; i++,pos++){ + int b=(unsigned char)*pos; + printf("%02X",b); + } + printf("%s",tail); +} +#endif + + +static void print_record(uchar *record, + my_off_t offs __attribute__((unused)), + const char * tail) +{ + int i; + uchar *pos; + double c; + + if (silent) + return; + printf(" rec=(%d)",(unsigned char)record[0]); + for ( pos=record+1, i=0; i<2*ndims; i++) + { + memcpy(&c,pos,sizeof(c)); + float8get(c,pos); + printf(" %.14g ",c); + pos+=sizeof(c); + } + printf("pos=%ld",(long int)offs); + printf("%s",tail); +} + + + +static void create_record1(uchar *record, uint rownr) +{ + int i; + uchar *pos; + double c=rownr+10; + + bzero((char*) record,MAX_REC_LENGTH); + record[0]=0x01; /* DEL marker */ + + for ( pos=record+1, i=0; i<2*ndims; i++) + { + memcpy(pos,&c,sizeof(c)); + float8store(pos,c); + pos+=sizeof(c); + } +} + +#ifdef NOT_USED + +static void create_record0(char *record,uint rownr) +{ + int i; + char * pos; + double c=rownr+10; + double c0=0; + + bzero((char*) record,MAX_REC_LENGTH); + record[0]=0x01; /* DEL marker */ + + for ( pos=record+1, i=0; i<ndims; i++) + { + memcpy(pos,&c0,sizeof(c0)); + float8store(pos,c0); + pos+=sizeof(c0); + memcpy(pos,&c,sizeof(c)); + float8store(pos,c); + pos+=sizeof(c); + } +} + +#endif + +static void create_record(uchar *record, uint rownr) +{ + int i; + uchar *pos; + double *data= rt_data+rownr*4; + record[0]=0x01; /* DEL marker */ + for ( pos=record+1, i=0; i<ndims*2; i++) + { + float8store(pos,data[i]); + pos+=8; + } +} + + +static struct my_option my_long_options[] = +{ + {"checkpoint", 'H', "Checkpoint at specified stage", (uchar**) &checkpoint, + (uchar**) &checkpoint, 0, GET_INT, REQUIRED_ARG, 0, 
0, 0, 0, 0, 0}, + {"checksum", 'c', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', "Undocumented", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"help", '?', "Display help and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"datadir", 'h', "Path to the database root.", (char**) &maria_data_root, + (char**) &maria_data_root, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"row-fixed-size", 'S', "Fixed size records", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"rows-in-block", 'M', "Store rows in block format", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"silent", 's', "Undocumented", + (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, + 0, 0}, + {"testflag", 't', "Stop test at specified stage", (uchar**) &testflag, + (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"test-undo", 'A', + "Abort hard. Used for testing recovery with undo", + (uchar**) &die_in_middle_of_transaction, + (uchar**) &die_in_middle_of_transaction, + 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"transactional", 'T', + "Test in transactional mode. 
(Only works with block format)", + (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"versioning", 'C', "Use row versioning (only works with block format)", + (uchar**) &opt_versioning, (uchar**) &opt_versioning, 0, GET_BOOL, + NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static my_bool +get_one_option(const struct my_option *opt, + const char *argument __attribute__((unused)), + const char *filename __attribute__((unused))) +{ + switch(opt->id) { + case 'c': + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + break; + case 'M': + record_type= BLOCK_RECORD; + break; + case 'S': + record_type= STATIC_RECORD; + break; + case '#': + DBUG_PUSH(argument); + break; + case '?': + usage(); + exit(1); + } + return 0; +} + + +/* Read options */ + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + + return; +} /* get options */ + + +static void usage() +{ + printf("Usage: %s [options]\n\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} + +#include "ma_check_standalone.h" + +#else +int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused))) +{ + exit(0); +} +#endif /*HAVE_RTREE_KEYS*/ diff --git a/storage/maria/ma_scan.c b/storage/maria/ma_scan.c new file mode 100644 index 00000000..5f2945a3 --- /dev/null +++ b/storage/maria/ma_scan.c @@ -0,0 +1,75 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Read through all rows sequentially */ + +#include "maria_def.h" +/* Start a scan: set cur_row.nextpos to info->s->pack.header_length, reset lastinx to -1, flush the record write cache if WRITE_CACHE_USED is set, then call info->s->scan_init. Returns 0 on success, my_errno if the flush or the scan_init call fails. */ +int maria_scan_init(register MARIA_HA *info) +{ + DBUG_ENTER("maria_scan_init"); + + info->cur_row.nextpos= info->s->pack.header_length; /* Read first record */ + info->lastinx= -1; /* Can't forward or backward */ + if (info->opt_flag & WRITE_CACHE_USED && flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + + if ((*info->s->scan_init)(info)) + DBUG_RETURN(my_errno); + DBUG_RETURN(0); +} + +/* + Read the next row of a scan (the row at info->cur_row.nextpos). + + SYNOPSIS + maria_scan() + info Maria handler + record Read data here + + RETURN + 0 ok + HA_ERR_END_OF_FILE End of file + HA_ERR_RECORD_DELETED Record was deleted (can only happen for static rec) + # Error code +*/ + +int maria_scan(MARIA_HA *info, uchar *record) +{ + DBUG_ENTER("maria_scan"); + /* Init all but update-flag */ + info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); + DBUG_RETURN((*info->s->scan)(info, record, info->cur_row.nextpos, 1)); +} + +/* End a scan by calling info->s->scan_end. */ +void maria_scan_end(MARIA_HA *info) +{ + (*info->s->scan_end)(info); +} + +/* Store info->cur_row.lastpos in *lastpos. Always returns 0. */ +int _ma_def_scan_remember_pos(MARIA_HA *info, MARIA_RECORD_POS *lastpos) +{ + *lastpos= info->cur_row.lastpos; + return 0; +} + +/* Set info->cur_row.nextpos to lastpos so that the next maria_scan() reads from there. Always returns 0. */ +int _ma_def_scan_restore_pos(MARIA_HA *info, MARIA_RECORD_POS lastpos) +{ + info->cur_row.nextpos= lastpos; + return 0; +} diff --git a/storage/maria/ma_search.c b/storage/maria/ma_search.c new file mode 100644 index 00000000..e6f8b6bc --- /dev/null +++ b/storage/maria/ma_search.c @@ -0,0 +1,2395 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB 
& TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* key handling functions */ + +#include "ma_fulltext.h" +#include "m_ctype.h" + +static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key, + uint32 nextflag, register my_off_t pos, + MARIA_PINNED_PAGE **res_page_link, + uchar **res_page_buff); +static my_bool _ma_get_prev_key(MARIA_KEY *key, MARIA_PAGE *ma_page, + uchar *keypos); + + +/* Check that new index is ok */ + +int _ma_check_index(MARIA_HA *info, int inx) +{ + if (inx < 0 || ! 
maria_is_key_active(info->s->state.key_map, inx)) + { + my_errno=HA_ERR_WRONG_INDEX; + return -1; + } + if (info->lastinx != inx) /* Index changed */ + { + info->lastinx = inx; + info->last_key.keyinfo= info->s->keyinfo + inx; + info->last_key.flag= 0; + info->page_changed=1; + info->update= ((info->update & (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED)) | + HA_STATE_NEXT_FOUND | HA_STATE_PREV_FOUND); + } + if ((info->opt_flag & WRITE_CACHE_USED) && flush_io_cache(&info->rec_cache)) + { + if (unlikely(!my_errno)) + my_errno= HA_ERR_INTERNAL_ERROR; /* Impossible */ + return(-1); + } + if (unlikely(maria_is_crashed(info))) + { + my_errno= HA_ERR_CRASHED; + return(-1); + } + + return(inx); +} /* _ma_check_index */ + + +/** + @brief Search after row by a key + + @param info Maria handler + @param key Key to search for + @param nextflag Search flags. With SEARCH_SAVE_BUFF the page where the + key was found is copied to info->keyread_buff and the + key positions are saved for a later read next / previous + @param pos Position of the key page to start the search at + + @note + Position to row is stored in info->cur_row.lastpos + All pinned pages are unpinned before returning + + @return + @retval 0 ok (key found) + @retval -1 Not found + @retval 1 If one should continue search on higher level +*/ + +int _ma_search(register MARIA_HA *info, MARIA_KEY *key, uint32 nextflag, + my_off_t pos) +{ + int error; + MARIA_PINNED_PAGE *page_link; + uchar *page_buff; + + info->page_changed= 1; /* If page not saved */ + if (!(error= _ma_search_no_save(info, key, nextflag, pos, &page_link, + &page_buff))) + { + if (nextflag & SEARCH_SAVE_BUFF) + { + memcpy(info->keyread_buff, page_buff, info->s->block_size); + + /* Save position for a possible read next / previous */ + info->int_keypos= info->keyread_buff + info->keypos_offset; + info->int_maxpos= info->keyread_buff + info->maxpos_offset; + info->int_keytree_version= key->keyinfo->version; + info->last_search_keypage= info->last_keypage; + info->page_changed= 0; + info->keyread_buff_used= 0; + } + } + _ma_unpin_all_pages(info, LSN_IMPOSSIBLE); + return (error); +} + +/** + @brief Search after row by a key + + res_page_link Will contain pointer to page where we found key + + @note + Position to row is stored in info->cur_row.lastpos + Last used key is stored in info->last_key + + @return + @retval 0 ok (key 
found) + @retval -1 Not found + @retval 1 If one should continue search on higher level +*/ + +static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key, + uint32 nextflag, register my_off_t pos, + MARIA_PINNED_PAGE **res_page_link, + uchar **res_page_buff) +{ + my_bool last_key_not_used, buff_alloced; + int error,flag; + uint page_flag, nod_flag, used_length; + uchar *keypos,*maxpos; + uchar *lastkey; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + MARIA_PINNED_PAGE *page_link; + DBUG_ENTER("_ma_search"); + DBUG_PRINT("enter",("page: %lu nextflag: %u lastpos: %lu", + (ulong) (pos / info->s->block_size), + nextflag, (ulong) info->cur_row.lastpos)); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key);); + DBUG_ASSERT(info->last_key.keyinfo == key->keyinfo); + + if (pos == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */ + info->cur_row.lastpos= HA_OFFSET_ERROR; + if (!(nextflag & (SEARCH_SMALLER | SEARCH_BIGGER | SEARCH_LAST))) + DBUG_RETURN(-1); /* Not found ; return error */ + DBUG_RETURN(1); /* Search at upper levels */ + } + + alloc_on_stack(*info->stack_end_ptr, lastkey, buff_alloced, + keyinfo->max_store_length); + if (!lastkey) + DBUG_RETURN(1); + + if (_ma_fetch_keypage(&page, info, keyinfo, pos, + PAGECACHE_LOCK_READ, DFLT_INIT_HITS, 0, 0)) + goto err; + page_link= dynamic_element(&info->pinned_pages, + info->pinned_pages.elements-1, + MARIA_PINNED_PAGE*); + DBUG_DUMP("page", page.buff, page.size); + + flag= (*keyinfo->bin_search)(key, &page, nextflag, &keypos, lastkey, + &last_key_not_used); + if (flag == MARIA_FOUND_WRONG_KEY) + { + _ma_print_error(info, HA_ERR_CRASHED, 0); + my_errno= HA_ERR_CRASHED; + goto err; + } + page_flag= page.flag; + used_length= page.size; + nod_flag= page.node; + maxpos= page.buff + used_length -1; + + if (flag) + { + if ((error= _ma_search_no_save(info, key, nextflag, + _ma_kpos(nod_flag,keypos), + res_page_link, res_page_buff)) <= 0) + goto ret_error; + + error= 1; /* Default 
return value */ + if (flag >0) + { + if (nextflag & (SEARCH_SMALLER | SEARCH_LAST) && + keypos == page.buff + info->s->keypage_header + nod_flag) + goto ret_error; /* Bigger than key */ + } + else if (nextflag & SEARCH_BIGGER && keypos >= maxpos) + goto ret_error; /* Smaller than key */ + } + else + { + /* Found matching key */ + if ((nextflag & SEARCH_FIND) && nod_flag && + ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME || + (key->flag & SEARCH_PART_KEY) || info->s->base.born_transactional)) + { + if ((error= _ma_search_no_save(info, key, (nextflag | SEARCH_FIND) & + ~(SEARCH_BIGGER | SEARCH_SMALLER | + SEARCH_LAST), + _ma_kpos(nod_flag,keypos), + res_page_link, res_page_buff)) >= 0 || + my_errno != HA_ERR_KEY_NOT_FOUND) + goto ret_error; + } + } + + if ((nextflag & (SEARCH_SMALLER | SEARCH_LAST)) && flag != 0) + { + uint not_used[2]; + if (_ma_get_prev_key(&info->last_key, &page, keypos)) + goto err; + /* + We have to use key->flag >> 1 here to transform + SEARCH_PAGE_KEY_HAS_TRANSID to SEARCH_USER_KEY_HAS_TRANSID + */ + if (!(nextflag & SEARCH_SMALLER) && + ha_key_cmp(keyinfo->seg, info->last_key.data, key->data, + key->data_length + key->ref_length, + SEARCH_FIND | (key->flag >> 1) | info->last_key.flag, + not_used)) + { + my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */ + goto err; + } + } + else + { + /* Set info->last_key to temporarily point to last key value */ + info->last_key.data= lastkey; + /* Get key value (if not packed key) and position after key */ + if (!(*keyinfo->get_key)(&info->last_key, page_flag, nod_flag, &keypos)) + goto err; + memcpy(info->lastkey_buff, lastkey, + info->last_key.data_length + info->last_key.ref_length); + info->last_key.data= info->lastkey_buff; + } + info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key); + info->cur_row.trid= _ma_trid_from_key(&info->last_key); + + /* Store offset to key */ + info->keypos_offset= (uint) (keypos - page.buff); + info->maxpos_offset= (uint) (maxpos - page.buff); + 
info->int_nod_flag= nod_flag; + info->last_keypage= pos; + *res_page_link= page_link; + *res_page_buff= page.buff; + + stack_alloc_free(lastkey, buff_alloced); + DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); + +err: + DBUG_PRINT("exit",("Error: %d",my_errno)); + info->cur_row.lastpos= HA_OFFSET_ERROR; + info->page_changed=1; + error= -1; + +ret_error: + stack_alloc_free(lastkey, buff_alloced); + DBUG_RETURN(error); +} + + +/* + Search after key in page-block + + @fn _ma_bin_search + @param key Search after this key + @param ma_page Key page to search in + @param comp_flag How key should be compared + @param ret_pos Out: position in the page, see below + @param buff Buffer for holding a key (not used here) + @param last_key Out: see below + + @note + If keys are packed, then smaller or identical key is stored in buff + + @return + @retval <0, 0 , >0 depending on whether the key found is smaller, equal or + bigger than 'key' + @retval ret_pos Points to where the identical or bigger key starts + @retval last_key Set to 1 if key is the last key in the page. 
+*/ + +int _ma_bin_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page, + uint32 comp_flag, uchar **ret_pos, + uchar *buff __attribute__((unused)), my_bool *last_key) +{ + int UNINIT_VAR(flag); + uint page_flag; + uint start, mid, end, save_end, totlength, nod_flag; + uint not_used[2]; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_SHARE *share= keyinfo->share; + uchar *page; + DBUG_ENTER("_ma_bin_search"); + + page_flag= ma_page->flag; + if (page_flag & KEYPAGE_FLAG_HAS_TRANSID) + { + /* Keys have varying length, can't use binary search */ + DBUG_RETURN(_ma_seq_search(key, ma_page, comp_flag, ret_pos, buff, + last_key)); + } + + nod_flag= ma_page->node; + totlength= keyinfo->keylength + nod_flag; + DBUG_ASSERT(ma_page->size >= share->keypage_header + nod_flag + totlength); + + start=0; + mid=1; + save_end= end= ((ma_page->size - nod_flag - share->keypage_header) / + totlength-1); + DBUG_PRINT("test",("page_length: %u end: %u", ma_page->size, end)); + page= ma_page->buff + share->keypage_header + nod_flag; + + while (start != end) + { + mid= (start+end)/2; + if ((flag=ha_key_cmp(keyinfo->seg, page + (uint) mid * totlength, + key->data, key->data_length + key->ref_length, + comp_flag, not_used)) + >= 0) + end=mid; + else + start=mid+1; + } + if (mid != start) + flag=ha_key_cmp(keyinfo->seg, page + (uint) start * totlength, + key->data, key->data_length + key->ref_length, comp_flag, + not_used); + if (flag < 0) + start++; /* point at next, bigger key */ + *ret_pos= (page + (uint) start * totlength); + *last_key= end == save_end; + DBUG_PRINT("exit",("flag: %d keypos: %d",flag,start)); + DBUG_RETURN(flag); +} /* _ma_bin_search */ + + +/** + Locate a packed key in a key page. + + @fn _ma_seq_search() + @param key Search key. + @param page Key page (beginning). + @param comp_flag Search flags like SEARCH_SAME etc. + @param ret_pos + @param buff Buffer for holding temp keys + @param last_key + + @description + Used instead of _ma_bin_search() when key is packed. 
+ Puts smaller or identical key in buff. + Key is searched sequentially. + + @todo + Don't copy key to buffer if we are not using key with prefix packing + + @return + @retval > 0 Key in 'buff' is smaller than search key. + @retval 0 Key in 'buff' is identical to search key. + @retval < 0 Not found. + + @retval ret_pos Points to where the identical or bigger key starts + @retval last_key Set to 1 if key is the last key in the page + @retval buff Copy of previous or identical unpacked key +*/ + +int _ma_seq_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page, + uint32 comp_flag, uchar **ret_pos, + uchar *buff, my_bool *last_key) +{ + int UNINIT_VAR(flag); + uint page_flag, nod_flag, UNINIT_VAR(length), not_used[2]; + uchar t_buff[MARIA_MAX_KEY_BUFF], *end; + uchar *page; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_SHARE *share= keyinfo->share; + MARIA_KEY tmp_key; + DBUG_ENTER("_ma_seq_search"); + + page_flag= ma_page->flag; + nod_flag= ma_page->node; + page= ma_page->buff; + end= page + ma_page->size; + page+= share->keypage_header + nod_flag; + *ret_pos= page; + t_buff[0]=0; /* Avoid bugs */ + + tmp_key.data= t_buff; + tmp_key.keyinfo= keyinfo; + while (page < end) + { + length=(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &page); + if (length == 0 || page > end) + { + _ma_set_fatal_error_with_share(share, HA_ERR_CRASHED); + DBUG_PRINT("error", + ("Found wrong key: length: %u page: %p end: %p", + length, page, end)); + DBUG_RETURN(MARIA_FOUND_WRONG_KEY); + } + if ((flag= ha_key_cmp(keyinfo->seg, t_buff, key->data, + key->data_length + key->ref_length, + comp_flag | tmp_key.flag, + not_used)) >= 0) + break; + DBUG_PRINT("loop_extra",("page:%p key: '%s' flag: %d", + page, t_buff, flag)); + memcpy(buff,t_buff,length); + *ret_pos=page; + } + if (flag == 0) + memcpy(buff,t_buff,length); /* Result is first key */ + *last_key= page == end; + DBUG_PRINT("exit",("flag: %d ret_pos: %p", flag, *ret_pos)); + DBUG_RETURN(flag); +} /* _ma_seq_search */ + + +/** + 
Search for key on key page with string prefix compression + + @notes + This is an optimized function compared to calling _ma_get_pack_key() + for each key in the buffer + + Same interface as for _ma_seq_search() +*/ + +int _ma_prefix_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page, + uint32 nextflag, uchar **ret_pos, uchar *buff, + my_bool *last_key) +{ + /* + my_flag is raw comparison result to be changed according to + SEARCH_NO_FIND,SEARCH_LAST and HA_REVERSE_SORT flags. + flag is the value returned by ha_key_cmp and as treated as final + */ + int flag=0, my_flag=-1; + uint nod_flag, UNINIT_VAR(length), len, matched, cmplen, kseg_len; + uint page_flag, UNINIT_VAR(prefix_len),suffix_len; + int key_len_skip, UNINIT_VAR(seg_len_pack), key_len_left; + uchar *end, *vseg, *UNINIT_VAR(saved_vseg), *UNINIT_VAR(saved_from); + uchar *page; + uchar tt_buff[MARIA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2; + uchar *UNINIT_VAR(saved_to); + const uchar *kseg; + uint saved_length=0, saved_prefix_len=0; + uint length_pack; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_SHARE *share= keyinfo->share; + const uchar *sort_order= keyinfo->seg->charset->sort_order; + const int reverse = keyinfo->seg->flag & HA_REVERSE_SORT; + DBUG_ENTER("_ma_prefix_search"); + + t_buff[0]=0; /* Avoid bugs */ + page_flag= ma_page->flag; + nod_flag= ma_page->node; + page_flag&= KEYPAGE_FLAG_HAS_TRANSID; /* For faster test in loop */ + page= ma_page->buff; + end= page + ma_page->size; + page+= share->keypage_header + nod_flag; + *ret_pos= page; + kseg= key->data; + + get_key_pack_length(kseg_len, length_pack, kseg); + key_len_skip=length_pack+kseg_len; + key_len_left=(int) (key->data_length + key->ref_length) - (int) key_len_skip; + /* If key_len is 0, then length_pack is 1, then key_len_left is -1. */ + cmplen= ((key_len_left>=0) ? 
kseg_len : + (key->data_length + key->ref_length - length_pack)); + DBUG_PRINT("info",("key: '%.*s'",kseg_len,kseg)); + + /* + Keys are compressed the following way: + + If the max length of the first key segment is < 127 bytes the prefix + length is stored in 1 byte, else in 2 bytes + + (prefix) length The high bit is set if this is a prefix for the prev key. + [suffix length] Packed length of suffix if the previous was a prefix. + (suffix) data Key data bytes (past the common prefix or whole segment). + [next-key-seg] Next key segments (([packed length], data), ...) + pointer Reference to the data file (last_keyseg->length). + */ + + matched=0; /* how many chars from the prefix were already matched */ + len=0; /* length of previous key unpacked */ + + while (page < end) + { + uint packed= *page & 128; + uint key_flag; + + vseg= page; + if (keyinfo->seg->length >= 127) + { + suffix_len=mi_uint2korr(vseg) & 32767; + vseg+=2; + } + else + suffix_len= *vseg++ & 127; + + if (packed) + { + if (suffix_len == 0) + { + /* == 0x80 or 0x8000, same key, prefix length == old key length. */ + prefix_len=len; + } + else + { + /* > 0x80 or 0x8000, this is prefix length, packed suffix length follows. */ + prefix_len=suffix_len; + get_key_length(suffix_len,vseg); + } + } + else + { + /* Not packed. No prefix used from last key. 
*/ + prefix_len=0; + } + + len=prefix_len+suffix_len; + seg_len_pack=get_pack_length(len); + t_buff=tt_buff+3-seg_len_pack; + store_key_length(t_buff,len); + + if (prefix_len > saved_prefix_len) + memcpy(t_buff+seg_len_pack+saved_prefix_len,saved_vseg, + prefix_len-saved_prefix_len); + saved_vseg=vseg; + saved_prefix_len=prefix_len; + + DBUG_PRINT("loop",("page: '%.*s%.*s'",prefix_len,t_buff+seg_len_pack, + suffix_len,vseg)); + { + /* Calculate length of one key */ + uchar *from= vseg+suffix_len; + HA_KEYSEG *keyseg; + + for (keyseg=keyinfo->seg+1 ; keyseg->type ; keyseg++ ) + { + if (keyseg->flag & HA_NULL_PART) + { + if (!(*from++)) + continue; + } + if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) + { + uint key_part_length; + get_key_length(key_part_length,from); + from+= key_part_length; + } + else + from+= keyseg->length; + } + from+= keyseg->length; + key_flag=0; + + if (page_flag && key_has_transid(from-1)) + { + from+= transid_packed_length(from); + key_flag= SEARCH_PAGE_KEY_HAS_TRANSID; + } + page= from + nod_flag; + length= (uint) (from-vseg); + } + + if (page > end) + { + _ma_set_fatal_error_with_share(share, HA_ERR_CRASHED); + DBUG_PRINT("error", + ("Found wrong key: length: %u page: %p end: %p", + length, page, end)); + DBUG_RETURN(MARIA_FOUND_WRONG_KEY); + } + + if (matched >= prefix_len) + { + /* We have to compare. But we can still skip part of the key */ + uint left; + const uchar *k= kseg+prefix_len; + + /* + If prefix_len > cmplen then we are in the end-space comparison + phase. Do not try to access the key any more ==> left= 0. + */ + left= ((len <= cmplen) ? suffix_len : + ((prefix_len < cmplen) ? 
cmplen - prefix_len : 0)); + + matched=prefix_len+left; + + if (sort_order) + { + for (my_flag=0;left;left--) + if ((my_flag= (int) sort_order[*vseg++] - (int) sort_order[*k++])) + break; + } + else + { + for (my_flag=0;left;left--) + if ((my_flag= (int) *vseg++ - (int) *k++)) + break; + } + + if (my_flag==0) /* match */ + { + /* + ** len cmplen seg_left_len more_segs + ** < matched=len; continue search + ** > = prefix ? found : (matched=len; + * continue search) + ** > < - ok, found + ** = < - ok, found + ** = = - ok, found + ** = = + next seg + */ + if (len < cmplen) + { + if ((keyinfo->seg->type != HA_KEYTYPE_TEXT && + keyinfo->seg->type != HA_KEYTYPE_VARTEXT1 && + keyinfo->seg->type != HA_KEYTYPE_VARTEXT2)) + my_flag= -1; + else + { + /* We have to compare k and vseg as if they were space extended */ + const uchar *k_end= k+ (cmplen - len); + for ( ; k < k_end && *k == ' '; k++) ; + if (k == k_end) + goto cmp_rest; /* should never happen */ + my_flag= (uchar)' ' - *k; + } + } + else if (len > cmplen) + { + uchar *vseg_end; + if ((nextflag & SEARCH_PREFIX) && key_len_left == 0) + goto fix_flag; + + /* We have to compare k and vseg as if they were space extended */ + for (vseg_end= vseg + (len-cmplen) ; + vseg < vseg_end && *vseg == (uchar) ' '; + vseg++, matched++) ; + DBUG_ASSERT(vseg < vseg_end); + my_flag= *vseg - (uchar)' '; + } + else + { + cmp_rest: + if (key_len_left>0) + { + uint not_used[2]; + if ((flag = ha_key_cmp(keyinfo->seg+1,vseg, + k, key_len_left, nextflag | key_flag, + not_used)) >= 0) + break; + } + else + { + /* + at this line flag==-1 if the following lines were already + visited and 0 otherwise, i.e. flag <=0 here always !!! + */ + fix_flag: + DBUG_ASSERT(flag <= 0); + if (nextflag & (SEARCH_NO_FIND | SEARCH_LAST)) + flag=(nextflag & (SEARCH_BIGGER | SEARCH_LAST)) ? -1 : 1; + if (flag>=0) + break; + } + } + } + if ((reverse ? 
-my_flag : my_flag) > 0) /* mismatch */ + break; + matched-=left; + } + /* else (matched < prefix_len) ---> do nothing. */ + + memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len); + saved_to= buff+saved_length; + saved_from= saved_vseg; + saved_length=length; + *ret_pos=page; + } + if (my_flag) + flag= reverse ? -my_flag : my_flag; + if (flag == 0) + { + memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len); + saved_to= buff+saved_length; + saved_from= saved_vseg; + saved_length=length; + } + if (saved_length) + memcpy(saved_to, saved_from, saved_length); + + *last_key= page == end; + + DBUG_PRINT("exit",("flag: %d ret_pos: %p", flag, *ret_pos)); + DBUG_RETURN(flag); +} /* _ma_prefix_search */ + + +/* Get pos to a key_block */ + +my_off_t _ma_kpos(uint nod_flag, const uchar *after_key) +{ + after_key-=nod_flag; + switch (nod_flag) { +#if SIZEOF_OFF_T > 4 + case 7: + return mi_uint7korr(after_key)*maria_block_size; + case 6: + return mi_uint6korr(after_key)*maria_block_size; + case 5: + return mi_uint5korr(after_key)*maria_block_size; +#else + case 7: + after_key++; + case 6: + after_key++; + case 5: + after_key++; +#endif + case 4: + return ((my_off_t) mi_uint4korr(after_key))*maria_block_size; + case 3: + return ((my_off_t) mi_uint3korr(after_key))*maria_block_size; + case 2: + return (my_off_t) (mi_uint2korr(after_key)*maria_block_size); + case 1: + return (uint) (*after_key)*maria_block_size; + case 0: /* At leaf page */ + default: /* Impossible */ + return(HA_OFFSET_ERROR); + } +} /* _kpos */ + + +/* Save pos to a key_block */ + +void _ma_kpointer(register MARIA_HA *info, register uchar *buff, my_off_t pos) +{ + pos/=maria_block_size; + switch (info->s->base.key_reflength) { +#if SIZEOF_OFF_T > 4 + case 7: mi_int7store(buff,pos); break; + case 6: mi_int6store(buff,pos); break; + case 5: mi_int5store(buff,pos); break; +#else + case 7: *buff++=0; + /* fall through */ + case 6: *buff++=0; + /* fall through */ + case 5: *buff++=0; + /* fall through */ 
+#endif + case 4: mi_int4store(buff,pos); break; + case 3: mi_int3store(buff,pos); break; + case 2: mi_int2store(buff,(uint) pos); break; + case 1: buff[0]= (uchar) pos; break; + default: abort(); /* impossible */ + } +} /* _ma_kpointer */ + + +/* Calc pos to a data-record from a key */ + +MARIA_RECORD_POS _ma_row_pos_from_key(const MARIA_KEY *key) +{ + my_off_t pos; + const uchar *after_key= key->data + key->data_length; + MARIA_SHARE *share= key->keyinfo->share; + switch (share->rec_reflength) { +#if SIZEOF_OFF_T > 4 + case 8: pos= (my_off_t) mi_uint8korr(after_key); break; + case 7: pos= (my_off_t) mi_uint7korr(after_key); break; + case 6: pos= (my_off_t) mi_uint6korr(after_key); break; + case 5: pos= (my_off_t) mi_uint5korr(after_key); break; +#else + case 8: pos= (my_off_t) mi_uint4korr(after_key+4); break; + case 7: pos= (my_off_t) mi_uint4korr(after_key+3); break; + case 6: pos= (my_off_t) mi_uint4korr(after_key+2); break; + case 5: pos= (my_off_t) mi_uint4korr(after_key+1); break; +#endif + case 4: pos= (my_off_t) mi_uint4korr(after_key); break; + case 3: pos= (my_off_t) mi_uint3korr(after_key); break; + case 2: pos= (my_off_t) mi_uint2korr(after_key); break; + case 0: /* NO_RECORD */ + default: + pos=0L; /* Shut compiler up */ + } + return (*share->keypos_to_recpos)(share, pos); +} + + +/** + Get trid from a key + + @param key Maria key read from a page + + @retval 0 If key doesn't have a trid + @retval trid +*/ + +TrID _ma_trid_from_key(const MARIA_KEY *key) +{ + if (!(key->flag & (SEARCH_PAGE_KEY_HAS_TRANSID | + SEARCH_USER_KEY_HAS_TRANSID))) + return 0; + return transid_get_packed(key->keyinfo->share, + key->data + key->data_length + + key->keyinfo->share->rec_reflength); +} + + +/* Calc position from a record pointer ( in delete link chain ) */ + +MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *share, uchar *ptr) +{ + my_off_t pos; + switch (share->rec_reflength) { +#if SIZEOF_OFF_T > 4 + case 8: + pos= (my_off_t) mi_uint8korr(ptr); + if (pos == 
HA_OFFSET_ERROR) + return HA_OFFSET_ERROR; /* end of list */ + break; + case 7: + pos= (my_off_t) mi_uint7korr(ptr); + if (pos == (((my_off_t) 1) << 56) -1) + return HA_OFFSET_ERROR; /* end of list */ + break; + case 6: + pos= (my_off_t) mi_uint6korr(ptr); + if (pos == (((my_off_t) 1) << 48) -1) + return HA_OFFSET_ERROR; /* end of list */ + break; + case 5: + pos= (my_off_t) mi_uint5korr(ptr); + if (pos == (((my_off_t) 1) << 40) -1) + return HA_OFFSET_ERROR; /* end of list */ + break; +#else + case 8: + case 7: + case 6: + case 5: + ptr+= (share->rec_reflength-4); + /* fall through */ +#endif + case 4: + pos= (my_off_t) mi_uint4korr(ptr); + if (pos == (my_off_t) (uint32) ~0L) + return HA_OFFSET_ERROR; + break; + case 3: + pos= (my_off_t) mi_uint3korr(ptr); + if (pos == (my_off_t) (1 << 24) -1) + return HA_OFFSET_ERROR; + break; + case 2: + pos= (my_off_t) mi_uint2korr(ptr); + if (pos == (my_off_t) (1 << 16) -1) + return HA_OFFSET_ERROR; + break; + default: abort(); /* Impossible */ + } + return (*share->keypos_to_recpos)(share, pos); +} + + +/* save position to record */ + +void _ma_dpointer(MARIA_SHARE *share, uchar *buff, my_off_t pos) +{ + if (pos != HA_OFFSET_ERROR) + pos= (*share->recpos_to_keypos)(share, pos); + + switch (share->rec_reflength) { +#if SIZEOF_OFF_T > 4 + case 8: mi_int8store(buff,pos); break; + case 7: mi_int7store(buff,pos); break; + case 6: mi_int6store(buff,pos); break; + case 5: mi_int5store(buff,pos); break; +#else + case 8: *buff++=0; + /* fall through */ + case 7: *buff++=0; + /* fall through */ + case 6: *buff++=0; + /* fall through */ + case 5: *buff++=0; + /* fall through */ +#endif + case 4: mi_int4store(buff,pos); break; + case 3: mi_int3store(buff,pos); break; + case 2: mi_int2store(buff,(uint) pos); break; + case 0: break; /* For NO_RECORD */ + default: abort(); /* Impossible */ + } +} /* _ma_dpointer */ + + +my_off_t _ma_static_keypos_to_recpos(MARIA_SHARE *share, my_off_t pos) +{ + return pos * share->base.pack_reclength; +} + + 
+my_off_t _ma_static_recpos_to_keypos(MARIA_SHARE *share, my_off_t pos) +{ + return pos / share->base.pack_reclength; +} + +my_off_t _ma_transparent_recpos(MARIA_SHARE *share __attribute__((unused)), + my_off_t pos) +{ + return pos; +} + +my_off_t _ma_transaction_keypos_to_recpos(MARIA_SHARE *share + __attribute__((unused)), + my_off_t pos) +{ + /* We need one bit to store if there is transid's after position */ + return pos >> 1; +} + +my_off_t _ma_transaction_recpos_to_keypos(MARIA_SHARE *share + __attribute__((unused)), + my_off_t pos) +{ + return pos << 1; +} + +/* + @brief Get key from key-block + + @param key Should contain previous key. Will contain new key + @param page_flag Flag on page block + @param nod_flag Is set to nod length if we on nod + @param page Points at previous key; Its advanced to point at next key + + @notes + Same as _ma_get_key but used with fixed length keys + + @return + @retval key_length + length of data pointer (without nod length) + */ + +uint _ma_get_static_key(MARIA_KEY *key, uint page_flag, uint nod_flag, + register uchar **page) +{ + register MARIA_KEYDEF *keyinfo= key->keyinfo; + uint key_length= keyinfo->keylength; + + key->ref_length= keyinfo->share->rec_reflength; + key->data_length= key_length - key->ref_length; + key->flag= 0; + if (page_flag & KEYPAGE_FLAG_HAS_TRANSID) + { + uchar *end= *page + keyinfo->keylength; + if (key_has_transid(end-1)) + { + uint trans_length= transid_packed_length(end); + key->ref_length+= trans_length; + key_length+= trans_length; + key->flag= SEARCH_PAGE_KEY_HAS_TRANSID; + } + } + key_length+= nod_flag; + memcpy(key->data, *page, key_length); + *page+= key_length; + return key_length - nod_flag; +} /* _ma_get_static_key */ + + +/** + Skip over static length key from key-block + + @fn _ma_skip_static_key() + @param key Keyinfo and buffer that can be used + @param nod_flag If nod: Length of node pointer, else zero. 
+ @param key Points at key + + @retval pointer to next key +*/ + +uchar *_ma_skip_static_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page) +{ + page+= key->keyinfo->keylength; /* Skip key data and row reference */ + if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) && key_has_transid(page-1)) + page+= transid_packed_length(page); + return page+ nod_flag; +} + + +/* + Get key which is packed against previous key or key with a NULL column. + + SYNOPSIS + _ma_get_pack_key() + @param int_key Should contain previous key. Will contain new key + @param page_flag page_flag from page + @param nod_flag If nod: Length of node pointer, else zero. + @param page_pos Points at previous key; It is advanced to point at next key + + @return + @retval key_length + length of data pointer + @retval 0 Error; the share has been marked with HA_ERR_CRASHED +*/ + +uint _ma_get_pack_key(MARIA_KEY *int_key, uint page_flag, + uint nod_flag, uchar **page_pos) +{ + reg1 HA_KEYSEG *keyseg; + uchar *page= *page_pos; + uint length; + uchar *key= int_key->data; + MARIA_KEYDEF *keyinfo= int_key->keyinfo; + + for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++) + { + if (keyseg->flag & HA_PACK_KEY) + { + /* key with length, packed to previous key */ + uchar *start= key; + uint packed= *page & 128,tot_length,rest_length; + if (keyseg->length >= 127) + { + length=mi_uint2korr(page) & 32767; + page+=2; + } + else + length= *page++ & 127; + + if (packed) + { + if (length > (uint) keyseg->length) + { + _ma_set_fatal_error_with_share(keyinfo->share, HA_ERR_CRASHED); + return 0; /* Error */ + } + if (length == 0) /* Same key */ + { + if (keyseg->flag & HA_NULL_PART) + *key++=1; /* Can't be NULL */ + get_key_length(length,key); + key+= length; /* Same diff_key as prev */ + if (length > keyseg->length) + { + DBUG_PRINT("error", + ("Found too long null packed key: %u of %u at %p", + length, keyseg->length, *page_pos)); + DBUG_DUMP("key", *page_pos, 16); + _ma_set_fatal_error_with_share(keyinfo->share, HA_ERR_CRASHED); + return 0; + } + continue; + } + if (keyseg->flag & HA_NULL_PART) + { + key++; /* Skip 
null marker*/ + start++; + } + + get_key_length(rest_length,page); + tot_length=rest_length+length; + + /* If the stored length has changed, we must move the key */ + if (tot_length >= 255 && *start != 255) + { + /* length prefix changed from a length of one to a length of 3 */ + bmove_upp(key+length+3, key+length+1, length); + *key=255; + mi_int2store(key+1,tot_length); + key+=3+length; + } + else if (tot_length < 255 && *start == 255) + { /* length prefix changed from a length of 3 to a length of one */ + bmove(key+1,key+3,length); + *key=tot_length; + key+=1+length; + } + else + { + store_key_length_inc(key,tot_length); + key+=length; + } + memcpy(key,page,rest_length); + page+=rest_length; + key+=rest_length; + continue; + } + else + { + /* Key that is not packed against previous key */ + if (keyseg->flag & HA_NULL_PART) + { + if (!length--) /* Null part */ + { + *key++=0; + continue; + } + *key++=1; /* Not null */ + } + } + if (length > (uint) keyseg->length) + { + DBUG_PRINT("error",("Found too long packed key: %u of %u at %p", + length, keyseg->length, *page_pos)); + DBUG_DUMP("key", *page_pos, 16); + _ma_set_fatal_error_with_share(keyinfo->share, HA_ERR_CRASHED); + return 0; /* Error */ + } + store_key_length_inc(key,length); + } + else + { + if (keyseg->flag & HA_NULL_PART) + { + if (!(*key++ = *page++)) + continue; + } + if (keyseg->flag & + (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) + { + uchar *tmp=page; + get_key_length(length,tmp); + length+=(uint) (tmp-page); + } + else + length=keyseg->length; + } + memcpy(key, page,(size_t) length); + key+=length; + page+=length; + } + + int_key->data_length= (uint)(key - int_key->data); + int_key->flag= 0; + length= keyseg->length; /* Last segment (type == 0): length of data pointer */ + if (page_flag & KEYPAGE_FLAG_HAS_TRANSID) + { + uchar *end= page + length; + if (key_has_transid(end-1)) + { + length+= transid_packed_length(end); + int_key->flag= SEARCH_PAGE_KEY_HAS_TRANSID; + } + } + int_key->ref_length= length; + length+= nod_flag; + bmove(key, page, length); + *page_pos= page+length; + + return (int_key->data_length 
+ int_key->ref_length); +} /* _ma_get_pack_key */ + + +/** + Skip key which is packed against previous key or key with a NULL column. + + @fn _ma_skip_pack_key() + @param key Keyinfo and buffer that can be used + @param page_flag page_flag from page + @param nod_flag If nod: Length of node pointer, else zero. + @param page Points at key + + @note + This is in principle a simpler version of _ma_get_pack_key() + + @retval pointer to next key +*/ + +uchar *_ma_skip_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page) +{ + reg1 HA_KEYSEG *keyseg; + for (keyseg= key->keyinfo->seg ; keyseg->type ; keyseg++) + { + if (keyseg->flag & HA_PACK_KEY) + { + /* key with length, packed to previous key */ + uint packed= *page & 128, length; + if (keyseg->length >= 127) + { + length= mi_uint2korr(page) & 32767; + page+= 2; + } + else + length= *page++ & 127; + + if (packed) + { + if (length == 0) /* Same key */ + continue; + get_key_length(length,page); + page+= length; + continue; + } + if ((keyseg->flag & HA_NULL_PART) && length) + { + /* + Keys that can have null use length+1 as the length for data as the + number 0 is reserved for keys that have a NULL value + */ + length--; + } + page+= length; + } + else + { + if (keyseg->flag & HA_NULL_PART) + if (!*page++) + continue; + if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint length; + get_key_length(length,page); + page+=length; + } + else + page+= keyseg->length; + } + } + page+= keyseg->length; /* Last segment (type == 0): length of data pointer */ + if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) && key_has_transid(page-1)) + page+= transid_packed_length(page); + return page + nod_flag; +} + + +/* + Read key that is packed relatively to previous + + int_key should contain the previous key and will contain the new key. + page_pos is advanced to point at the next key. + + Returns key_length + length of data pointer, or 0 on error (the share + is then marked with HA_ERR_CRASHED) +*/ + +uint _ma_get_binary_pack_key(MARIA_KEY *int_key, uint page_flag, uint nod_flag, + register uchar **page_pos) +{ + reg1 HA_KEYSEG *keyseg; + uchar *page, *page_end, *from, *from_end, *key; + uint length,tmp; + MARIA_KEYDEF *keyinfo= int_key->keyinfo; + DBUG_ENTER("_ma_get_binary_pack_key"); + + page= *page_pos; + page_end=page + 
MARIA_MAX_KEY_BUFF + 1; + key= int_key->data; + + /* + Keys are compressed the following way: + + prefix length Packed length of prefix common with prev key. + (1 or 3 bytes) + for each key segment: + [is null] Null indicator if can be null (1 byte, zero means null) + [length] Packed length if varlength (1 or 3 bytes) + key segment 'length' bytes of key segment value + pointer Reference to the data file (last_keyseg->length). + + get_key_length() is a macro. It gets the prefix length from 'page' + and puts it into 'length'. It increments 'page' by 1 or 3, depending + on the packed length of the prefix length. + */ + get_key_length(length,page); + if (length) + { + if (length > keyinfo->maxlength) + { + DBUG_PRINT("error", + ("Found too long binary packed key: %u of %u at %p", + length, keyinfo->maxlength, *page_pos)); + DBUG_DUMP("key", *page_pos, 16); + _ma_set_fatal_error_with_share(keyinfo->share, HA_ERR_CRASHED); + DBUG_RETURN(0); /* Wrong key */ + } + /* Key is packed against prev key, take prefix from prev key. */ + from= key; + from_end= key + length; + } + else + { + /* Key is not packed against prev key, take all from page buffer. */ + from= page; + from_end= page_end; + } + + /* + The trouble is that key can be split in two parts: + The first part (prefix) is in from .. from_end - 1. + The second part starts at page. + The split can be at every byte position. So we need to check for + the end of the first part before using every byte. + */ + for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++) + { + if (keyseg->flag & HA_NULL_PART) + { + /* If prefix is used up, switch to rest. */ + if (from == from_end) + { + from=page; + from_end=page_end; + } + if (!(*key++ = *from++)) + continue; /* Null part */ + } + if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) + { + /* If prefix is used up, switch to rest. 
*/ + if (from == from_end) { from=page; from_end=page_end; } + /* Get length of dynamic length key part */ + if ((length= (uint) (uchar) (*key++ = *from++)) == 255) + { + /* If prefix is used up, switch to rest. */ + if (from == from_end) { from=page; from_end=page_end; } + length= ((uint) (uchar) ((*key++ = *from++))) << 8; + /* If prefix is used up, switch to rest. */ + if (from == from_end) { from=page; from_end=page_end; } + length+= (uint) (uchar) ((*key++ = *from++)); + } + } + else + length=keyseg->length; + + if ((tmp=(uint) (from_end-from)) <= length) + { + key+=tmp; /* Use old key */ + length-=tmp; + from=page; from_end=page_end; + } + DBUG_ASSERT((int) length >= 0); + DBUG_PRINT("info",("key: %p from: %p length: %u", + key, from, length)); + memmove(key, from, (size_t) length); + key+=length; + from+=length; + } + /* + Last segment (type == 0) contains length of data pointer. + If we have mixed key blocks with data pointer and key block pointer, + we have to copy both. + */ + int_key->data_length= (uint)(key - int_key->data); + int_key->ref_length= length= keyseg->length; + int_key->flag= 0; + if ((tmp=(uint) (from_end-from)) <= length) + { + /* Skip over the last common part of the data */ + key+= tmp; + length-= tmp; + from= page; + } + else + { + /* + Remaining length is greater than max possible length. + This can happen only if we switched to the new key bytes already. + 'page_end' is calculated with MARIA_MAX_KEY_BUFF. So it can be far + behind the real end of the key. 
+ */ + if (from_end != page_end) + { + DBUG_PRINT("error",("Error when unpacking key")); + _ma_set_fatal_error_with_share(keyinfo->share, HA_ERR_CRASHED); + DBUG_RETURN(0); /* Error */ + } + } + if (page_flag & KEYPAGE_FLAG_HAS_TRANSID) + { + uchar *end= from + length; + if (key_has_transid(end-1)) + { + uint trans_length= transid_packed_length(end); + length+= trans_length; + int_key->ref_length+= trans_length; + int_key->flag= SEARCH_PAGE_KEY_HAS_TRANSID; + } + } + + /* Copy rest of data ptr and, if appropriate, trans_id and node_ptr */ + memcpy(key, from, length + nod_flag); + *page_pos= from + length + nod_flag; + +#ifdef USEFUL_FOR_DEBUGGING + DBUG_DUMP("key", int_key->data, + (uint) (int_key->data_length + int_key->ref_length)); +#endif + DBUG_RETURN(int_key->data_length + int_key->ref_length); +} + +/** + Skip key which is prefix packed against previous key + + @fn _ma_skip_binary_pack_key() + @param key Keyinfo and buffer that can be used + @param page_flag page_flag from page + @param nod_flag If nod: Length of node pointer, else zero. + @param page Points at key + + @note + We have to copy the key as otherwise we don't know how much left + data there is of the key. + + @todo + Implement more efficient version of this. We can ignore to copy any rest + key parts that are not null or not packed. We also don't have to copy + rowid or transid. + + @retval pointer to next key + @retval 0 Error; _ma_get_binary_pack_key() could not unpack the key +*/ + +uchar *_ma_skip_binary_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page) +{ + if (!_ma_get_binary_pack_key(key, page_flag, nod_flag, &page)) + return 0; + return page; +} + + +/** + @brief Get key at position without knowledge of previous key + + @return pointer to next key + @retval 0 Error; a key on the page could not be unpacked and the share + has been marked with HA_ERR_CRASHED +*/ + +uchar *_ma_get_key(MARIA_KEY *key, MARIA_PAGE *ma_page, uchar *keypos) +{ + uint page_flag, nod_flag; + MARIA_KEYDEF *keyinfo= key->keyinfo; + uchar *page; + DBUG_ENTER("_ma_get_key"); + + page= ma_page->buff; + page_flag= ma_page->flag; + nod_flag= ma_page->node; + + if (! 
(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { /* Fixed length key without transid: copy it directly from keypos */ + bmove(key->data, keypos, keyinfo->keylength+nod_flag); + key->ref_length= keyinfo->share->rec_reflength; + key->data_length= keyinfo->keylength - key->ref_length; + key->flag= 0; + DBUG_RETURN(keypos+keyinfo->keylength+nod_flag); + } + else + { /* Unpack keys from the start of the page until keypos has been passed */ + page+= keyinfo->share->keypage_header + nod_flag; + key->data[0]= 0; /* safety */ + while (page <= keypos) + { + if (!(*keyinfo->get_key)(key, page_flag, nod_flag, &page)) + { + _ma_set_fatal_error_with_share(keyinfo->share, HA_ERR_CRASHED); + DBUG_RETURN(0); + } + } + } + DBUG_PRINT("exit",("page: %p length: %u", page, + key->data_length + key->ref_length)); + DBUG_RETURN(page); +} /* _ma_get_key */ + + +/* + @brief Get the key that is stored just before keypos on the page + + @note + For fixed length keys without transid the key is copied from + keypos - keylength - nod_flag. Otherwise keys are unpacked from the + start of the page until keypos is reached. + + @return + @retval 0 ok + @retval 1 error +*/ + +static my_bool _ma_get_prev_key(MARIA_KEY *key, MARIA_PAGE *ma_page, + uchar *keypos) +{ + uint page_flag, nod_flag; + MARIA_KEYDEF *keyinfo= key->keyinfo; + DBUG_ENTER("_ma_get_prev_key"); + + page_flag= ma_page->flag; + nod_flag= ma_page->node; + + if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + bmove(key->data, keypos - keyinfo->keylength - nod_flag, + keyinfo->keylength); + key->ref_length= keyinfo->share->rec_reflength; + key->data_length= keyinfo->keylength - key->ref_length; + key->flag= 0; + DBUG_RETURN(0); + } + else + { + uchar *page; + + page= ma_page->buff + keyinfo->share->keypage_header + nod_flag; + key->data[0]= 0; /* safety */ + DBUG_ASSERT(page != keypos); + while (page < keypos) + { + if (! 
(*keyinfo->get_key)(key, page_flag, nod_flag, &page)) + { + _ma_set_fatal_error_with_share(keyinfo->share, HA_ERR_CRASHED); + DBUG_RETURN(1); + } + } + } + DBUG_RETURN(0); +} /* _ma_get_prev_key */ + + +/* + @brief Get last key from key-page before 'endpos' + + @note + endpos may be either end of buffer or start of a key + + @return + @retval pointer to where key starts + @retval 0 Error; a key on the page could not be unpacked and the share + has been marked with HA_ERR_CRASHED +*/ + +uchar *_ma_get_last_key(MARIA_KEY *key, MARIA_PAGE *ma_page, uchar *endpos) +{ + uint page_flag,nod_flag; + uchar *lastpos, *page; + MARIA_KEYDEF *keyinfo= key->keyinfo; + DBUG_ENTER("_ma_get_last_key"); + DBUG_PRINT("enter",("page: %p endpos: %p", ma_page->buff, + endpos)); + + page_flag= ma_page->flag; + nod_flag= ma_page->node; + page= ma_page->buff + keyinfo->share->keypage_header + nod_flag; + + if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + lastpos= endpos-keyinfo->keylength-nod_flag; + key->ref_length= keyinfo->share->rec_reflength; + key->data_length= keyinfo->keylength - key->ref_length; + key->flag= 0; + if (lastpos >= page) /* Copy only if there is a key before endpos */ + bmove(key->data, lastpos, keyinfo->keylength + nod_flag); + } + else + { + lastpos= page; + key->data[0]=0; /* safety */ + while (page < endpos) + { + lastpos= page; /* Remember where the key we are about to read starts */ + if (!(*keyinfo->get_key)(key, page_flag, nod_flag, &page)) + { + DBUG_PRINT("error",("Couldn't find last key: page: %p", + page)); + _ma_set_fatal_error_with_share(keyinfo->share, HA_ERR_CRASHED); + DBUG_RETURN(0); + } + } + } + DBUG_PRINT("exit",("lastpos: %p length: %u", lastpos, + key->data_length + key->ref_length)); + DBUG_RETURN(lastpos); +} /* _ma_get_last_key */ + + +/** + Calculate length of unpacked key + + @param info Maria handler + @param keyinfo key handler + @param key data for key + + @notes + This function is very seldom used. It's mainly used for debugging + or when calculating a key length from a stored key in batch insert. + + This function does *NOT* calculate length of transid size! 
+ This function can't be used against a prefix packed key on a page + + @return + @retval total length for key +*/ + +uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key) +{ + reg1 HA_KEYSEG *keyseg; + const uchar *start; + + if (! (keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY))) + return (keyinfo->keylength); + + start= key; + for (keyseg=keyinfo->seg ; keyseg->type ; keyseg++) + { + if (keyseg->flag & HA_NULL_PART) + if (!*key++) + continue; + if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint length; + get_key_length(length,key); + key+=length; + } + else + key+= keyseg->length; + } + return((uint) (key-start)+keyseg->length); +} /* _ma_keylength */ + + +/* + Calculate length of part key. + + Used in maria_rkey() to find the key found for the key-part that was used. + This is needed in case of multi-byte character sets where we may search + after '0xDF' but find 'ss' +*/ + +uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, register const uchar *key, + HA_KEYSEG *end) +{ + reg1 HA_KEYSEG *keyseg; + const uchar *start= key; + + for (keyseg=keyinfo->seg ; keyseg != end ; keyseg++) + { + if (keyseg->flag & HA_NULL_PART) + if (!*key++) + continue; + if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + uint length; + get_key_length(length,key); + key+=length; + } + else + key+= keyseg->length; + } + return (uint) (key-start); +} + + +/* + Find next/previous record with same key + + WARNING + This can't be used when database is touched after last read +*/ + +int _ma_search_next(register MARIA_HA *info, MARIA_KEY *key, + uint32 nextflag, my_off_t pos) +{ /* Returns 0 if a key was found, -1 on error, else the result of _ma_search() */ + int error; + uchar lastkey[MARIA_MAX_KEY_BUFF]; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_KEY tmp_key; + MARIA_PAGE page; + DBUG_ENTER("_ma_search_next"); + DBUG_PRINT("enter",("nextflag: %u lastpos: %lu int_keypos:%p page_changed %d keyread_buff_used: %d", + nextflag, (ulong) info->cur_row.lastpos, + info->int_keypos, + info->page_changed, 
info->keyread_buff_used)); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key);); + + /* + Force full read if we are at last key or if we are not on a leaf + and the key tree has changed since we used it last time + Note that even if the key tree has changed since last read, we can use + the last read data from the leaf if we haven't used the buffer for + something else. + */ + + if (((nextflag & SEARCH_BIGGER) && info->int_keypos >= info->int_maxpos) || + info->page_changed || + (info->int_keytree_version != keyinfo->version && + (info->int_nod_flag || info->keyread_buff_used))) + DBUG_RETURN(_ma_search(info, key, nextflag | SEARCH_SAVE_BUFF, + pos)); + + if (info->keyread_buff_used) + { /* Key page buffer was used for something else; fetch the last searched page again */ + if (_ma_fetch_keypage(&page, info, keyinfo, info->last_search_keypage, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->keyread_buff, 0)) + DBUG_RETURN(-1); + info->keyread_buff_used=0; + } + else + { + /* Last used buffer is in info->keyread_buff */ + /* Todo: Add info->keyread_page to keep track of this */ + _ma_page_setup(&page, info, keyinfo, 0, info->keyread_buff); + } + + tmp_key.data= lastkey; + tmp_key.keyinfo= keyinfo; + + if (nextflag & SEARCH_BIGGER) /* Next key */ + { + if (page.node) + { + my_off_t tmp_pos= _ma_kpos(page.node, info->int_keypos); + + if ((error= _ma_search(info, key, nextflag | SEARCH_SAVE_BUFF, + tmp_pos)) <=0) + DBUG_RETURN(error); + } + if (keyinfo->flag & (HA_PACK_KEY | HA_BINARY_PACK_KEY) && + info->last_key.data != key->data) + memcpy(info->last_key.data, key->data, + key->data_length + key->ref_length); /* get_key() of a packed key needs the previous key in its buffer */ + if (!(*keyinfo->get_key)(&info->last_key, page.flag, page.node, + &info->int_keypos)) + DBUG_RETURN(-1); + } + else /* Previous key */ + { + /* Find start of previous key */ + info->int_keypos= _ma_get_last_key(&tmp_key, &page, info->int_keypos); + if (!info->int_keypos) + DBUG_RETURN(-1); + if (info->int_keypos == info->keyread_buff + info->s->keypage_header) + { + /* Previous key was first key, read key before this one */ + 
DBUG_RETURN(_ma_search(info, key, nextflag | SEARCH_SAVE_BUFF, + pos)); + } + if (page.node && + (error= _ma_search(info, key, nextflag | SEARCH_SAVE_BUFF, + _ma_kpos(page.node,info->int_keypos))) <= 0) + DBUG_RETURN(error); + + /* QQ: We should be able to optimize away the following call */ + if (! _ma_get_last_key(&info->last_key, &page, info->int_keypos)) + DBUG_RETURN(-1); + } + info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key); + info->cur_row.trid= _ma_trid_from_key(&info->last_key); + DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); +} /* _ma_search_next */ + + +/** + Search after position for the first row in an index + + @param info Maria handler; info->keyread_buff is used as page buffer + @param keyinfo Key definition of the index to search + @param pos Position of the key page to start from + + @return + Found row is stored in info->cur_row.lastpos + @retval 0 ok + @retval -1 error. If pos was HA_OFFSET_ERROR then my_errno is set to + HA_ERR_KEY_NOT_FOUND +*/ + +int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos) +{ + uchar *first_pos; + MARIA_PAGE page; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_search_first"); + + if (pos == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_KEY_NOT_FOUND; + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + + do + { /* Follow the first node pointer of each page until there is none */ + if (_ma_fetch_keypage(&page, info, keyinfo, pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->keyread_buff, 0)) + { + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + first_pos= page.buff + share->keypage_header + page.node; + } while ((pos= _ma_kpos(page.node, first_pos)) != HA_OFFSET_ERROR); + + if (!(*keyinfo->get_key)(&info->last_key, page.flag, page.node, &first_pos)) + DBUG_RETURN(-1); /* Crashed */ + + /* Remember where we are on the page so that _ma_search_next() can continue */ + info->int_keypos= first_pos; + info->int_maxpos= (page.buff + page.size -1); + info->int_nod_flag= page.node; + info->int_keytree_version= keyinfo->version; + info->last_search_keypage= info->last_keypage; + info->page_changed=info->keyread_buff_used=0; + info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key); + info->cur_row.trid= _ma_trid_from_key(&info->last_key); + + DBUG_PRINT("exit",("found key at %lu", (ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); +} /* 
_ma_search_first */ + + +/** + Search after position for the last row in an index + + @return + Found row is stored in info->cur_row.lastpos +*/ + +int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos) +{ + uchar *end_of_page; + MARIA_PAGE page; + DBUG_ENTER("_ma_search_last"); + + if (pos == HA_OFFSET_ERROR) + { + my_errno=HA_ERR_KEY_NOT_FOUND; /* Didn't find key */ + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + + do + { + if (_ma_fetch_keypage(&page, info, keyinfo, pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->keyread_buff, 0)) + { + info->cur_row.lastpos= HA_OFFSET_ERROR; + DBUG_RETURN(-1); + } + end_of_page= page.buff + page.size; + } while ((pos= _ma_kpos(page.node, end_of_page)) != HA_OFFSET_ERROR); + + if (!_ma_get_last_key(&info->last_key, &page, end_of_page)) + DBUG_RETURN(-1); + info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key); + info->cur_row.trid= _ma_trid_from_key(&info->last_key); + info->int_keypos= info->int_maxpos= end_of_page; + info->int_nod_flag= page.node; + info->int_keytree_version= keyinfo->version; + info->last_search_keypage= info->last_keypage; + info->page_changed=info->keyread_buff_used=0; + + DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos)); + DBUG_RETURN(0); +} /* _ma_search_last */ + + + +/**************************************************************************** +** +** Functions to store and pack a key in a page +** +** maria_calc_xx_key_length takes the following arguments: +** nod_flag If nod: Length of nod-pointer +** next_key Position to pos after the new key in buffer +** org_key Key that was before the next key in buffer +** prev_key Last key before current key +** key Key that will be stored +** s_temp Information how next key will be packed +****************************************************************************/ + +/* Static length key */ + +int +_ma_calc_static_key_length(const MARIA_KEY *key, uint nod_flag, + uchar *next_pos 
__attribute__((unused)), + uchar *org_key __attribute__((unused)), + uchar *prev_key __attribute__((unused)), + MARIA_KEY_PARAM *s_temp) +{ + s_temp->key= key->data; + return (int) (s_temp->move_length= key->data_length + key->ref_length + + nod_flag); +} + +/* Variable length key */ + +int +_ma_calc_var_key_length(const MARIA_KEY *key, uint nod_flag, + uchar *next_pos __attribute__((unused)), + uchar *org_key __attribute__((unused)), + uchar *prev_key __attribute__((unused)), + MARIA_KEY_PARAM *s_temp) +{ + s_temp->key= key->data; + return (int) (s_temp->move_length= key->data_length + key->ref_length + + nod_flag); +} + +/** + @brief Calc length needed to store prefixed compressed keys + + @info + Variable length first segment which is prefix compressed + (maria_chk reports 'packed + stripped') + + Keys are compressed the following way: + + If the max length of first key segment <= 127 bytes the prefix is + 1 uchar else it's 2 byte + + prefix byte(s) The high bit is set if this is a prefix for the prev key + length Packed length if the previous was a prefix byte + [data_length] data bytes ('length' bytes) + next-key-seg Next key segments + + If the first segment can have NULL: + If key was packed + data_length is length of rest of key + If key was not packed + The data_length is 0 for NULLS and 1+data_length for not null columns +*/ + +int +_ma_calc_var_pack_key_length(const MARIA_KEY *int_key, uint nod_flag, + uchar *next_key, uchar *org_key, uchar *prev_key, + MARIA_KEY_PARAM *s_temp) +{ + reg1 HA_KEYSEG *keyseg; + int length; + uint key_length,ref_length,org_key_length=0, + length_pack,new_key_length,diff_flag,pack_marker; + const uchar *key, *start, *end, *key_end; + const uchar *sort_order; + my_bool same_length; + MARIA_KEYDEF *keyinfo= int_key->keyinfo; + + key= int_key->data; + length_pack=s_temp->ref_length=s_temp->n_ref_length=s_temp->n_length=0; + same_length=0; keyseg=keyinfo->seg; + key_length= int_key->data_length + int_key->ref_length + nod_flag; + 
+ sort_order=0; + if ((keyinfo->flag & HA_FULLTEXT) && + ((keyseg->type == HA_KEYTYPE_TEXT) || + (keyseg->type == HA_KEYTYPE_VARTEXT1) || + (keyseg->type == HA_KEYTYPE_VARTEXT2)) && + !use_strnxfrm(keyseg->charset)) + sort_order= keyseg->charset->sort_order; + + /* diff flag contains how many bytes is needed to pack key */ + if (keyseg->length >= 127) + { + diff_flag=2; + pack_marker=32768; + } + else + { + diff_flag= 1; + pack_marker=128; + } + s_temp->pack_marker=pack_marker; + + /* Handle the case that the first part have NULL values */ + if (keyseg->flag & HA_NULL_PART) + { + if (!*key++) + { + s_temp->key= key; + s_temp->key_length= 0; + s_temp->totlength= key_length-1+diff_flag; + s_temp->next_key_pos= 0; /* No next key */ + return (s_temp->move_length= s_temp->totlength); + } + s_temp->store_not_null=1; + key_length--; /* We don't store NULL */ + if (prev_key && !*prev_key++) + org_key=prev_key=0; /* Can't pack against prev */ + else if (org_key) + org_key++; /* Skip NULL */ + } + else + s_temp->store_not_null=0; + s_temp->prev_key= org_key; + + /* The key part will start with a packed length */ + + get_key_pack_length(new_key_length,length_pack,key); + end= key_end= key+ new_key_length; + start= key; + + /* Calc how many characters are identical between this and the prev. 
key */ + if (prev_key) + { + get_key_length(org_key_length,prev_key); + s_temp->prev_key=prev_key; /* Pointer at data */ + /* Don't use key-pack if length == 0 */ + if (new_key_length && new_key_length == org_key_length) + same_length=1; + else if (new_key_length > org_key_length) + end= key + org_key_length; + + if (sort_order) /* SerG */ + { + while (key < end && + sort_order[*key] == sort_order[*prev_key]) + { + key++; prev_key++; + } + } + else + { + while (key < end && *key == *prev_key) + { + key++; prev_key++; + } + } + } + + s_temp->key=key; + s_temp->key_length= (uint) (key_end-key); + + if (same_length && key == key_end) + { + /* identical variable length key */ + s_temp->ref_length= pack_marker; + length=(int) key_length-(int) (key_end-start)-length_pack; + length+= diff_flag; + if (next_key) + { /* Can't combine with next */ + s_temp->n_length= *next_key; /* Needed by _ma_store_key */ + next_key=0; + } + } + else + { + if (start != key) + { /* Starts as prev key */ + ref_length= (uint) (key-start); + s_temp->ref_length= ref_length + pack_marker; + length= (int) (key_length - ref_length); + + length-= length_pack; + length+= diff_flag; + length+= ((new_key_length-ref_length) >= 255) ? 
3 : 1;/* Rest_of_key */ + } + else + { + s_temp->key_length+=s_temp->store_not_null; /* If null */ + length= key_length - length_pack+ diff_flag; + } + } + s_temp->totlength=(uint) length; + s_temp->prev_length=0; + DBUG_PRINT("test",("tot_length: %u length: %d uniq_key_length: %u", + key_length, length, s_temp->key_length)); + + /* If something after that hasn't length=0, test if we can combine */ + if ((s_temp->next_key_pos=next_key)) + { + uint packed,n_length; + + packed = *next_key & 128; + if (diff_flag == 2) + { + n_length= mi_uint2korr(next_key) & 32767; /* Length of next key */ + next_key+=2; + } + else + n_length= *next_key++ & 127; + if (!packed && n_length) + n_length-= s_temp->store_not_null; + + if (n_length || packed) /* Don't pack 0 length keys */ + { + uint next_length_pack, new_ref_length=s_temp->ref_length; + + if (packed) + { + /* If first key and next key is packed (only on delete) */ + if (!prev_key && org_key) + { + get_key_length(org_key_length,org_key); + key=start; + if (sort_order) /* SerG */ + { + while (key < end && + sort_order[*key] == sort_order[*org_key]) + { + key++; org_key++; + } + } + else + { + while (key < end && *key == *org_key) + { + key++; org_key++; + } + } + if ((new_ref_length= (uint) (key - start))) + new_ref_length+=pack_marker; + } + + if (!n_length) + { + /* + We put a different key between two identical variable length keys + Extend next key to have same prefix as this key + */ + if (new_ref_length) /* prefix of previus key */ + { /* make next key longer */ + s_temp->part_of_prev_key= new_ref_length; + s_temp->prev_length= org_key_length - + (new_ref_length-pack_marker); + s_temp->n_ref_length= s_temp->part_of_prev_key; + s_temp->n_length= s_temp->prev_length; + n_length= get_pack_length(s_temp->prev_length); + s_temp->prev_key+= (new_ref_length - pack_marker); + length+= s_temp->prev_length + n_length; + } + else + { /* Can't use prev key */ + s_temp->part_of_prev_key=0; + s_temp->prev_length= org_key_length; + 
s_temp->n_ref_length=s_temp->n_length= org_key_length; + length+= org_key_length; + } + return (s_temp->move_length= (int) length); + } + + ref_length=n_length; + /* Get information about not packed key suffix */ + get_key_pack_length(n_length,next_length_pack,next_key); + + /* Test if new keys has fewer characters that match the previous key */ + if (!new_ref_length) + { /* Can't use prev key */ + s_temp->part_of_prev_key= 0; + s_temp->prev_length= ref_length; + s_temp->n_ref_length= s_temp->n_length= n_length+ref_length; + return s_temp->move_length= ((int) length+ref_length- + next_length_pack); + } + if (ref_length+pack_marker > new_ref_length) + { + uint new_pack_length=new_ref_length-pack_marker; + /* We must copy characters from the original key to the next key */ + s_temp->part_of_prev_key= new_ref_length; + s_temp->prev_length= ref_length - new_pack_length; + s_temp->n_ref_length=s_temp->n_length=n_length + s_temp->prev_length; + s_temp->prev_key+= new_pack_length; + length-= (next_length_pack - get_pack_length(s_temp->n_length)); + return s_temp->move_length= ((int) length + s_temp->prev_length); + } + } + else + { + /* Next key wasn't a prefix of previous key */ + ref_length=0; + next_length_pack=0; + } + DBUG_PRINT("test",("length: %d next_key: %p", length, + next_key)); + + { + uint tmp_length; + key=(start+=ref_length); + if (key+n_length < key_end) /* Normalize length based */ + key_end= key+n_length; + if (sort_order) /* SerG */ + { + while (key < key_end && + sort_order[*key] == sort_order[*next_key]) + { + key++; next_key++; + } + } + else + { + while (key < key_end && *key == *next_key) + { + key++; next_key++; + } + } + if (!(tmp_length=(uint) (key-start))) + { /* Key can't be re-packed */ + s_temp->next_key_pos=0; + return (s_temp->move_length= length); + } + ref_length+=tmp_length; + n_length-=tmp_length; + length-=tmp_length+next_length_pack; /* We gained these chars */ + } + if (n_length == 0 && ref_length == new_key_length) + { + 
s_temp->n_ref_length=pack_marker; /* Same as prev key */ + } + else + { + s_temp->n_ref_length=ref_length | pack_marker; + length+= get_pack_length(n_length); + s_temp->n_length=n_length; + } + } + } + return (s_temp->move_length= length); +} + + +/* Length of key which is prefix compressed */ + +int _ma_calc_bin_pack_key_length(const MARIA_KEY *int_key, + uint nod_flag, + uchar *next_key, + uchar *org_key, uchar *prev_key, + MARIA_KEY_PARAM *s_temp) +{ + uint length,key_length,ref_length; + const uchar *key= int_key->data; + + s_temp->totlength= key_length= (int_key->data_length + int_key->ref_length+ + nod_flag); +#ifdef HAVE_valgrind + s_temp->n_length= s_temp->n_ref_length=0; /* For valgrind */ +#endif + s_temp->key=key; + s_temp->prev_key=org_key; + if (prev_key) /* If not first key in block */ + { + /* pack key against previous key */ + /* + As keys may be identical when running a sort in maria_chk, we + have to guard against the case where keys may be identical + */ + const uchar *end; + end=key+key_length; + for ( ; *key == *prev_key && key < end; key++,prev_key++) ; + s_temp->ref_length= ref_length=(uint) (key-s_temp->key); + length=key_length - ref_length + get_pack_length(ref_length); + } + else + { + /* No previous key */ + s_temp->ref_length=ref_length=0; + length=key_length+1; + } + if ((s_temp->next_key_pos=next_key)) /* If another key after */ + { + /* pack key against next key */ + uint next_length,next_length_pack; + get_key_pack_length(next_length,next_length_pack,next_key); + + /* If first key and next key is packed (only on delete) */ + if (!prev_key && org_key && next_length) + { + const uchar *end; + for (key= s_temp->key, end=key+next_length ; + *key == *org_key && key < end; + key++,org_key++) ; + ref_length= (uint) (key - s_temp->key); + } + + if (next_length > ref_length) + { + /* + We put a key with different case between two keys with the same prefix + Extend next key to have same prefix as this key + */ + s_temp->n_ref_length= 
ref_length; + s_temp->prev_length= next_length-ref_length; + s_temp->prev_key+= ref_length; + return s_temp->move_length= ((int) (length+ s_temp->prev_length - + next_length_pack + + get_pack_length(ref_length))); + } + /* Check how many characters are identical to next key */ + key= s_temp->key+next_length; + s_temp->prev_length= 0; + while (*key++ == *next_key++) ; + if ((ref_length= (uint) (key - s_temp->key)-1) == next_length) + { + s_temp->next_key_pos=0; + return (s_temp->move_length= length); /* Can't pack next key */ + } + s_temp->n_ref_length=ref_length; + return s_temp->move_length= (int) (length-(ref_length - next_length) - + next_length_pack + + get_pack_length(ref_length)); + } + return (s_temp->move_length= (int) length); +} + + +/* +** store a key packed with _ma_calc_xxx_key_length in page-buffert +*/ + +/* store key without compression */ + +void _ma_store_static_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), + register uchar *key_pos, + register MARIA_KEY_PARAM *s_temp) +{ + memcpy(key_pos, s_temp->key,(size_t) s_temp->move_length); + s_temp->changed_length= s_temp->move_length; +} + + +/* store variable length key with prefix compression */ + +#define store_pack_length(test,pos,length) { \ + if (test) { *((pos)++) = (uchar) (length); } else \ + { *((pos)++) = (uchar) ((length) >> 8); *((pos)++) = (uchar) (length); } } + + +void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), + register uchar *key_pos, + register MARIA_KEY_PARAM *s_temp) +{ + uint length; + uchar *org_key_pos= key_pos; + + if (s_temp->ref_length) + { + /* Packed against previous key */ + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->ref_length); + /* If not same key after */ + if (s_temp->ref_length != s_temp->pack_marker) + store_key_length_inc(key_pos,s_temp->key_length); + } + else + { + /* Not packed against previous key */ + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->key_length); + } + bmove(key_pos, s_temp->key, + 
(length= s_temp->totlength - (uint) (key_pos-org_key_pos))); + + key_pos+= length; + + if (!s_temp->next_key_pos) /* No following key */ + goto end; + + if (s_temp->prev_length) + { + /* Extend next key because new key didn't have same prefix as prev key */ + if (s_temp->part_of_prev_key) + { + store_pack_length(s_temp->pack_marker == 128,key_pos, + s_temp->part_of_prev_key); + store_key_length_inc(key_pos,s_temp->n_length); + } + else + { + s_temp->n_length+= s_temp->store_not_null; + store_pack_length(s_temp->pack_marker == 128,key_pos, + s_temp->n_length); + } + memcpy(key_pos, s_temp->prev_key, s_temp->prev_length); + key_pos+= s_temp->prev_length; + } + else if (s_temp->n_ref_length) + { + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_ref_length); + if (s_temp->n_ref_length != s_temp->pack_marker) + { + /* Not identical key */ + store_key_length_inc(key_pos,s_temp->n_length); + } + } + else if (s_temp->n_length) + { + s_temp->n_length+= s_temp->store_not_null; + store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_length); + } + +end: + s_temp->changed_length= (uint) (key_pos - org_key_pos); +} + + +/* variable length key with prefix compression */ + +void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), + register uchar *key_pos, + register MARIA_KEY_PARAM *s_temp) +{ + uchar *org_key_pos= key_pos; + size_t length= s_temp->totlength - s_temp->ref_length; + + store_key_length_inc(key_pos,s_temp->ref_length); + memcpy(key_pos, s_temp->key+s_temp->ref_length, length); + key_pos+= length; + + if (s_temp->next_key_pos) + { + store_key_length_inc(key_pos,s_temp->n_ref_length); + if (s_temp->prev_length) /* If we must extend key */ + { + memcpy(key_pos,s_temp->prev_key,s_temp->prev_length); + key_pos+= s_temp->prev_length; + } + } + s_temp->changed_length= (uint) (key_pos - org_key_pos); +} diff --git a/storage/maria/ma_servicethread.c b/storage/maria/ma_servicethread.c new file mode 100644 index 00000000..f5af1725 --- 
/dev/null +++ b/storage/maria/ma_servicethread.c @@ -0,0 +1,123 @@ +/* + Copyright (c) 2009, 2011, Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "ma_servicethread.h" + +/** + Initializes the service thread + + @param control control block + + @return Operation status + @retval 0 OK + @retval 1 error +*/ + +int ma_service_thread_control_init(MA_SERVICE_THREAD_CONTROL *control) +{ + int res= 0; + DBUG_ENTER("ma_service_thread_control_init"); + DBUG_PRINT("init", ("control %p", control)); + control->inited= TRUE; + control->killed= FALSE; + res= (mysql_mutex_init(key_SERVICE_THREAD_CONTROL_lock, + control->LOCK_control, MY_MUTEX_INIT_SLOW) || + mysql_cond_init(key_SERVICE_THREAD_CONTROL_cond, + control->COND_control, 0)); + DBUG_PRINT("info", ("init: %s", (res ? "Error" : "OK"))); + DBUG_RETURN(res); +} + + +/** + Kill the service thread + + @param control control block + + @note The service thread should react on condition and status equal + THREAD_DYING, by setting status THREAD_DEAD, and issuing message to + control thread via condition and exiting. 
The base way to do so is using + my_service_thread_sleep() and my_service_thread_signal_end() +*/ + +void ma_service_thread_control_end(MA_SERVICE_THREAD_CONTROL *control) +{ + DBUG_ENTER("ma_service_thread_control_end"); + DBUG_PRINT("init", ("control %p", control)); + DBUG_ASSERT(control->inited); + mysql_mutex_lock(control->LOCK_control); + if (!control->killed) + { + DBUG_PRINT("info",("killing Maria background thread")); + control->killed= TRUE; /* kill it */ + mysql_cond_broadcast(control->COND_control); + mysql_mutex_unlock(control->LOCK_control); + DBUG_PRINT("info", ("waiting for Maria background thread to die")); + pthread_join(control->thread, NULL); + } + else + mysql_mutex_unlock(control->LOCK_control); + mysql_mutex_destroy(control->LOCK_control); + mysql_cond_destroy(control->COND_control); + control->inited= FALSE; + DBUG_VOID_RETURN; +} + + +/** + Sleep for given number of nanoseconds with reaction on thread kill + + @param control control block + @param sleep_time time of sleeping + + @return Operation status + @retval FALSE Time out + @retval TRUE Thread should be killed +*/ + +my_bool my_service_thread_sleep(MA_SERVICE_THREAD_CONTROL *control, + ulonglong sleep_time) +{ + struct timespec abstime; + my_bool res= FALSE; + DBUG_ENTER("my_service_thread_sleep"); + DBUG_PRINT("init", ("control %p", control)); + mysql_mutex_lock(control->LOCK_control); + if (control->killed) + { + mysql_mutex_unlock(control->LOCK_control); + DBUG_RETURN(TRUE); + } +#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */ + mysql_mutex_unlock(&control->LOCK_control); + my_sleep(100000); /* a tenth of a second */ + mysql_mutex_lock(&control->LOCK_control); +#else + /* To have a killable sleep, we use timedwait like our SQL GET_LOCK() */ + DBUG_PRINT("info", ("sleeping %llu nano seconds", sleep_time)); + if (sleep_time) + { + set_timespec_nsec(abstime, sleep_time); + mysql_cond_timedwait(control->COND_control, + control->LOCK_control, &abstime); + } 
+#endif + if (control->killed) + res= TRUE; + mysql_mutex_unlock(control->LOCK_control); + DBUG_RETURN(res); +} diff --git a/storage/maria/ma_servicethread.h b/storage/maria/ma_servicethread.h new file mode 100644 index 00000000..a04a71fd --- /dev/null +++ b/storage/maria/ma_servicethread.h @@ -0,0 +1,35 @@ +/* + Copyright (c) 2009, 2011, Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include <my_pthread.h> + +typedef struct st_ma_service_thread_control +{ + pthread_t thread; + my_bool killed; + /** if thread module was inited or not */ + my_bool inited; + /** for killing the background thread */ + mysql_mutex_t *LOCK_control; + /** for killing the background thread */ + mysql_cond_t *COND_control; +} MA_SERVICE_THREAD_CONTROL; + + +int ma_service_thread_control_init(MA_SERVICE_THREAD_CONTROL *control); +void ma_service_thread_control_end(MA_SERVICE_THREAD_CONTROL *control); +my_bool my_service_thread_sleep(MA_SERVICE_THREAD_CONTROL *control, + ulonglong sleep_time); diff --git a/storage/maria/ma_sort.c b/storage/maria/ma_sort.c new file mode 100644 index 00000000..de6f5b8b --- /dev/null +++ b/storage/maria/ma_sort.c @@ -0,0 +1,1185 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published 
by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Creates a index for a database by reading keys, sorting them and outputing + them in sorted order through MARIA_SORT_INFO functions. +*/ + +#include "ma_fulltext.h" +#include <my_check_opt.h> +#if defined(_WIN32) +#include <fcntl.h> +#else +#include <stddef.h> +#endif +#include <queues.h> + +/* static variables */ + +#undef DISK_BUFFER_SIZE + +#define MERGEBUFF 15 +#define MERGEBUFF2 31 +#define DISK_BUFFER_SIZE (IO_SIZE*128) + +/* How many keys we can keep in memory */ +typedef ulonglong ha_keys; + +/* + Pointers of functions for store and read keys from temp file +*/ + +extern void print_error(const char *fmt,...); + +/* Functions defined in this file */ + +static ha_rows find_all_keys(MARIA_SORT_PARAM *info, ha_keys keys, + uchar **sort_keys, + DYNAMIC_ARRAY *buffpek,size_t *maxbuffer, + IO_CACHE *tempfile, + IO_CACHE *tempfile_for_exceptions); +static int write_keys(MARIA_SORT_PARAM *info,uchar **sort_keys, + ha_keys count, BUFFPEK *buffpek,IO_CACHE *tempfile); +static int write_key(MARIA_SORT_PARAM *info, uchar *key, + IO_CACHE *tempfile); +static int write_index(MARIA_SORT_PARAM *info, uchar **sort_keys, + ha_keys count); +static int merge_many_buff(MARIA_SORT_PARAM *info, ha_keys keys, + uchar **sort_keys, + BUFFPEK *buffpek, size_t *maxbuffer, + IO_CACHE *t_file); +static my_off_t read_to_buffer(IO_CACHE *fromfile,BUFFPEK *buffpek, + uint sort_length); +static int merge_buffers(MARIA_SORT_PARAM *info, ha_keys keys, + IO_CACHE *from_file, 
IO_CACHE *to_file, + uchar **sort_keys, BUFFPEK *lastbuff, + BUFFPEK *Fb, BUFFPEK *Tb); +static int merge_index(MARIA_SORT_PARAM *,ha_keys,uchar **,BUFFPEK *, size_t, + IO_CACHE *); +static int flush_maria_ft_buf(MARIA_SORT_PARAM *info); + +static int write_keys_varlen(MARIA_SORT_PARAM *info,uchar **sort_keys, + ha_keys count, BUFFPEK *buffpek, + IO_CACHE *tempfile); +static my_off_t read_to_buffer_varlen(IO_CACHE *fromfile,BUFFPEK *buffpek, + uint sort_length); +static int write_merge_key(MARIA_SORT_PARAM *info, IO_CACHE *to_file, + uchar *key, uint sort_length, ha_keys count); +static int write_merge_key_varlen(MARIA_SORT_PARAM *info, + IO_CACHE *to_file, + uchar* key, uint sort_length, + ha_keys count); +static inline int +my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs); + +/* + Sets the appropriate read and write methods for the MARIA_SORT_PARAM + based on the variable length key flag. +*/ +static void set_sort_param_read_write(MARIA_SORT_PARAM *sort_param) +{ + if (sort_param->keyinfo->flag & HA_VAR_LENGTH_KEY) + { + sort_param->write_keys= write_keys_varlen; + sort_param->read_to_buffer= read_to_buffer_varlen; + sort_param->write_key= write_merge_key_varlen; + } + else + { + sort_param->write_keys= write_keys; + sort_param->read_to_buffer= read_to_buffer; + sort_param->write_key= write_merge_key; + } +} + + +/* + Creates a index of sorted keys + + SYNOPSIS + _ma_create_index_by_sort() + info Sort parameters + no_messages Set to 1 if no output + sortbuff_size Size of sortbuffer to allocate + + RESULT + 0 ok + <> 0 Error +*/ + +int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages, + size_t sortbuff_size) +{ + int error; + uint sort_length; + size_t memavl, old_memavl, maxbuffer; + DYNAMIC_ARRAY buffpek; + ha_rows records, UNINIT_VAR(keys); + uchar **sort_keys; + IO_CACHE tempfile, tempfile_for_exceptions; + DBUG_ENTER("_ma_create_index_by_sort"); + DBUG_PRINT("enter",("sort_buff_size: %lu sort_length: %d max_records: 
%lu", + (ulong) sortbuff_size, info->key_length, + (ulong) info->sort_info->max_records)); + + set_sort_param_read_write(info); + + my_b_clear(&tempfile); + my_b_clear(&tempfile_for_exceptions); + bzero((char*) &buffpek,sizeof(buffpek)); + sort_keys= (uchar **) NULL; error= 1; + maxbuffer=1; + + memavl=MY_MAX(sortbuff_size,MARIA_MIN_SORT_MEMORY); + records= info->sort_info->max_records; + sort_length= info->key_length; + + while (memavl >= MARIA_MIN_SORT_MEMORY) + { + /* Check if we can fit all keys into memory */ + if (((ulonglong) (records + 1) * + (sort_length + sizeof(char*)) <= memavl)) + keys= records+1; + else if ((info->sort_info->param->testflag & + (T_FORCE_SORT_MEMORY | T_CREATE_MISSING_KEYS)) == + T_FORCE_SORT_MEMORY) + { + /* + Use all of the given sort buffer for key data. + Allocate 1000 buffers at a start for new data. More buffers + will be allocated when needed. + */ + keys= memavl / (sort_length+sizeof(char*)); + maxbuffer= (size_t) MY_MIN((ulonglong) 1000, (records / keys)+1); + } + else + { + /* + All keys can't fit in memory. + Calculate how many keys + buffers we can keep in memory + */ + size_t maxbuffer_org; + do + { + maxbuffer_org= maxbuffer; + if (memavl < sizeof(BUFFPEK) * maxbuffer || + (keys= (memavl-sizeof(BUFFPEK)*maxbuffer)/ + (sort_length+sizeof(char*))) <= 1) + { + _ma_check_print_error(info->sort_info->param, + "aria_sort_buffer_size is too small. Current aria_sort_buffer_size: %llu rows: %llu sort_length: %u", + (ulonglong) sortbuff_size, (ulonglong) records, + sort_length); + my_errno= ENOMEM; + goto err; + } + if (keys < maxbuffer) + { + /* + There must be sufficient memory for at least one key per BUFFPEK, + otherwise repair by sort/parallel repair cannot operate. 
+ */ + maxbuffer= (uint) keys; + break; + } + } + while ((maxbuffer= (size_t) (records/(keys-1)+1)) != maxbuffer_org); + } + + if ((sort_keys= ((uchar**) + my_malloc(PSI_INSTRUMENT_ME, (size_t) (keys*(sort_length+sizeof(char*))+ + HA_FT_MAXBYTELEN), + MYF(0))))) + { + if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &buffpek, sizeof(BUFFPEK), + maxbuffer, MY_MIN(maxbuffer/2, 1000), MYF(0))) + { + my_free(sort_keys); + sort_keys= 0; + } + else + break; + } + old_memavl=memavl; + if ((memavl=memavl/4*3) < MARIA_MIN_SORT_MEMORY && old_memavl > MARIA_MIN_SORT_MEMORY) + memavl=MARIA_MIN_SORT_MEMORY; + } + if (memavl < MARIA_MIN_SORT_MEMORY) + { + /* purecov: begin inspected */ + _ma_check_print_error(info->sort_info->param, + "aria_sort_buffer_size is too small. Current aria_sort_buffer_size: %llu rows: %llu sort_length: %u", + (ulonglong) sortbuff_size, (ulonglong) records, sort_length); + my_errno= ENOMEM; + goto err; + /* purecov: end inspected */ + } + (*info->lock_in_memory)(info->sort_info->param);/* Everything is allocated */ + + if (!no_messages) + my_fprintf(stdout, + " - Searching for keys, allocating buffer for %llu keys\n", + (ulonglong) keys); + + if ((records=find_all_keys(info,keys,sort_keys,&buffpek,&maxbuffer, + &tempfile,&tempfile_for_exceptions)) + == HA_POS_ERROR) + goto err; /* purecov: tested */ + + if (maxbuffer >= keys) + { + /* + merge_many_buff will crash if maxbuffer > keys as then we cannot store in memory + the keys for each buffer. 
+ */ + keys= maxbuffer + 1; + if (!(sort_keys= ((uchar **) + my_realloc(PSI_INSTRUMENT_ME, sort_keys, + (size_t) (keys*(sort_length+sizeof(char*))+ + HA_FT_MAXBYTELEN), MYF(MY_FREE_ON_ERROR))))) + goto err; + } + + info->sort_info->param->stage++; /* Merge stage */ + + if (maxbuffer == 0) + { + if (!no_messages) + my_fprintf(stdout, " - Dumping %llu keys\n", (ulonglong) records); + if (write_index(info, sort_keys, (ha_keys) records)) + goto err; /* purecov: inspected */ + } + else + { + keys=(keys*(sort_length+sizeof(char*)))/sort_length; + if (maxbuffer >= MERGEBUFF2) + { + if (!no_messages) + my_fprintf(stdout, " - Merging %llu keys\n", + (ulonglong) records); /* purecov: tested */ + if (merge_many_buff(info,keys,sort_keys, + dynamic_element(&buffpek,0,BUFFPEK *),&maxbuffer,&tempfile)) + goto err; /* purecov: inspected */ + } + if (flush_io_cache(&tempfile) || + reinit_io_cache(&tempfile,READ_CACHE,0L,0,0)) + goto err; /* purecov: inspected */ + if (!no_messages) + printf(" - Last merge and dumping keys\n"); /* purecov: tested */ + if (merge_index(info,keys,sort_keys,dynamic_element(&buffpek,0,BUFFPEK *), + maxbuffer,&tempfile)) + goto err; /* purecov: inspected */ + } + + if (flush_maria_ft_buf(info) || _ma_flush_pending_blocks(info)) + goto err; + + if (my_b_inited(&tempfile_for_exceptions)) + { + MARIA_HA *idx=info->sort_info->info; + uint16 key_length; + MARIA_KEY key; + key.keyinfo= idx->s->keyinfo + info->key; + + if (!no_messages) + printf(" - Adding exceptions\n"); /* purecov: tested */ + if (flush_io_cache(&tempfile_for_exceptions) || + reinit_io_cache(&tempfile_for_exceptions,READ_CACHE,0L,0,0)) + goto err; + + while (!my_b_read(&tempfile_for_exceptions,(uchar*)&key_length, + sizeof(key_length)) + && !my_b_read(&tempfile_for_exceptions,(uchar*)sort_keys, + (uint) key_length)) + { + key.data= (uchar*) sort_keys; + key.ref_length= idx->s->rec_reflength; + key.data_length= key_length - key.ref_length; + key.flag= 0; + if (_ma_ck_write(idx, &key)) + goto 
err; + } + } + + error =0; + +err: + my_free(sort_keys); + delete_dynamic(&buffpek); + close_cached_file(&tempfile); + close_cached_file(&tempfile_for_exceptions); + + DBUG_RETURN(error ? -1 : 0); +} /* _ma_create_index_by_sort */ + + +/* Search after all keys and place them in a temp. file */ + +static ha_rows find_all_keys(MARIA_SORT_PARAM *info, ha_rows keys, + uchar **sort_keys, DYNAMIC_ARRAY *buffpek, + size_t *maxbuffer, IO_CACHE *tempfile, + IO_CACHE *tempfile_for_exceptions) +{ + int error; + ha_rows idx; + DBUG_ENTER("find_all_keys"); + + idx=error=0; + sort_keys[0]= (uchar*) (sort_keys+keys); + + info->sort_info->info->in_check_table= 1; + while (!(error=(*info->key_read)(info,sort_keys[idx]))) + { + if (info->real_key_length > info->key_length) + { + if (write_key(info,sort_keys[idx],tempfile_for_exceptions)) + goto err; /* purecov: inspected */ + continue; + } + + if (++idx == keys) + { + if (info->write_keys(info,sort_keys,idx-1, + (BUFFPEK *)alloc_dynamic(buffpek), + tempfile)) + goto err; /* purecov: inspected */ + + sort_keys[0]=(uchar*) (sort_keys+keys); + memcpy(sort_keys[0],sort_keys[idx-1],(size_t) info->key_length); + idx=1; + } + sort_keys[idx]=sort_keys[idx-1]+info->key_length; + } + if (error > 0) + goto err; /* purecov: inspected */ + if (buffpek->elements) + { + if (info->write_keys(info,sort_keys,idx,(BUFFPEK *)alloc_dynamic(buffpek), + tempfile)) + goto err; /* purecov: inspected */ + *maxbuffer=buffpek->elements-1; + } + else + *maxbuffer=0; + + info->sort_info->info->in_check_table= 0; + DBUG_RETURN((*maxbuffer)*(keys-1)+idx); + +err: + info->sort_info->info->in_check_table= 0; /* purecov: inspected */ + DBUG_RETURN(HA_POS_ERROR); /* purecov: inspected */ +} /* find_all_keys */ + + +static my_bool _ma_thr_find_all_keys_exec(MARIA_SORT_PARAM* sort_param) +{ + int error= 0; + ulonglong memavl, old_memavl; + longlong sortbuff_size; + ha_keys UNINIT_VAR(keys), idx; + uint sort_length; + size_t maxbuffer; + uchar **sort_keys= NULL; + 
DBUG_ENTER("_ma_thr_find_all_keys_exec"); + DBUG_PRINT("enter", ("master: %d", sort_param->master)); + + if (sort_param->sort_info->got_error) + DBUG_RETURN(TRUE); + + set_sort_param_read_write(sort_param); + + my_b_clear(&sort_param->tempfile); + my_b_clear(&sort_param->tempfile_for_exceptions); + bzero((char*) &sort_param->buffpek, sizeof(sort_param->buffpek)); + bzero((char*) &sort_param->unique, sizeof(sort_param->unique)); + + sortbuff_size= sort_param->sortbuff_size; + memavl= MY_MAX(sortbuff_size, MARIA_MIN_SORT_MEMORY); + idx= (ha_keys) sort_param->sort_info->max_records; + sort_length= sort_param->key_length; + maxbuffer= 1; + + while (memavl >= MARIA_MIN_SORT_MEMORY) + { + if ((my_off_t) (idx+1)*(sort_length+sizeof(char*)) <= (my_off_t) memavl) + keys= idx+1; + else if ((sort_param->sort_info->param->testflag & + (T_FORCE_SORT_MEMORY | T_CREATE_MISSING_KEYS)) == + T_FORCE_SORT_MEMORY) + { + /* + Use all of the given sort buffer for key data. + Allocate 1000 buffers at a start for new data. More buffers + will be allocated when needed. + */ + keys= memavl / (sort_length+sizeof(char*)); + maxbuffer= (size_t) MY_MIN((ulonglong) 1000, (idx / keys)+1); + } + else + { + size_t maxbuffer_org; + do + { + maxbuffer_org= maxbuffer; + if (memavl < sizeof(BUFFPEK)*maxbuffer || + (keys=(memavl-sizeof(BUFFPEK)*maxbuffer)/ + (sort_length+sizeof(char*))) <= 1 || + keys < maxbuffer) + { + _ma_check_print_error(sort_param->sort_info->param, + "aria_sort_buffer_size is too small. Current aria_sort_buffer_size: %llu rows: %llu sort_length: %u", + sortbuff_size, (ulonglong) idx, sort_length); + goto err; + } + } + while ((maxbuffer= (uint) (idx/(keys-1)+1)) != maxbuffer_org); + } + if ((sort_keys= (uchar **) + my_malloc(PSI_INSTRUMENT_ME, (size_t)(keys*(sort_length+sizeof(char*))+ + ((sort_param->keyinfo->flag & HA_FULLTEXT) ? 
+ HA_FT_MAXBYTELEN : 0)), MYF(0)))) + { + if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &sort_param->buffpek, sizeof(BUFFPEK), + maxbuffer, MY_MIN(maxbuffer / 2, 1000), MYF(0))) + { + my_free(sort_keys); + sort_keys= NULL; /* Safety against double free on error. */ + } + else + break; + } + old_memavl= memavl; + if ((memavl= memavl/4*3) < MARIA_MIN_SORT_MEMORY && + old_memavl > MARIA_MIN_SORT_MEMORY) + memavl= MARIA_MIN_SORT_MEMORY; + } + if (memavl < MARIA_MIN_SORT_MEMORY) + { + /* purecov: begin inspected */ + _ma_check_print_error(sort_param->sort_info->param, + "aria_sort_buffer_size is too small. Current aria_sort_buffer_size: %llu rows: %llu sort_length: %u", + sortbuff_size, (ulonglong) idx, sort_length); + my_errno= ENOMEM; + goto err; + /* purecov: end inspected */ + } + + if (sort_param->sort_info->param->testflag & T_VERBOSE) + my_fprintf(stdout, + "Key %d - Allocating buffer for %llu keys\n", + sort_param->key + 1, (ulonglong) keys); + sort_param->sort_keys= sort_keys; + + idx= error= 0; + sort_keys[0]= (uchar*) (sort_keys+keys); + + DBUG_PRINT("info", ("reading keys")); + while (!(error= sort_param->sort_info->got_error) && + !(error= (*sort_param->key_read)(sort_param, sort_keys[idx]))) + { + if (sort_param->real_key_length > sort_param->key_length) + { + if (write_key(sort_param, sort_keys[idx], + &sort_param->tempfile_for_exceptions)) + goto err; + continue; + } + + if (++idx == keys) + { + if (sort_param->write_keys(sort_param, sort_keys, idx - 1, + (BUFFPEK *)alloc_dynamic(&sort_param->buffpek), + &sort_param->tempfile)) + goto err; + sort_keys[0]= (uchar*) (sort_keys+keys); + memcpy(sort_keys[0], sort_keys[idx - 1], (size_t) sort_param->key_length); + idx= 1; + } + sort_keys[idx]= sort_keys[idx - 1] + sort_param->key_length; + } + if (error > 0) + goto err; + if (sort_param->buffpek.elements) + { + if (sort_param->write_keys(sort_param,sort_keys, idx, + (BUFFPEK *) alloc_dynamic(&sort_param->buffpek), + &sort_param->tempfile)) + goto err; + 
sort_param->keys= (uint)((sort_param->buffpek.elements - 1) * (keys - 1) + idx); + } + else + sort_param->keys= (uint)idx; + + DBUG_RETURN(FALSE); + +err: + DBUG_PRINT("error", ("got some error")); + my_free(sort_keys); + sort_param->sort_keys= 0; + delete_dynamic(& sort_param->buffpek); + close_cached_file(&sort_param->tempfile); + close_cached_file(&sort_param->tempfile_for_exceptions); + + DBUG_RETURN(TRUE); +} + +/* Search after all keys and place them in a temp. file */ + +pthread_handler_t _ma_thr_find_all_keys(void *arg) +{ + MARIA_SORT_PARAM *sort_param= (MARIA_SORT_PARAM*) arg; + my_bool error= FALSE; + /* If my_thread_init fails */ + if (my_thread_init() || _ma_thr_find_all_keys_exec(sort_param)) + error= TRUE; + + /* + Thread must clean up after itself. + */ + free_root(&sort_param->wordroot, MYF(0)); + /* + Detach from the share if the writer is involved. Avoid others to + be blocked. This includes a flush of the write buffer. This will + also indicate EOF to the readers. + That means that a writer always gets here first and readers - + only when they see EOF. But if a reader finishes prematurely + because of an error it may reach this earlier - don't allow it + to detach the writer thread. + */ + if (sort_param->master && sort_param->sort_info->info->rec_cache.share) + remove_io_thread(&sort_param->sort_info->info->rec_cache); + + /* Readers detach from the share if any. Avoid others to be blocked. 
*/ + if (sort_param->read_cache.share) + remove_io_thread(&sort_param->read_cache); + + mysql_mutex_lock(&sort_param->sort_info->mutex); + if (error) + sort_param->sort_info->got_error= 1; + + if (!--sort_param->sort_info->threads_running) + mysql_cond_signal(&sort_param->sort_info->cond); + mysql_mutex_unlock(&sort_param->sort_info->mutex); + + my_thread_end(); + return NULL; +} + + +int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param) +{ + MARIA_SORT_INFO *sort_info=sort_param->sort_info; + HA_CHECK *param=sort_info->param; + size_t UNINIT_VAR(length), keys; + double *rec_per_key_part= param->new_rec_per_key_part; + int got_error=sort_info->got_error; + uint i; + MARIA_HA *info=sort_info->info; + MARIA_SHARE *share= info->s; + MARIA_SORT_PARAM *sinfo; + uchar *mergebuf=0; + DBUG_ENTER("_ma_thr_write_keys"); + + for (i= 0, sinfo= sort_param ; + i < sort_info->total_keys ; + i++, sinfo++) + { + if (!sinfo->sort_keys) + { + got_error=1; + my_free(sinfo->rec_buff); + continue; + } + if (!got_error) + { + maria_set_key_active(share->state.key_map, sinfo->key); + + if (!sinfo->buffpek.elements) + { + if (param->testflag & T_VERBOSE) + { + my_fprintf(stdout, + "Key %d - Dumping %llu keys\n", sinfo->key+1, + (ulonglong) sinfo->keys); + fflush(stdout); + } + if (write_index(sinfo, sinfo->sort_keys, sinfo->keys) || + flush_maria_ft_buf(sinfo) || _ma_flush_pending_blocks(sinfo)) + got_error=1; + } + } + my_free(sinfo->sort_keys); + my_free(sinfo->rec_buff); + sinfo->sort_keys=0; + } + + for (i= 0, sinfo= sort_param ; + i < sort_info->total_keys ; + i++, + delete_dynamic(&sinfo->buffpek), + close_cached_file(&sinfo->tempfile), + close_cached_file(&sinfo->tempfile_for_exceptions), + rec_per_key_part+= sinfo->keyinfo->keysegs, + sinfo++) + { + if (got_error) + continue; + + set_sort_param_read_write(sinfo); + + if (sinfo->buffpek.elements) + { + size_t maxbuffer=sinfo->buffpek.elements-1; + if (!mergebuf) + { + length=(size_t)param->sort_buffer_length; + while (length >= 
MARIA_MIN_SORT_MEMORY) + { + if ((mergebuf= my_malloc(PSI_INSTRUMENT_ME, (size_t) length, MYF(0)))) + break; + length=length*3/4; + } + if (!mergebuf) + { + got_error=1; + continue; + } + } + keys=length/sinfo->key_length; + if (maxbuffer >= MERGEBUFF2) + { + if (param->testflag & T_VERBOSE) + my_fprintf(stdout, + "Key %d - Merging %llu keys\n", + sinfo->key+1, (ulonglong) sinfo->keys); + if (merge_many_buff(sinfo, keys, (uchar **)mergebuf, + dynamic_element(&sinfo->buffpek, 0, BUFFPEK *), + &maxbuffer, &sinfo->tempfile)) + { + got_error=1; + continue; + } + } + if (flush_io_cache(&sinfo->tempfile) || + reinit_io_cache(&sinfo->tempfile,READ_CACHE,0L,0,0)) + { + got_error=1; + continue; + } + if (param->testflag & T_VERBOSE) + printf("Key %d - Last merge and dumping keys\n", sinfo->key+1); + if (merge_index(sinfo, keys, (uchar**) mergebuf, + dynamic_element(&sinfo->buffpek,0,BUFFPEK *), + maxbuffer,&sinfo->tempfile) || + flush_maria_ft_buf(sinfo) || + _ma_flush_pending_blocks(sinfo)) + { + got_error=1; + continue; + } + } + if (my_b_inited(&sinfo->tempfile_for_exceptions)) + { + uint16 key_length; + + if (param->testflag & T_VERBOSE) + printf("Key %d - Dumping 'long' keys\n", sinfo->key+1); + + if (flush_io_cache(&sinfo->tempfile_for_exceptions) || + reinit_io_cache(&sinfo->tempfile_for_exceptions,READ_CACHE,0L,0,0)) + { + got_error=1; + continue; + } + + while (!got_error && + !my_b_read(&sinfo->tempfile_for_exceptions,(uchar*)&key_length, + sizeof(key_length))) + { + uchar maria_ft_buf[HA_FT_MAXBYTELEN + HA_FT_WLEN + 10]; + if (key_length > sizeof(maria_ft_buf) || + my_b_read(&sinfo->tempfile_for_exceptions, (uchar*)maria_ft_buf, + (uint) key_length)) + got_error= 1; + else + { + MARIA_KEY tmp_key; + tmp_key.keyinfo= info->s->keyinfo + sinfo->key; + tmp_key.data= maria_ft_buf; + tmp_key.ref_length= info->s->rec_reflength; + tmp_key.data_length= key_length - info->s->rec_reflength; + tmp_key.flag= 0; + if (_ma_ck_write(info, &tmp_key)) + got_error=1; + } + } + } + 
if (!got_error && (param->testflag & T_STATISTICS)) + maria_update_key_parts(sinfo->keyinfo, rec_per_key_part, sinfo->unique, + param->stats_method == + MI_STATS_METHOD_IGNORE_NULLS ? + sinfo->notnull : NULL, + (ulonglong) share->state.state.records); + + } + my_free(mergebuf); + DBUG_RETURN(got_error); +} + + +/* Write all keys in memory to file for later merge */ + +static int write_keys(MARIA_SORT_PARAM *info, register uchar **sort_keys, + ha_keys count, BUFFPEK *buffpek, IO_CACHE *tempfile) +{ + uchar **end; + uint sort_length=info->key_length; + DBUG_ENTER("write_keys"); + + if (!buffpek) + DBUG_RETURN(1); /* Out of memory */ + + my_qsort2((uchar*) sort_keys,(size_t) count, sizeof(uchar*), + (qsort2_cmp) info->key_cmp, info); + if (!my_b_inited(tempfile) && + open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); /* purecov: inspected */ + + buffpek->file_pos=my_b_tell(tempfile); + buffpek->count=count; + + for (end=sort_keys+count ; sort_keys != end ; sort_keys++) + { + if (my_b_write(tempfile, *sort_keys, sort_length)) + DBUG_RETURN(1); /* purecov: inspected */ + } + DBUG_RETURN(0); +} /* write_keys */ + + +static inline int +my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs) +{ + int err; + uint16 len= _ma_keylength(info->keyinfo, bufs); + + /* The following is safe as this is a local file */ + if ((err= my_b_write(to_file, (uchar*)&len, sizeof(len)))) + return (err); + if ((err= my_b_write(to_file,bufs, (uint) len))) + return (err); + return (0); +} + + +static int write_keys_varlen(MARIA_SORT_PARAM *info, + register uchar **sort_keys, + ha_keys count, BUFFPEK *buffpek, + IO_CACHE *tempfile) +{ + uchar **end; + int err; + DBUG_ENTER("write_keys_varlen"); + + if (!buffpek) + DBUG_RETURN(1); /* Out of memory */ + + my_qsort2((uchar*) sort_keys, (size_t) count, sizeof(uchar*), + (qsort2_cmp) info->key_cmp, info); + if (!my_b_inited(tempfile) && + 
open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); /* purecov: inspected */ + + buffpek->file_pos=my_b_tell(tempfile); + buffpek->count=count; + for (end=sort_keys+count ; sort_keys != end ; sort_keys++) + { + if ((err= my_var_write(info,tempfile, *sort_keys))) + DBUG_RETURN(err); + } + DBUG_RETURN(0); +} /* write_keys_varlen */ + + +static int write_key(MARIA_SORT_PARAM *info, uchar *key, + IO_CACHE *tempfile) +{ + uint16 key_length=info->real_key_length; + DBUG_ENTER("write_key"); + + if (!my_b_inited(tempfile) && + open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST", + DISK_BUFFER_SIZE, info->sort_info->param->myf_rw)) + DBUG_RETURN(1); + + if (my_b_write(tempfile, (uchar*)&key_length,sizeof(key_length)) || + my_b_write(tempfile, key, (uint) key_length)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} /* write_key */ + + +/* Write index */ + +static int write_index(MARIA_SORT_PARAM *info, register uchar **sort_keys, + register ha_keys count) +{ + DBUG_ENTER("write_index"); + + my_qsort2((uchar*) sort_keys,(size_t) count,sizeof(uchar*), + (qsort2_cmp) info->key_cmp,info); + while (count--) + { + if ((*info->key_write)(info, *sort_keys++)) + DBUG_RETURN(-1); /* purecov: inspected */ + } + if (info->sort_info->param->max_stage != 1) /* If not parallel */ + _ma_report_progress(info->sort_info->param, 1, 1); + DBUG_RETURN(0); +} /* write_index */ + + + /* Merge buffers to make < MERGEBUFF2 buffers */ + +static int merge_many_buff(MARIA_SORT_PARAM *info, ha_keys keys, + uchar **sort_keys, BUFFPEK *buffpek, + size_t *maxbuffer, IO_CACHE *t_file) +{ + size_t tmp, merges, max_merges; + IO_CACHE t_file2, *from_file, *to_file, *temp; + BUFFPEK *lastbuff; + DBUG_ENTER("merge_many_buff"); + + if (*maxbuffer < MERGEBUFF2) + DBUG_RETURN(0); /* purecov: inspected */ + if (flush_io_cache(t_file) || + open_cached_file(&t_file2,my_tmpdir(info->tmpdir),"ST", + DISK_BUFFER_SIZE, 
info->sort_info->param->myf_rw)) + DBUG_RETURN(1); /* purecov: inspected */ + + /* Calculate how many merges are needed */ + max_merges= 1; /* Count merge_index */ + tmp= *maxbuffer; + while (tmp >= MERGEBUFF2) + { + merges= (tmp-MERGEBUFF*3/2 + 1) / MERGEBUFF + 1; + max_merges+= merges; + tmp= merges; + } + merges= 0; + + from_file= t_file ; to_file= &t_file2; + while (*maxbuffer >= MERGEBUFF2) + { + size_t i; + reinit_io_cache(from_file,READ_CACHE,0L,0,0); + reinit_io_cache(to_file,WRITE_CACHE,0L,0,0); + lastbuff=buffpek; + for (i=0 ; i + MERGEBUFF*3/2 <= *maxbuffer ; i+=MERGEBUFF) + { + if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++, + buffpek+i,buffpek+i+MERGEBUFF-1)) + goto cleanup; + if (info->sort_info->param->max_stage != 1) /* If not parallel */ + _ma_report_progress(info->sort_info->param, merges++, max_merges); + } + if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++, + buffpek+i,buffpek+ *maxbuffer)) + break; /* purecov: inspected */ + if (flush_io_cache(to_file)) + break; /* purecov: inspected */ + temp=from_file; from_file=to_file; to_file=temp; + *maxbuffer= (size_t) (lastbuff-buffpek)-1; + if (info->sort_info->param->max_stage != 1) /* If not parallel */ + _ma_report_progress(info->sort_info->param, merges++, max_merges); + } +cleanup: + close_cached_file(to_file); /* This holds old result */ + if (to_file == t_file) + { + DBUG_ASSERT(t_file2.type == WRITE_CACHE); + *t_file=t_file2; /* Copy result file */ + } + + DBUG_RETURN(*maxbuffer >= MERGEBUFF2); /* Return 1 if interrupted */ +} /* merge_many_buff */ + + +/* + Read data to buffer + + SYNOPSIS + read_to_buffer() + fromfile File to read from + buffpek Where to read from (file_pos, count and max_keys are used) + sort_length Length in bytes of each key + RESULT + > 0 Amount of bytes read + 0 Nothing was read + HA_OFFSET_ERROR Error +*/ + +static my_off_t read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek, + uint sort_length) +{ + register ha_keys count; + size_t length; + + if ((count= (ha_keys) MY_MIN((ha_rows) buffpek->max_keys, + (ha_rows) 
buffpek->count))) + { + if (my_b_pread(fromfile, (uchar*) buffpek->base, + (length= sort_length * (size_t)count), buffpek->file_pos)) + return(HA_OFFSET_ERROR); /* purecov: inspected */ + buffpek->key=buffpek->base; + buffpek->file_pos+= length; /* New filepos */ + buffpek->count-= count; + buffpek->mem_count= count; + } + return (((my_off_t) count) * sort_length); +} /* read_to_buffer */ + + +static my_off_t read_to_buffer_varlen(IO_CACHE *fromfile, BUFFPEK *buffpek, + uint sort_length) +{ + register ha_keys count; + uint idx; + uchar *buffp; + + if ((count= (ha_keys) MY_MIN((ha_rows) buffpek->max_keys,buffpek->count))) + { + buffp= buffpek->base; + + for (idx=1;idx<=count;idx++) + { + uint16 length_of_key; + if (my_b_pread(fromfile, (uchar*)&length_of_key, + sizeof(length_of_key), buffpek->file_pos)) + return(HA_OFFSET_ERROR); + buffpek->file_pos+=sizeof(length_of_key); + if (my_b_pread(fromfile, (uchar*) buffp, + length_of_key, buffpek->file_pos)) + return((uint) -1); + buffpek->file_pos+=length_of_key; + buffp = buffp + sort_length; + } + buffpek->key=buffpek->base; + buffpek->count-= count; + buffpek->mem_count= count; + } + return (((my_off_t) count) * sort_length); +} /* read_to_buffer_varlen */ + + +static int write_merge_key_varlen(MARIA_SORT_PARAM *info, + IO_CACHE *to_file, uchar* key, + uint sort_length, ha_keys count) +{ + ha_keys idx; + uchar *bufs = key; + + for (idx=1;idx<=count;idx++) + { + int err; + if ((err= my_var_write(info, to_file, bufs))) + return (err); + bufs=bufs+sort_length; + } + return(0); +} + + +static int write_merge_key(MARIA_SORT_PARAM *info __attribute__((unused)), + IO_CACHE *to_file, uchar *key, + uint sort_length, ha_keys count) +{ + return my_b_write(to_file, key, (size_t) (sort_length * count)); +} + +/* + Merge buffers to one buffer + If to_file == 0 then use info->key_write + + Return: + 0 ok + 1 error +*/ + +static int +merge_buffers(MARIA_SORT_PARAM *info, ha_keys keys, IO_CACHE *from_file, + IO_CACHE *to_file, uchar 
**sort_keys, BUFFPEK *lastbuff, + BUFFPEK *Fb, BUFFPEK *Tb) +{ + int error= 1; + uint sort_length; + ha_keys maxcount; + ha_rows count; + my_off_t UNINIT_VAR(to_start_filepos), read_length; + uchar *strpos; + BUFFPEK *buffpek,**refpek; + QUEUE queue; + DBUG_ENTER("merge_buffers"); + + count= 0; + maxcount= keys/((uint) (Tb-Fb) +1); + DBUG_ASSERT(maxcount > 0); + if (to_file) + to_start_filepos=my_b_tell(to_file); + strpos= (uchar*) sort_keys; + sort_length=info->key_length; + + if (init_queue(&queue,(uint) (Tb-Fb)+1,offsetof(BUFFPEK,key),0, + (int (*)(void*, uchar *,uchar*)) info->key_cmp, + (void*) info, 0, 0)) + DBUG_RETURN(1); /* purecov: inspected */ + + for (buffpek= Fb ; buffpek <= Tb ; buffpek++) + { + count+= buffpek->count; + buffpek->base= strpos; + buffpek->max_keys= maxcount; + strpos+= (read_length= info->read_to_buffer(from_file,buffpek, + sort_length)); + if (read_length == HA_OFFSET_ERROR) + goto err; /* purecov: inspected */ + queue_insert(&queue,(uchar*) buffpek); + } + + while (queue.elements > 1) + { + for (;;) + { + buffpek=(BUFFPEK*) queue_top(&queue); + if (to_file) + { + if (info->write_key(info,to_file, buffpek->key, + sort_length, 1)) + goto err; /* purecov: inspected */ + } + else + { + if ((*info->key_write)(info,(void*) buffpek->key)) + goto err; /* purecov: inspected */ + } + buffpek->key+=sort_length; + if (! 
--buffpek->mem_count) + { + /* It's enough to check for killedptr before a slow operation */ + if (_ma_killed_ptr(info->sort_info->param)) + goto err; + if (!(read_length= info->read_to_buffer(from_file,buffpek,sort_length))) + { + uchar *base= buffpek->base; + ha_keys max_keys=buffpek->max_keys; + + queue_remove_top(&queue); + + /* Put room used by buffer to use in other buffer */ + for (refpek= (BUFFPEK**) &queue_top(&queue); + refpek <= (BUFFPEK**) &queue_end(&queue); + refpek++) + { + buffpek= *refpek; + if (buffpek->base+buffpek->max_keys*sort_length == base) + { + buffpek->max_keys+=max_keys; + break; + } + else if (base+max_keys*sort_length == buffpek->base) + { + buffpek->base=base; + buffpek->max_keys+=max_keys; + break; + } + } + break; /* One buffer have been removed */ + } + else if (read_length == HA_OFFSET_ERROR) + goto err; /* purecov: inspected */ + } + queue_replace_top(&queue); /* Top element has been replaced */ + } + } + buffpek=(BUFFPEK*) queue_top(&queue); + buffpek->base= (uchar*) sort_keys; + buffpek->max_keys=keys; + do + { + if (to_file) + { + if (info->write_key(info, to_file, buffpek->key, + sort_length,buffpek->mem_count)) + { + error=1; goto err; /* purecov: inspected */ + } + } + else + { + register uchar *end; + strpos= buffpek->key; + for (end= strpos+buffpek->mem_count*sort_length; + strpos != end ; + strpos+=sort_length) + { + if ((*info->key_write)(info, strpos)) + { + error=1; goto err; /* purecov: inspected */ + } + } + } + } + while ((read_length= info->read_to_buffer(from_file,buffpek,sort_length)) != HA_OFFSET_ERROR && read_length != 0); + if (read_length == 0) + error= 0; + + lastbuff->count=count; + if (to_file) + lastbuff->file_pos=to_start_filepos; +err: + delete_queue(&queue); + DBUG_RETURN(error); +} /* merge_buffers */ + + + /* Do a merge to output-file (save only positions) */ + +static int +merge_index(MARIA_SORT_PARAM *info, ha_keys keys, uchar **sort_keys, + BUFFPEK *buffpek, size_t maxbuffer, IO_CACHE *tempfile) 
+{ + DBUG_ENTER("merge_index"); + if (merge_buffers(info,keys,tempfile,(IO_CACHE*) 0,sort_keys,buffpek,buffpek, + buffpek+maxbuffer)) + DBUG_RETURN(1); /* purecov: inspected */ + if (info->sort_info->param->max_stage != 1) /* If not parallel */ + _ma_report_progress(info->sort_info->param, 1, 1); + DBUG_RETURN(0); +} /* merge_index */ + + +static int flush_maria_ft_buf(MARIA_SORT_PARAM *info) +{ + int err=0; + if (info->sort_info->ft_buf) + { + err=_ma_sort_ft_buf_flush(info); + my_free(info->sort_info->ft_buf); + info->sort_info->ft_buf=0; + } + return err; +} diff --git a/storage/maria/ma_sp_defs.h b/storage/maria/ma_sp_defs.h new file mode 100644 index 00000000..8c2430e0 --- /dev/null +++ b/storage/maria/ma_sp_defs.h @@ -0,0 +1,48 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB + & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef _SP_DEFS_H +#define _SP_DEFS_H + +#define SPDIMS 2 +#define SPTYPE HA_KEYTYPE_DOUBLE +#define SPLEN 8 + +#ifdef HAVE_SPATIAL + +enum wkbType +{ + wkbPoint = 1, + wkbLineString = 2, + wkbPolygon = 3, + wkbMultiPoint = 4, + wkbMultiLineString = 5, + wkbMultiPolygon = 6, + wkbGeometryCollection = 7 +}; + +enum wkbByteOrder +{ + wkbXDR = 0, /* Big Endian */ + wkbNDR = 1 /* Little Endian */ +}; + +MARIA_KEY *_ma_sp_make_key(MARIA_HA *info, MARIA_KEY *ret_key, uint keynr, + uchar *key, const uchar *record, my_off_t filepos, + ulonglong trid); + +#endif /*HAVE_SPATIAL*/ +#endif /* _SP_DEFS_H */ diff --git a/storage/maria/ma_sp_key.c b/storage/maria/ma_sp_key.c new file mode 100644 index 00000000..1a9abc98 --- /dev/null +++ b/storage/maria/ma_sp_key.c @@ -0,0 +1,303 @@ +/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "maria_def.h" +#include "ma_blockrec.h" /* For ROW_FLAG_TRANSID */ +#include "trnman.h" + +#ifdef HAVE_SPATIAL + +#include "ma_sp_defs.h" + +static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr); +static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims, + double *mbr, int top); +static int sp_mbr_from_wkb(uchar (*wkb), uint size, uint n_dims, double *mbr); + + +/** + Create spatial key + + Computes the minimal bounding rectangle (mbr) of the geometry blob + referenced from 'record' and stores its values into 'key', one value + per key segment. + + Returns 0 and sets my_errno to HA_ERR_NULL_IN_SPATIAL if the blob + pointer stored in the record is NULL. +*/ + +MARIA_KEY *_ma_sp_make_key(MARIA_HA *info, MARIA_KEY *ret_key, uint keynr, + uchar *key, const uchar *record, my_off_t filepos, + ulonglong trid) +{ + HA_KEYSEG *keyseg; + MARIA_KEYDEF *keyinfo = &info->s->keyinfo[keynr]; + uint len = 0; + const uchar *pos; + uint dlen; + uchar *dptr; + double mbr[SPDIMS * 2]; + uint i; + DBUG_ENTER("_ma_sp_make_key"); + + keyseg = &keyinfo->seg[-1]; + pos = record + keyseg->start; + ret_key->data= key; + + dlen = _ma_calc_blob_length(keyseg->bit_start, pos); + memcpy(&dptr, pos + keyseg->bit_start, sizeof(char*)); + if (!dptr) + { + my_errno= HA_ERR_NULL_IN_SPATIAL; + DBUG_RETURN(0); + } + + sp_mbr_from_wkb(dptr + 4, dlen - 4, SPDIMS, mbr); /* Skip the leading 4 bytes (SRID) */ + + for (i = 0, keyseg = keyinfo->seg; keyseg->type; keyseg++, i++) + { + uint length = keyseg->length, start= keyseg->start; + double val; + + DBUG_ASSERT(length == 8); + DBUG_ASSERT(!(start % 8)); + DBUG_ASSERT(start < sizeof(mbr)); + DBUG_ASSERT(keyseg->type == HA_KEYTYPE_DOUBLE); + + val= 
mbr[start / sizeof (double)]; + if (isnan(val)) + { + bzero(key, length); + key+= length; + len+= length; + continue; + } + + if (keyseg->flag & HA_SWAP_KEY) + { + mi_float8store(key, val); + } + else + { + float8store((uchar *)key, val); + } + key += length; + len+= length; + } + _ma_dpointer(info->s, key, filepos); + ret_key->keyinfo= keyinfo; + ret_key->data_length= len; + ret_key->ref_length= info->s->rec_reflength; + ret_key->flag= 0; + if (_ma_have_versioning(info) && trid) + { + ret_key->ref_length+= transid_store_packed(info, + key + ret_key->ref_length, + trid); + } + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, ret_key);); + DBUG_RETURN(ret_key); +} + + +/* + Calculate minimal bounding rectangle (mbr) of the spatial object + stored in "well-known binary representation" (wkb) format. +*/ + +static int sp_mbr_from_wkb(uchar *wkb, uint size, uint n_dims, double *mbr) +{ + uint i; + + for (i=0; i < n_dims; ++i) + { + mbr[i * 2] = DBL_MAX; + mbr[i * 2 + 1] = -DBL_MAX; + } + + return sp_get_geometry_mbr(&wkb, wkb + size, n_dims, mbr, 1); +} + +/* + Add one point stored in wkb to mbr +*/ + +static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order __attribute__((unused)), + double *mbr) +{ + double ord; + double *mbr_end= mbr + n_dims * 2; + + while (mbr < mbr_end) + { + if ((*wkb) > end - 8) + return -1; + float8get(ord, (const uchar*) *wkb); + (*wkb)+= 8; + if (ord < *mbr) + *mbr= ord; + mbr++; + if (ord > *mbr) + *mbr= ord; + mbr++; + } + return 0; +} + + +static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr) +{ + return sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr); +} + + +static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr) +{ + uint n_points; + + n_points = uint4korr(*wkb); + (*wkb) += 4; + for (; n_points > 0; --n_points) + { + /* Add next point to mbr */ + if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr)) 
+ return -1; + } + return 0; +} + + +static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims, + uchar byte_order, double *mbr) +{ + uint n_linear_rings; + uint n_points; + + n_linear_rings = uint4korr((*wkb)); + (*wkb) += 4; + + for (; n_linear_rings > 0; --n_linear_rings) + { + n_points = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_points > 0; --n_points) + { + /* Add next point to mbr */ + if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + } + return 0; +} + +static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims, + double *mbr, int top) +{ + int res; + uchar byte_order; + uint wkb_type; + + byte_order = *(*wkb); + ++(*wkb); + + wkb_type = uint4korr((*wkb)); + (*wkb) += 4; + + switch ((enum wkbType) wkb_type) + { + case wkbPoint: + res = sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr); + break; + case wkbLineString: + res = sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr); + break; + case wkbPolygon: + res = sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr); + break; + case wkbMultiPoint: + { + uint n_items; + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + byte_order = *(*wkb); + ++(*wkb); + (*wkb) += 4; + if (sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + res = 0; + break; + } + case wkbMultiLineString: + { + uint n_items; + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + byte_order = *(*wkb); + ++(*wkb); + (*wkb) += 4; + if (sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + res = 0; + break; + } + case wkbMultiPolygon: + { + uint n_items; + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + byte_order = *(*wkb); + ++(*wkb); + (*wkb) += 4; + if (sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr)) + return -1; + } + res = 0; + break; + } + case wkbGeometryCollection: + { + uint n_items; + + if (!top) + return -1; + + n_items = 
uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) + { + if (sp_get_geometry_mbr(wkb, end, n_dims, mbr, 0)) + return -1; + } + res = 0; + break; + } + default: + res = -1; + } + return res; +} + +#endif /*HAVE_SPATIAL*/ diff --git a/storage/maria/ma_sp_test.c b/storage/maria/ma_sp_test.c new file mode 100644 index 00000000..ae8f3575 --- /dev/null +++ b/storage/maria/ma_sp_test.c @@ -0,0 +1,572 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Testing of the basic functions of a MARIA spatial table */ +/* Written by Alex Barkov, who has a shared copyright to this code */ + +#include <my_global.h> +#include "maria_def.h" + +#ifdef HAVE_SPATIAL +#include "ma_sp_defs.h" + +#define MAX_REC_LENGTH 1024 +#define KEYALG HA_KEY_ALG_RTREE + +static void create_linestring(uchar *record,uint rownr); +static void print_record(uchar * record,my_off_t offs,const char * tail); + +static void create_key(uchar *key,uint rownr); +static void print_key(const uchar *key,const char * tail); + +static int run_test(const char *filename); +static int read_with_pos(MARIA_HA * file, int silent); + +static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points, + uchar *wkb); +static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims); + +static char blob_key[MAX_REC_LENGTH]; + + 
+int main(int argc __attribute__((unused)),char *argv[]) +{ + MY_INIT(argv[0]); + maria_init(); + exit(run_test("sp_test")); +} + + +int run_test(const char *filename) +{ + MARIA_HA *file; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + MARIA_COLUMNDEF recinfo[20]; + MARIA_KEYDEF keyinfo[20]; + HA_KEYSEG keyseg[20]; + key_range min_range, max_range; + int silent=0; + int create_flag=0; + int null_fields=0; + int nrecords=30; + int uniques=0; + int i; + int error; + int row_count=0; + uchar record[MAX_REC_LENGTH]; + uchar key[MAX_REC_LENGTH]; + uchar read_record[MAX_REC_LENGTH]; + int upd=10; + ha_rows hrows; + page_range pages; + + /* Define a column for NULLs and DEL markers*/ + + recinfo[0].type=FIELD_NORMAL; + recinfo[0].length=1; /* For NULL bits */ + + + /* Define spatial column */ + + recinfo[1].type=FIELD_BLOB; + recinfo[1].length=4 + portable_sizeof_char_ptr; + + + + /* Define a key with 1 spatial segment */ + + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].flag=HA_SPATIAL; + keyinfo[0].key_alg=KEYALG; + + keyinfo[0].seg[0].type= HA_KEYTYPE_BINARY; + keyinfo[0].seg[0].flag=0; + keyinfo[0].seg[0].start= 1; + keyinfo[0].seg[0].length=1; /* Spatial ignores it anyway */ + keyinfo[0].seg[0].null_bit= null_fields ? 
2 : 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language=default_charset_info->number; + keyinfo[0].seg[0].bit_start=4; /* Long BLOB */ + + + if (!silent) + printf("- Creating isam-file\n"); + + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=10000000; + + if (maria_create(filename, + DYNAMIC_RECORD, + 1, /* keys */ + keyinfo, + 2, /* columns */ + recinfo,uniques,&uniquedef,&create_info,create_flag)) + goto err; + + if (!silent) + printf("- Open isam-file\n"); + + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED, 0))) + goto err; + + if (!silent) + printf("- Writing key:s\n"); + + for (i=0; i<nrecords; i++ ) + { + create_linestring(record,i); + error=maria_write(file,record); + print_record(record,maria_position(file),"\n"); + if (!error) + { + row_count++; + } + else + { + printf("maria_write: %d\n", error); + goto err; + } + } + + if ((error=read_with_pos(file,silent))) + goto err; + + if (!silent) + printf("- Deleting rows with position\n"); + for (i=0; i < nrecords/4; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR); + if (error) + { + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + print_record(read_record,maria_position(file),"\n"); + error=maria_delete(file,read_record); + if (error) + { + printf("pos: %2d maria_delete: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + } + + if (!silent) + printf("- Updating rows with position\n"); + for (i=0; i < nrecords/2 ; i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 
0L : HA_OFFSET_ERROR); + if (error) + { + if (error==HA_ERR_RECORD_DELETED) + continue; + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + print_record(read_record,maria_position(file),""); + create_linestring(record,i+nrecords*upd); + printf("\t-> "); + print_record(record,maria_position(file),"\n"); + error=maria_update(file,read_record,record); + if (error) + { + printf("pos: %2d maria_update: %3d errno: %3d\n",i,error,my_errno); + goto err; + } + } + + if ((error=read_with_pos(file,silent))) + goto err; + + if (!silent) + printf("- Test maria_rkey then a sequence of maria_rnext_same\n"); + + create_key(key, nrecords*4/5); + print_key(key," search for INTERSECT\n"); + + if ((error=maria_rkey(file,read_record,0,key,0,HA_READ_MBR_INTERSECT))) + { + printf("maria_rkey: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rkey\n"); + row_count=1; + + for (;;) + { + if ((error=maria_rnext_same(file,read_record))) + { + if (error==HA_ERR_END_OF_FILE) + break; + printf("maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext_same\n"); + row_count++; + } + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_rfirst then a sequence of maria_rnext\n"); + + error=maria_rfirst(file,read_record,0); + if (error) + { + printf("maria_rfirst: %3d errno: %3d\n",error,my_errno); + goto err; + } + row_count=1; + print_record(read_record,maria_position(file)," maria_frirst\n"); + + for(i=0;i<nrecords;i++) { + if ((error=maria_rnext(file,read_record,0))) + { + if (error==HA_ERR_END_OF_FILE) + break; + printf("maria_next: %3d errno: %3d\n",error,my_errno); + goto err; + } + print_record(read_record,maria_position(file)," maria_rnext\n"); + row_count++; + } + printf(" %d rows\n",row_count); + + if (!silent) + printf("- Test maria_records_in_range()\n"); + + create_key(key, nrecords*upd); + print_key(key," 
INTERSECT\n"); + min_range.key= key; + min_range.length= 1000; /* Big enough */ + min_range.flag= HA_READ_MBR_INTERSECT; + max_range.key= record+1; + max_range.length= 1000; /* Big enough */ + max_range.flag= HA_READ_KEY_EXACT; + hrows= maria_records_in_range(file,0, &min_range, &max_range, &pages); + printf(" %ld rows\n", (long) hrows); + + if (maria_close(file)) goto err; + maria_end(); + my_end(MY_CHECK_ERROR); + + return 0; + +err: + printf("got error: %3d when using maria-database\n",my_errno); + maria_end(); + return 1; /* skip warning */ +} + + +static int read_with_pos (MARIA_HA * file,int silent) +{ + int error; + int i; + uchar read_record[MAX_REC_LENGTH]; + int rows=0; + + if (!silent) + printf("- Reading rows with position\n"); + for (i=0;;i++) + { + my_errno=0; + bzero((char*) read_record,MAX_REC_LENGTH); + error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR); + if (error) + { + if (error==HA_ERR_END_OF_FILE) + break; + if (error==HA_ERR_RECORD_DELETED) + continue; + printf("pos: %2d maria_rrnd: %3d errno: %3d\n",i,error,my_errno); + return error; + } + rows++; + print_record(read_record,maria_position(file),"\n"); + } + printf(" %d rows\n",rows); + return 0; +} + + +#ifdef NOT_USED +static void bprint_record(uchar * record, + my_off_t offs __attribute__((unused)), + const char * tail) +{ + int i; + char * pos; + i=(unsigned char)record[0]; + printf("%02X ",i); + + for( pos=record+1, i=0; i<32; i++,pos++) + { + int b=(unsigned char)*pos; + printf("%02X",b); + } + printf("%s",tail); +} +#endif + + +static void print_record(uchar * record, my_off_t offs,const char * tail) +{ + uchar *pos; + char *ptr; + uint len; + + printf(" rec=(%d)",(unsigned char)record[0]); + pos=record+1; + len=sint4korr(pos); + pos+=4; + printf(" len=%d ",len); + memcpy(&ptr,pos,sizeof(char*)); + if (ptr) + maria_rtree_PrintWKB((uchar*) ptr,SPDIMS); + else + printf("<NULL> "); + printf(" offs=%ld ",(long int)offs); + printf("%s",tail); +} + + +#ifdef NOT_USED +static 
void create_point(uchar *record,uint rownr) +{ + uint tmp; + char *ptr; + char *pos=record; + double x[200]; + int i; + + for(i=0;i<SPDIMS;i++) + x[i]=rownr; + + bzero((char*) record,MAX_REC_LENGTH); + *pos=0x01; /* DEL marker */ + pos++; + + memset(blob_key,0,sizeof(blob_key)); + tmp=maria_rtree_CreatePointWKB(x,SPDIMS,blob_key); + + int4store(pos,tmp); + pos+=4; + + ptr=blob_key; + memcpy(pos,&ptr,sizeof(char*)); +} +#endif + + +static void create_linestring(uchar *record,uint rownr) +{ + uint tmp; + char *ptr; + uchar *pos= record; + double x[200]; + int i,j; + int npoints=2; + + for(j=0;j<npoints;j++) + for(i=0;i<SPDIMS;i++) + x[i+j*SPDIMS]=rownr*j; + + bzero((char*) record,MAX_REC_LENGTH); + *pos=0x01; /* DEL marker */ + pos++; + + memset(blob_key,0,sizeof(blob_key)); + tmp=maria_rtree_CreateLineStringWKB(x,SPDIMS,npoints, (uchar*) blob_key); + + int4store(pos,tmp); + pos+=4; + + ptr=blob_key; + memcpy(pos,&ptr,sizeof(char*)); +} + + +static void create_key(uchar *key,uint rownr) +{ + double c=rownr; + uchar *pos; + uint i; + + bzero(key,MAX_REC_LENGTH); + for ( pos=key, i=0; i<2*SPDIMS; i++) + { + float8store(pos,c); + pos+=sizeof(c); + } +} + +static void print_key(const uchar *key,const char * tail) +{ + double c; + uint i; + + printf(" key="); + for (i=0; i<2*SPDIMS; i++) + { + float8get(c,key); + key+=sizeof(c); + printf("%.14g ",c); + } + printf("%s",tail); +} + + +#ifdef NOT_USED + +static int maria_rtree_CreatePointWKB(double *ords, uint n_dims, uchar *wkb) +{ + uint i; + + *wkb = wkbXDR; + ++wkb; + int4store(wkb, wkbPoint); + wkb += 4; + + for (i=0; i < n_dims; ++i) + { + float8store(wkb, ords[i]); + wkb += 8; + } + return 5 + n_dims * 8; +} +#endif + + +static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points, + uchar *wkb) +{ + uint i; + uint n_ords = n_dims * n_points; + + *wkb = wkbXDR; + ++wkb; + int4store(wkb, wkbLineString); + wkb += 4; + int4store(wkb, n_points); + wkb += 4; + for (i=0; i < n_ords; ++i) + { + 
float8store(wkb, ords[i]); + wkb += 8; + } + return 9 + n_points * n_dims * 8; +} + + +static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims) +{ + uint wkb_type; + + ++wkb; + wkb_type = uint4korr(wkb); + wkb += 4; + + switch ((enum wkbType)wkb_type) + { + case wkbPoint: + { + uint i; + double ord; + + printf("POINT("); + for (i=0; i < n_dims; ++i) + { + float8get(ord, wkb); + wkb += 8; + printf("%.14g", ord); + if (i < n_dims - 1) + printf(" "); + else + printf(")"); + } + break; + } + case wkbLineString: + { + uint p, i; + uint n_points; + double ord; + + printf("LineString("); + n_points = uint4korr(wkb); + wkb += 4; + for (p=0; p < n_points; ++p) + { + for (i=0; i < n_dims; ++i) + { + float8get(ord, wkb); + wkb += 8; + printf("%.14g", ord); + if (i < n_dims - 1) + printf(" "); + } + if (p < n_points - 1) + printf(", "); + else + printf(")"); + } + break; + } + case wkbPolygon: + { + printf("POLYGON(...)"); + break; + } + case wkbMultiPoint: + { + printf("MULTIPOINT(...)"); + break; + } + case wkbMultiLineString: + { + printf("MULTILINESTRING(...)"); + break; + } + case wkbMultiPolygon: + { + printf("MULTIPOLYGON(...)"); + break; + } + case wkbGeometryCollection: + { + printf("GEOMETRYCOLLECTION(...)"); + break; + } + default: + { + printf("UNKNOWN GEOMETRY TYPE"); + break; + } + } +} + +#include "ma_check_standalone.h" + +#else +int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused))) +{ + exit(0); +} +#endif /*HAVE_SPATIAL*/ diff --git a/storage/maria/ma_state.c b/storage/maria/ma_state.c new file mode 100644 index 00000000..c781f996 --- /dev/null +++ b/storage/maria/ma_state.c @@ -0,0 +1,879 @@ +/* Copyright (C) 2008 Sun AB and Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + Functions to maintain live statistics for Maria transactional tables + and versioning for non-transactional tables + + See WL#3138; Maria - fast "SELECT COUNT(*) FROM t;" and "CHECKSUM TABLE t" + for details about live number of rows and live checksums + + TODO + - Allocate MA_USED_TABLES and MA_HISTORY_STATE from a global pool (to + avoid calls to malloc()) + - In _ma_trnman_end_trans_hook(), don't call _ma_remove_not_visible_states() + every time. One could for example call it if there has been more than + 10 ended transactions since last time it was called. +*/ + +#include "maria_def.h" +#include "trnman.h" +#include "ma_trnman.h" +#include "ma_blockrec.h" + +/** + @brief Setup initial start-of-transaction state for a table + + @fn _ma_setup_live_state + @param info Maria handler + + @notes + This function ensures that trn->used_tables contains a list of + start and live states for tables that are part of the transaction + and that info->state points to the current live state for the table.
+ + @TODO + Change trn->table_list to a hash and share->state_history to a binary tree + + @return + @retval 0 ok + @retval 1 error (out of memory) +*/ + +my_bool _ma_setup_live_state(MARIA_HA *info) +{ + TRN *trn; + MARIA_SHARE *share= info->s; + MARIA_USED_TABLES *tables; + MARIA_STATE_HISTORY *history; + DBUG_ENTER("_ma_setup_live_state"); + DBUG_PRINT("enter", ("info: %p", info)); + + DBUG_ASSERT(share->lock_key_trees); + + if (maria_create_trn_hook(info)) + DBUG_RETURN(1); + + trn= info->trn; + for (tables= (MARIA_USED_TABLES*) trn->used_tables; + tables; + tables= tables->next) + { + if (tables->share == share) + { + /* Table is already used by transaction */ + goto end; + } + } + + /* Table was not used before, create new table state entry */ + if (!(tables= (MARIA_USED_TABLES*) my_malloc(PSI_INSTRUMENT_ME, + sizeof(*tables), MYF(MY_WME | MY_ZEROFILL)))) + DBUG_RETURN(1); + tables->next= trn->used_tables; + trn->used_tables= tables; + tables->share= share; + + mysql_mutex_lock(&share->intern_lock); + share->in_trans++; + DBUG_PRINT("info", ("share: %p in_trans: %d", + share, share->in_trans)); + + history= share->state_history; + + /* + We must keep share locked to ensure that we don't access a history + link that is deleted by concurrently running checkpoint. 
+ + It's enough to compare trids here (instead of calling + tranman_can_read_from) as history->trid is a commit_trid + */ + while (trn->trid <= history->trid) + history= history->next; + mysql_mutex_unlock(&share->intern_lock); + /* The current item can't be deleted as it's the first one visible for us */ + tables->state_start= tables->state_current= history->state; + tables->state_current.changed= tables->state_current.no_transid= 0; + + DBUG_PRINT("info", ("records: %ld", (ulong) tables->state_start.records)); + +end: + info->state_start= &tables->state_start; + info->state= &tables->state_current; + info->used_tables= tables; + tables->use_count++; + + /* + Mark in transaction state if we are not using transid (versioning) + on rows. If not, then we will in _ma_trnman_end_trans_hook() + ensure that the state is visible for all at end of transaction + */ + tables->state_current.no_transid|= !(info->row_flag & ROW_FLAG_TRANSID); + + DBUG_PRINT("exit", ("tables: %p info->state: %p", tables, info->state)); + DBUG_RETURN(0); +} + + +/** + @brief Remove states that are not visible by anyone + + @fn _ma_remove_not_visible_states() + @param org_history List to history + @param all 1 if we should delete the first state if it's + visible for all. For the moment this is only used + on close() of table. + @param trnman_is_locked Set to 1 if we have already a lock on trnman. + + @notes + The assumption is that items in the history list is ordered by + commit_trid. + + A state is not visible anymore if there is no new transaction + that has been started between the commit_trid's of two states + + As long as some states exists, we keep the newest = (last commit) + state as first state in the history. This is to allow us to just move + the history from the global list to the share when we open the table. + + Note that if 'all' is set trnman_is_locked must be 0, becasue + trnman_get_min_trid() will take a lock on trnman. 
+ + @return + @retval Pointer to new history list +*/ + +MARIA_STATE_HISTORY +*_ma_remove_not_visible_states(MARIA_STATE_HISTORY *org_history, + my_bool all, + my_bool trnman_is_locked) +{ + TrID last_trid; + MARIA_STATE_HISTORY *history, **parent, *next; + DBUG_ENTER("_ma_remove_not_visible_states"); + + if (!org_history) + DBUG_RETURN(0); /* Not versioned table */ + + last_trid= org_history->trid; + parent= &org_history->next; + for (history= org_history->next; history; history= next) + { + next= history->next; + if (!trnman_exists_active_transactions(history->trid, last_trid, + trnman_is_locked)) + { + DBUG_PRINT("info", ("removing history->trid: %lu next: %lu", + (ulong) history->trid, (ulong) last_trid)); + my_free(history); + continue; + } + *parent= history; + parent= &history->next; + last_trid= history->trid; + } + *parent= 0; + + if (all && parent == &org_history->next) + { + /* There is only one state left. Delete this if it's visible for all */ + if (last_trid < trnman_get_min_trid()) + { + my_free(org_history); + org_history= 0; + } + } + DBUG_RETURN(org_history); +} + + +/** + @brief Remove not used state history + + @param share Maria table information + @param all 1 if we should delete the first state if it's + visible for all. For the moment this is only used + on close() of table. + + @notes + share and trnman are not locked. + + We must first lock trnman and then share->intern_lock. This is becasue + _ma_trnman_end_trans_hook() has a lock on trnman and then + takes share->intern_lock. 
+*/ + +void _ma_remove_not_visible_states_with_lock(MARIA_SHARE *share, + my_bool all) +{ + my_bool is_lock_trman; + if ((is_lock_trman= trman_is_inited())) + trnman_lock(); + + mysql_mutex_lock(&share->intern_lock); + share->state_history= _ma_remove_not_visible_states(share->state_history, + all, 1); + mysql_mutex_unlock(&share->intern_lock); + if (is_lock_trman) + trnman_unlock(); +} + + +/* + Free state history information from share->history and reset information + to current state. + + @notes + Used after repair/rename/drop as then all rows are visible for everyone +*/ + +void _ma_reset_state(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + MARIA_STATE_HISTORY *history= share->state_history; + DBUG_ENTER("_ma_reset_state"); + + /* Always true if share->now_transactional is set */ + if (history && share->have_versioning) + { + MARIA_STATE_HISTORY *next; + DBUG_PRINT("info", ("resetting history")); + + /* Set the current history to current state */ + share->state_history->state= share->state.state; + /* Set current table handler to point to new history state */ + info->state= info->state_start= &share->state_history->state; + for (history= history->next ; history ; history= next) + { + next= history->next; + my_free(history); + } + share->state_history->next= 0; + share->state_history->trid= 0; /* Visible for all */ + } + DBUG_VOID_RETURN; +} + + +/**************************************************************************** + The following functions are called by thr_lock() in threaded applications + for not transactional tables +****************************************************************************/ + +/* + Create a copy of the current status for the table + + SYNOPSIS + _ma_get_status() + param Pointer to Myisam handler + concurrent_insert Set to 1 if we are going to do concurrent inserts + (THR_WRITE_CONCURRENT_INSERT was used) +*/ + +my_bool _ma_get_status(void* param, my_bool concurrent_insert) +{ + MARIA_HA *info=(MARIA_HA*) param; + 
DBUG_ENTER("_ma_get_status"); + DBUG_PRINT("info",("key_file: %ld data_file: %ld concurrent_insert: %d", + (long) info->s->state.state.key_file_length, + (long) info->s->state.state.data_file_length, + concurrent_insert)); +#ifndef DBUG_OFF + if (info->state->key_file_length > info->s->state.state.key_file_length || + info->state->data_file_length > info->s->state.state.data_file_length) + DBUG_PRINT("warning",("old info: key_file: %ld data_file: %ld", + (long) info->state->key_file_length, + (long) info->state->data_file_length)); +#endif + info->state_save= info->s->state.state; + info->state= &info->state_save; + info->state->changed= 0; + info->append_insert_at_end= concurrent_insert; + DBUG_RETURN(0); +} + + +void _ma_update_status(void* param) +{ + MARIA_HA *info=(MARIA_HA*) param; + /* + Because someone may have closed the table we point at, we only + update the state if its our own state. This isn't a problem as + we are always pointing at our own lock or at a read lock. + (This is enforced by thr_multi_lock.c) + */ + if (info->state == &info->state_save) + { + MARIA_SHARE *share= info->s; +#ifndef DBUG_OFF + DBUG_PRINT("info",("updating status: key_file: %ld data_file: %ld", + (long) info->state->key_file_length, + (long) info->state->data_file_length)); + if (info->state->key_file_length < share->state.state.key_file_length || + info->state->data_file_length < share->state.state.data_file_length) + DBUG_PRINT("warning",("old info: key_file: %ld data_file: %ld", + (long) share->state.state.key_file_length, + (long) share->state.state.data_file_length)); +#endif + /* + we are going to modify the state without lock's log, this would break + recovery if done with a transactional table. + */ + DBUG_ASSERT(!info->s->base.born_transactional); + share->state.state= *info->state; + info->state= &share->state.state; +#ifdef HAVE_QUERY_CACHE + DBUG_PRINT("info", ("invalidator... 
'%s' (status update)", + info->s->data_file_name.str)); + DBUG_ASSERT(info->s->chst_invalidator != NULL); + (*info->s->chst_invalidator)((const char *)info->s->data_file_name.str); +#endif + + } + info->append_insert_at_end= 0; +} + + +/* + Same as ma_update_status() but take a lock in the table lock, to protect + against someone calling ma_get_status() from thr_lock() at the same time. +*/ + +void _ma_update_status_with_lock(MARIA_HA *info) +{ + my_bool locked= 0; + if (info->state == &info->state_save) + { + locked= 1; + mysql_mutex_lock(&info->s->lock.mutex); + } + (*info->s->lock.update_status)(info->lock.status_param); + if (locked) + mysql_mutex_unlock(&info->s->lock.mutex); +} + + +void _ma_restore_status(void *param) +{ + MARIA_HA *info= (MARIA_HA*) param; + info->state= &info->s->state.state; + info->append_insert_at_end= 0; +} + + +void _ma_copy_status(void* to, void *from) +{ + ((MARIA_HA*) to)->state= &((MARIA_HA*) from)->state_save; +} + + +my_bool _ma_reset_update_flag(void *param, + my_bool concurrent_insert __attribute__((unused))) +{ + MARIA_HA *info=(MARIA_HA*) param; + info->state->changed= 0; + return 0; +} + +my_bool _ma_start_trans(void* param) +{ + MARIA_HA *info=(MARIA_HA*) param; + if (!info->s->lock_key_trees) + { + info->state= info->state_start; + *info->state= info->s->state.state; + } + return 0; +} + + +/** + @brief Check if should allow concurrent inserts + + @implementation + Allow concurrent inserts if we don't have a hole in the table or + if there is no active write lock and there is active read locks and + maria_concurrent_insert == 2. In this last case the new + row('s) are inserted at end of file instead of filling up the hole. + + The last case is to allow one to inserts into a heavily read-used table + even if there is holes. 
+ + @notes + If there is a an rtree indexes in the table, concurrent inserts are + disabled in maria_open() + + @return + @retval 0 ok to use concurrent inserts + @retval 1 not ok +*/ + +my_bool _ma_check_status(void *param) +{ + MARIA_HA *info=(MARIA_HA*) param; + /* + The test for w_locks == 1 is here because this thread has already done an + external lock (in other words: w_locks == 1 means no other threads has + a write lock) + */ + DBUG_PRINT("info",("dellink: %ld r_locks: %u w_locks: %u", + (long) info->s->state.dellink, (uint) info->s->r_locks, + (uint) info->s->w_locks)); + return (my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR || + (maria_concurrent_insert == 2 && info->s->r_locks && + info->s->w_locks == 1)); +} + + +/** + @brief write hook at end of trans to store status for all used table + + @Notes + This function must be called under trnman_lock in trnman_end_trn() + because of the following reasons: + - After trnman_end_trn() is called, the current transaction will be + regarded as committed and all used tables state_history will be + visible to other transactions. To do this, we loop over all used + tables and create/update a history entries that contains the correct + state_history for them. +*/ + +my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit, + my_bool active_transactions) +{ + my_bool error= 0; + MARIA_USED_TABLES *tables, *next; + DBUG_ENTER("_ma_trnman_end_trans_hook"); + DBUG_PRINT("enter", ("trn: %p used_tables: %p", trn, trn->used_tables)); + + for (tables= (MARIA_USED_TABLES*) trn->used_tables; + tables; + tables= next) + { + MARIA_SHARE *share= tables->share; + next= tables->next; + if (commit) + { + MARIA_STATE_HISTORY *history; + + mysql_mutex_lock(&share->intern_lock); + + /* We only have to update history state if something changed */ + if (tables->state_current.changed) + { + if (tables->state_current.no_transid) + { + /* + The change was done without using transid on rows (like in + bulk insert). 
In this case this thread is the only one + that is using the table and all rows will be visible + for all transactions. + */ + _ma_reset_history(share); + } + else + { + if (active_transactions && share->now_transactional && + trnman_exists_active_transactions(share->state_history->trid, + trn->commit_trid, 1)) + { + /* + There exist transactions that are still using the current + share->state_history. Create a new history item for this + commit and add it first in the state_history list. This + ensures that all history items are stored in the list in + decresing trid order. + */ + if (!(history= my_malloc(PSI_INSTRUMENT_ME, sizeof(*history), + MYF(MY_WME)))) + { + /* purecov: begin inspected */ + error= 1; + mysql_mutex_unlock(&share->intern_lock); + my_free(tables); + continue; + /* purecov: end */ + } + history->state= share->state_history->state; + history->next= share->state_history; + share->state_history= history; + } + else + { + /* Previous history can't be seen by anyone, reuse old memory */ + history= share->state_history; + DBUG_PRINT("info", ("removing history->trid: %lu new: %lu", + (ulong) history->trid, + (ulong) trn->commit_trid)); + } + + history->state.records+= (tables->state_current.records - + tables->state_start.records); + history->state.checksum+= (tables->state_current.checksum - + tables->state_start.checksum); + history->trid= trn->commit_trid; + + share->state.last_change_trn= trn->commit_trid; + + if (history->next) + { + /* Remove not visible states */ + share->state_history= _ma_remove_not_visible_states(history, 0, 1); + } + DBUG_PRINT("info", ("share: %p in_trans: %d", + share, share->in_trans)); + } + } + /* The following calls frees &share->intern_lock */ + decrement_share_in_trans(share); + } + else + { + /* + We need to keep share->in_trans correct because of the check + in free_maria_share() + */ + mysql_mutex_lock(&share->intern_lock); + decrement_share_in_trans(share); + } + my_free(tables); + } + trn->used_tables= 0; + 
trn->used_instances= 0; + DBUG_RETURN(error); +} + + +/** + Remove table from trnman_list + + @notes + This is used when we unlock a table from a group of locked tables + just before doing a rename or drop table. + + share->internal_lock must be locked when function is called +*/ + +void _ma_remove_table_from_trnman(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + TRN *trn= info->trn; + MARIA_USED_TABLES *tables, **prev; + DBUG_ENTER("_ma_remove_table_from_trnman"); + DBUG_PRINT("enter", ("trn: %p used_tables: %p share: %p in_trans: %d", + trn, trn->used_tables, share, share->in_trans)); + + mysql_mutex_assert_owner(&share->intern_lock); + + if (trn == &dummy_transaction_object) + DBUG_VOID_RETURN; + + /* First remove share from used_tables */ + for (prev= (MARIA_USED_TABLES**) (char*) &trn->used_tables; + (tables= *prev); + prev= &tables->next) + { + if (tables->share == share) + { + *prev= tables->next; + /* + We don't have to and can't call decrement_share_in_trans(share) here + as we know there is an active MARIA_HA handler around. + */ + share->in_trans--; + my_free(tables); + break; + } + } + if (!tables) + { + /* + This can only happens in case of rename of intermediate table as + part of alter table + */ + DBUG_PRINT("warning", ("share: %p where not in used_tables_list", share)); + } + + /* Reset trn and remove table from used_instances */ + _ma_reset_trn_for_table(info); + + DBUG_VOID_RETURN; +} + + + +/**************************************************************************** + The following functions are called by thr_lock() in threaded applications + for transactional tables. 
+****************************************************************************/ + +/* + Create a copy of the current status for the table + + SYNOPSIS + _ma_get_status() + param Pointer to Aria handler + concurrent_insert Set to 1 if we are going to do concurrent inserts + (THR_WRITE_CONCURRENT_INSERT was used) +*/ + +my_bool _ma_block_get_status(void* param, my_bool concurrent_insert) +{ + MARIA_HA *info=(MARIA_HA*) param; + DBUG_ENTER("_ma_block_get_status"); + DBUG_PRINT("enter", ("concurrent_insert %d", concurrent_insert)); + + info->row_base_length= info->s->base_length; + info->row_flag= info->s->base.default_row_flag; + DBUG_ASSERT(!concurrent_insert || + info->lock.type == TL_WRITE_CONCURRENT_INSERT); + if (concurrent_insert || !info->autocommit) + { + info->row_flag|= ROW_FLAG_TRANSID; + info->row_base_length+= TRANSID_SIZE; + } + else + { + DBUG_ASSERT(info->lock.type != TL_WRITE_CONCURRENT_INSERT); + } + DBUG_RETURN(0); +} + + +my_bool _ma_block_start_trans(void* param) +{ + MARIA_HA *info=(MARIA_HA*) param; + DBUG_ENTER("_ma_block_start_trans"); + + if (info->s->lock_key_trees) + { + /* + Assume for now that this doesn't fail (It can only fail in + out of memory conditions) + TODO: Fix this by having one extra state pre-allocated + */ + DBUG_RETURN(_ma_setup_live_state(info)); + } + else + { + /* + We come here in the following cases: + - The table is a temporary table + - It's a table which is crash safe but not yet versioned, for + example a table with fulltext or rtree keys + + Set the current state to point to save_state so that the + block_format code don't count the same record twice. + Copy also the current state. 
This may have been wrong if the + same file was used several times in the last statement + */ + info->state= info->state_start; + *info->state= info->s->state.state; + } + + /* + Info->trn is set if this table is already handled and we are + called from maria_versioning() + */ + if (info->s->base.born_transactional && !info->trn) + { + /* + Assume for now that this doesn't fail (It can only fail in + out of memory conditions) + */ + DBUG_RETURN(maria_create_trn_hook(info) != 0); + } + DBUG_RETURN(0); +} + + +void _ma_block_update_status(void *param __attribute__((unused))) +{ +} + +void _ma_block_restore_status(void *param __attribute__((unused))) +{ +} + + +/** + Check if should allow concurrent inserts + + @return + @retval 0 ok to use concurrent inserts + @retval 1 not ok +*/ + +my_bool _ma_block_check_status(void *param __attribute__((unused))) +{ + return (my_bool) 0; +} + + +/* Get status when transactional but not versioned */ + +my_bool _ma_block_start_trans_no_versioning(void* param) +{ + MARIA_HA *info=(MARIA_HA*) param; + DBUG_ENTER("_ma_block_start_trans_no_versioning"); + DBUG_ASSERT(info->s->base.born_transactional && !info->s->lock_key_trees); + + info->state->changed= 0; /* from _ma_reset_update_flag() */ + info->state= info->state_start; + *info->state= info->s->state.state; + if (!info->trn) + { + /* + Assume for now that this doesn't fail (It can only fail in + out of memory conditions) + */ + DBUG_RETURN(maria_create_trn_hook(info)); + } + DBUG_RETURN(0); +} + + +/** + Enable/disable versioning +*/ + +void maria_versioning(MARIA_HA *info, my_bool versioning) +{ + MARIA_SHARE *share= info->s; + DBUG_ENTER("maria_versioning"); + + /* For now, this is a hack */ + if (share->have_versioning) + { + enum thr_lock_type save_lock_type; + share->lock_key_trees= versioning; + /* Set up info->lock.type temporary for _ma_block_get_status() */ + save_lock_type= info->lock.type; + info->lock.type= versioning ? 
TL_WRITE_CONCURRENT_INSERT : TL_WRITE; + _ma_block_get_status((void*) info, versioning); + info->lock.type= save_lock_type; + if (versioning) + info->state= &share->state.common; + else + info->state= &share->state.state; /* Change global values by default */ + info->state_start= info->state; /* Initial values */ + } + DBUG_VOID_RETURN; +} + + +/** + Update data_file_length to new length + + NOTES + Only used by block records +*/ + +void _ma_set_share_data_file_length(MARIA_SHARE *share, ulonglong new_length) +{ + if (!share->internal_table) + mysql_mutex_lock(&share->intern_lock); + if (share->state.state.data_file_length < new_length) + { + share->state.state.data_file_length= new_length; + if (new_length >= share->base.max_data_file_length) + { + /* Give an error on next insert */ + share->state.changed|= STATE_DATA_FILE_FULL; + } + } + if (!share->internal_table) + mysql_mutex_unlock(&share->intern_lock); +} + + +/** + Copy state information that where updated while the table was used + in not transactional mode +*/ + +void _ma_copy_nontrans_state_information(MARIA_HA *info) +{ + info->s->state.state.records= info->state->records; + info->s->state.state.checksum= info->state->checksum; +} + +/** + Reset history + This is only called during repair when we are the only one using the table. 
+*/ + +void _ma_reset_history(MARIA_SHARE *share) +{ + MARIA_STATE_HISTORY *history, *next; + DBUG_ENTER("_ma_reset_history"); + + share->state_history->trid= 0; /* Visibly by all */ + share->state_history->state= share->state.state; + history= share->state_history->next; + share->state_history->next= 0; + + for (; history; history= next) + { + next= history->next; + my_free(history); + } + DBUG_VOID_RETURN; +} + + +/**************************************************************************** + Virtual functions to check if row is visible +****************************************************************************/ + +/** + Row is always visible + This is for tables without concurrent insert +*/ + +my_bool _ma_row_visible_always(MARIA_HA *info __attribute__((unused))) +{ + return 1; +} + + +/** + Row visibility for non transactional tables with concurrent insert + + @implementation + When we got our table lock, we saved the current + data_file_length. Concurrent inserts always go to the end of the + file. So we can test if the found key references a new record. +*/ + +my_bool _ma_row_visible_non_transactional_table(MARIA_HA *info) +{ + return info->cur_row.lastpos < info->state->data_file_length; +} + + +/** + Row visibility for transactional tables with versioning + + + @TODO + Add test if found key was marked deleted and it was deleted by + us. In that case we should return 0 +*/ + +my_bool _ma_row_visible_transactional_table(MARIA_HA *info) +{ + return trnman_can_read_from(info->trn, info->cur_row.trid); +} diff --git a/storage/maria/ma_state.h b/storage/maria/ma_state.h new file mode 100644 index 00000000..b27b75f5 --- /dev/null +++ b/storage/maria/ma_state.h @@ -0,0 +1,91 @@ +/* Copyright (C) 2008 Sun AB & Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef MA_STATE_INCLUDED +#define MA_STATE_INCLUDED +C_MODE_START + +/* Struct to store tables in use by one transaction */ + +typedef struct st_maria_status_info +{ + ha_rows records; /* Rows in table */ + ha_rows del; /* Removed rows */ + my_off_t empty; /* lost space in datafile */ + my_off_t key_empty; /* lost space in indexfile */ + my_off_t key_file_length; + my_off_t data_file_length; + ha_checksum checksum; + uint32 changed:1, /* Set if table was changed */ + no_transid:1; /* Set if no transid was set on rows */ +} MARIA_STATUS_INFO; + + +typedef struct st_used_tables { + struct st_used_tables *next; + struct st_maria_share *share; + MARIA_STATUS_INFO state_current; + MARIA_STATUS_INFO state_start; + uint use_count; +} MARIA_USED_TABLES; + + +/* Struct to store commit state at different times */ + +typedef struct st_state_history { + struct st_state_history *next; + TrID trid; + MARIA_STATUS_INFO state; +} MARIA_STATE_HISTORY; + + +/* struct to remember history for closed tables */ + +typedef struct st_state_history_closed { + LSN create_rename_lsn; + MARIA_STATE_HISTORY *state_history; +} MARIA_STATE_HISTORY_CLOSED; + + +my_bool _ma_setup_live_state(MARIA_HA *info); +MARIA_STATE_HISTORY *_ma_remove_not_visible_states(MARIA_STATE_HISTORY + *org_history, + my_bool all, + my_bool trman_is_locked); +void _ma_reset_state(MARIA_HA *info); +my_bool _ma_get_status(void* param, my_bool concurrent_insert); +void _ma_update_status(void* param); +void _ma_update_status_with_lock(MARIA_HA *info); +void 
_ma_restore_status(void *param); +void _ma_copy_status(void* to, void *from); +my_bool _ma_reset_update_flag(void *param, my_bool concurrent_insert); +my_bool _ma_start_trans(void* param); +my_bool _ma_check_status(void *param); +void maria_versioning(MARIA_HA *info, my_bool versioning); +void _ma_set_share_data_file_length(struct st_maria_share *share, + ulonglong new_length); +void _ma_copy_nontrans_state_information(MARIA_HA *info); +my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit, + my_bool active_transactions); +my_bool _ma_row_visible_always(MARIA_HA *info); +my_bool _ma_row_visible_non_transactional_table(MARIA_HA *info); +my_bool _ma_row_visible_transactional_table(MARIA_HA *info); +void _ma_remove_not_visible_states_with_lock(struct st_maria_share *share, + my_bool all); +void _ma_remove_table_from_trnman(MARIA_HA *info); +void _ma_reset_history(struct st_maria_share *share); + +C_MODE_END +#endif diff --git a/storage/maria/ma_static.c b/storage/maria/ma_static.c new file mode 100644 index 00000000..2c9f1dcc --- /dev/null +++ b/storage/maria/ma_static.c @@ -0,0 +1,157 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2010, 2020, MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + + +/* + Static variables for MARIA library. 
All definied here for easy making of + a shared library +*/ + +#ifndef MY_GLOBAL_INCLUDED +#include "maria_def.h" +#include "trnman.h" +#endif + +LIST *maria_open_list=0; +uchar maria_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 9, '\003', }; +uchar maria_pack_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 10, '\001', }; +/* Unique number for this maria instance */ +uchar maria_uuid[MY_UUID_SIZE]; +uint maria_quick_table_bits=9; +ulong __attribute__((visibility("default"))) maria_block_size= MARIA_KEY_BLOCK_LENGTH; +my_bool maria_flush= 0, maria_single_user= 0; +my_bool maria_delay_key_write= 0, maria_page_checksums= 1; +my_bool maria_inited= FALSE; +my_bool maria_in_ha_maria= FALSE; /* If used from ha_maria or not */ +my_bool maria_recovery_changed_data= 0, maria_recovery_verbose= 0; +my_bool maria_assert_if_crashed_table= 0; +my_bool maria_checkpoint_disabled= 0; +my_bool maria_encrypt_tables= 0; +my_bool aria_readonly= 0; + +mysql_mutex_t THR_LOCK_maria; +#ifdef DONT_USE_RW_LOCKS +ulong maria_concurrent_insert= 0; +#else +/* Do concurrent inserts at file end or in old holes */ +ulong maria_concurrent_insert= 2; +#endif + +my_off_t maria_max_temp_length= MAX_FILE_SIZE; +ulong maria_bulk_insert_tree_size=8192*1024; +ulong maria_data_pointer_size= 6; + +PAGECACHE maria_pagecache_var; +PAGECACHE *maria_pagecache= &maria_pagecache_var; + +PAGECACHE maria_log_pagecache_var; +PAGECACHE *maria_log_pagecache= &maria_log_pagecache_var; +MY_TMPDIR *maria_tmpdir; /* Tempdir for redo */ +const char *maria_data_root; +HASH maria_stored_state; +int (*maria_create_trn_hook)(MARIA_HA *); +/* dummy_crash() does nothing; it is the default target of the ma_debug_crash_here hook */ +void dummy_crash(const char *keyword __attribute__((unused))) {} +void (*ma_debug_crash_here)(const char *keyword)= dummy_crash; + +/** + @brief when transactionality does not matter we can use this transaction + + Used in external programs like ma_test*, and also internally inside + libmaria when there is no transaction around and the operation isn't + transactional 
(CREATE/DROP/RENAME/OPTIMIZE/REPAIR). +*/ +TRN dummy_transaction_object; + +/* a WT_RESOURCE_TYPE for transactions waiting on a unique key conflict */ +WT_RESOURCE_TYPE ma_rc_dup_unique={ wt_resource_id_memcmp, 0}; + +/* Enough for comparing if number is zero */ +uchar maria_zero_string[]= {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + +/* + maria_read_vec[] converts the HA_READ_* mode named beside each entry to SEARCH_* / MBR_* flags + Position is , == , >= , <= , > , < +*/ + +uint32 maria_read_vec[]= +{ + SEARCH_FIND, /* HA_READ_KEY_EXACT */ + SEARCH_FIND | SEARCH_BIGGER, /* HA_READ_KEY_OR_NEXT */ + SEARCH_FIND | SEARCH_SMALLER, /* HA_READ_KEY_OR_PREV */ + SEARCH_NO_FIND | SEARCH_BIGGER, /* HA_READ_AFTER_KEY */ + SEARCH_NO_FIND | SEARCH_SMALLER, /* HA_READ_BEFORE_KEY */ + SEARCH_FIND | SEARCH_PART_KEY, /* HA_READ_PREFIX */ + SEARCH_LAST, /* HA_READ_PREFIX_LAST */ + SEARCH_LAST | SEARCH_SMALLER, /* HA_READ_PREFIX_LAST_OR_PREV */ + MBR_CONTAIN, /* HA_READ_MBR_CONTAIN */ + MBR_INTERSECT, /* HA_READ_MBR_INTERSECT */ + MBR_WITHIN, /* HA_READ_MBR_WITHIN */ + MBR_DISJOINT, /* HA_READ_MBR_DISJOINT */ + MBR_EQUAL /* HA_READ_MBR_EQUAL */ +}; +/* Search direction flags, 8 entries; presumably indexed like the first 8 entries of maria_read_vec -- TODO confirm */ +uint32 maria_readnext_vec[]= +{ + SEARCH_BIGGER, SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_BIGGER, SEARCH_SMALLER, + SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_SMALLER +}; + +my_bool (*ma_killed)(MARIA_HA *)= ma_killed_standalone; + +#ifdef HAVE_PSI_INTERFACE + +PSI_mutex_key key_SHARE_BITMAP_lock, key_SORT_INFO_mutex, + key_THR_LOCK_maria, key_TRANSLOG_BUFFER_mutex, + key_LOCK_soft_sync, + key_TRANSLOG_DESCRIPTOR_dirty_buffer_mask_lock, + key_TRANSLOG_DESCRIPTOR_sent_to_disk_lock, + key_TRANSLOG_DESCRIPTOR_log_flush_lock, + key_TRANSLOG_DESCRIPTOR_file_header_lock, + key_TRANSLOG_DESCRIPTOR_unfinished_files_lock, + key_TRANSLOG_DESCRIPTOR_purger_lock, + key_SHARE_intern_lock, key_SHARE_key_del_lock, + key_SHARE_close_lock, key_PAGECACHE_cache_lock, + key_SERVICE_THREAD_CONTROL_lock, + key_LOCK_trn_list, key_TRN_state_lock; + +PSI_cond_key key_SHARE_key_del_cond, 
key_SERVICE_THREAD_CONTROL_cond, + key_SORT_INFO_cond, key_SHARE_BITMAP_cond, + key_COND_soft_sync, key_TRANSLOG_BUFFER_waiting_filling_buffer, + key_TRANSLOG_BUFFER_prev_sent_to_disk_cond, + key_TRANSLOG_DESCRIPTOR_log_flush_cond, + key_TRANSLOG_DESCRIPTOR_new_goal_cond; + +PSI_rwlock_key key_KEYINFO_root_lock, key_SHARE_mmap_lock, + key_TRANSLOG_DESCRIPTOR_open_files_lock; + +PSI_thread_key key_thread_checkpoint, key_thread_find_all_keys, + key_thread_soft_sync; + +PSI_file_key key_file_translog, key_file_kfile, key_file_dfile, + key_file_control, key_file_tmp; + +#endif /* HAVE_PSI_INTERFACE */ + +/* Note that PSI_stage_info globals must always be declared. */ +PSI_stage_info stage_waiting_for_a_resource= { 0, "Waiting for a resource", 0}; + +#ifdef WITH_S3_STORAGE_ENGINE +#include "s3_func.h" +struct s3_func __attribute__((visibility("default"))) s3f; +#endif /* WITH_S3_STORAGE_ENGINE */ diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c new file mode 100644 index 00000000..d8a8b0a0 --- /dev/null +++ b/storage/maria/ma_statrec.c @@ -0,0 +1,299 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + + /* Functions to handle fixed-length-records */ + +#include "maria_def.h" + +/* Write a fixed-length record: reuse the slot at state.dellink when there is one and append_insert_at_end is not set, otherwise append at data_file_length. Returns 0 on success, 1 on read/write error, 2 with my_errno=HA_ERR_RECORD_FILE_FULL when the data file is full. */ +my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record) +{ + uchar temp[8]; /* max pointer length; also reused as zero fill after the record */ + if (info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end) + { + my_off_t filepos=info->s->state.dellink; + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_read(info, &temp[0],info->s->base.rec_reflength, + info->s->state.dellink+1, + MYF(MY_NABP))) + goto err; + info->s->state.dellink= _ma_rec_pos(info->s, temp); + info->state->del--; + info->state->empty-=info->s->base.pack_reclength; + if (info->s->file_write(info, record, info->s->base.reclength, + filepos, MYF(MY_NABP))) + goto err; + } + else + { + if (info->state->data_file_length > info->s->base.max_data_file_length- + info->s->base.pack_reclength) + { + my_errno=HA_ERR_RECORD_FILE_FULL; + return(2); + } + if (info->opt_flag & WRITE_CACHE_USED) + { /* Cache in use */ + if (my_b_write(&info->rec_cache, record, + info->s->base.reclength)) + goto err; + if (info->s->base.pack_reclength != info->s->base.reclength) + { + uint length=info->s->base.pack_reclength - info->s->base.reclength; + bzero(temp,length); + if (my_b_write(&info->rec_cache, temp,length)) + goto err; + } + } + else + { + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_write(info, record, info->s->base.reclength, + info->state->data_file_length, + info->s->write_flag)) + goto err; + if (info->s->base.pack_reclength != info->s->base.reclength) + { + uint length=info->s->base.pack_reclength - info->s->base.reclength; + bzero(temp,length); + if (info->s->file_write(info, temp,length, + info->state->data_file_length+ + info->s->base.reclength, 
info->s->write_flag)) + goto err; + } + } + info->state->data_file_length+=info->s->base.pack_reclength; + info->s->state.split++; + } + return 0; + err: + return 1; +} +/* Overwrite the record at 'pos' in place with 'record'; returns nonzero if the write fails */ +my_bool _ma_update_static_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec __attribute__ ((unused)), + const uchar *record) +{ + info->rec_cache.seek_not_done=1; /* We have done a seek */ + return (info->s->file_write(info, + record, info->s->base.reclength, + pos, + MYF(MY_NABP)) != 0); +} + +/* Mark the row at cur_row.lastpos as deleted and make it the new head of the state.dellink chain; returns nonzero if the write fails */ +my_bool _ma_delete_static_record(MARIA_HA *info, + const uchar *record __attribute__ ((unused))) +{ + uchar temp[9]; /* 1 delete-marker byte plus a record pointer of up to 8 bytes */ + info->state->del++; + info->state->empty+=info->s->base.pack_reclength; + temp[0]= '\0'; /* Mark that record is deleted */ + _ma_dpointer(info->s, temp+1, info->s->state.dellink); + info->s->state.dellink= info->cur_row.lastpos; + info->rec_cache.seek_not_done=1; + return (info->s->file_write(info, temp, 1+info->s->rec_reflength, + info->cur_row.lastpos, MYF(MY_NABP)) != 0); +} + +/* Flush any write cache and, when READ_CHECK_USED is set, compare 'old' with the record on disk at cur_row.lastpos. Returns 1 on flush/read error or on mismatch (my_errno=HA_ERR_RECORD_CHANGED), else 0. */ +my_bool _ma_cmp_static_record(register MARIA_HA *info, + register const uchar *old) +{ + DBUG_ENTER("_ma_cmp_static_record"); + + if (info->opt_flag & WRITE_CACHE_USED) + { + if (flush_io_cache(&info->rec_cache)) + { + DBUG_RETURN(1); + } + info->rec_cache.seek_not_done=1; /* We have done a seek */ + } + + if ((info->opt_flag & READ_CHECK_USED)) + { /* If check isn't disabled */ + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_read(info, info->rec_buff, info->s->base.reclength, + info->cur_row.lastpos, MYF(MY_NABP))) + DBUG_RETURN(1); + if (memcmp(info->rec_buff, old, (uint) info->s->base.reclength)) + { + DBUG_DUMP("read",old,info->s->base.reclength); + DBUG_DUMP("disk",info->rec_buff,info->s->base.reclength); + my_errno=HA_ERR_RECORD_CHANGED; /* Record has changed */ + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + +/* Read the record at 'pos' into info->rec_buff and return the result of _ma_unique_comp() against 'record'; returns 1 if the read fails */ +my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos) +{ + 
DBUG_ENTER("_ma_cmp_static_unique"); + + info->rec_cache.seek_not_done=1; /* We have done a seek */ + if (info->s->file_read(info, info->rec_buff, info->s->base.reclength, + pos, MYF(MY_NABP))) + DBUG_RETURN(1); + DBUG_RETURN(_ma_unique_comp(def, record, info->rec_buff, + def->null_are_equal)); +} + + +/* + Read a fixed-length-record + + RETURN + 0 Ok + HA_ERR_RECORD_DELETED record is deleted (its first byte is 0) + my_errno on flush or read error, or when pos is HA_OFFSET_ERROR +*/ + +int _ma_read_static_record(register MARIA_HA *info, register uchar *record, + MARIA_RECORD_POS pos) +{ + int error; + DBUG_ENTER("_ma_read_static_record"); + + if (pos != HA_OFFSET_ERROR) + { + if (info->opt_flag & WRITE_CACHE_USED && + info->rec_cache.pos_in_file <= pos && + flush_io_cache(&info->rec_cache)) + DBUG_RETURN(my_errno); + info->rec_cache.seek_not_done=1; /* We have done a seek */ + + error= (int) info->s->file_read(info, record,info->s->base.reclength, + pos, MYF(MY_NABP)); + if (! error) + { + fast_ma_writeinfo(info); + if (!*record) + { + /* Record is deleted */ + DBUG_PRINT("warning", ("Record is deleted")); + DBUG_RETURN((my_errno=HA_ERR_RECORD_DELETED)); + } + info->update|= HA_STATE_AKTIV; /* Record is read */ + DBUG_RETURN(0); + } + } + fast_ma_writeinfo(info); /* No such record */ + DBUG_RETURN(my_errno); +} + + +/** + @brief Read record from given position or next record + + @note + When scanning, this function will return HA_ERR_RECORD_DELETED + for deleted rows even if skip_deleted_blocks is set. + The reason for this is to allow the caller to calculate the record + position without having to call maria_position() for each record. 
+*/ + +int _ma_read_rnd_static_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks) +{ + int locked,error,cache_read; + uint cache_length; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_read_rnd_static_record"); + + cache_read=0; + cache_length=0; + if (info->opt_flag & READ_CACHE_USED) + { /* Cache in use */ + if (filepos == my_b_tell(&info->rec_cache) && + (skip_deleted_blocks || !filepos)) + { + cache_read=1; /* Read record using cache */ + cache_length= (uint) (info->rec_cache.read_end - + info->rec_cache.read_pos); + } + else + info->rec_cache.seek_not_done=1; /* Filepos is changed */ + } + locked=0; + if (info->lock_type == F_UNLCK) + { + if (filepos >= info->state->data_file_length) + { /* Test if new records */ + if (_ma_readinfo(info,F_RDLCK,0)) + DBUG_RETURN(my_errno); + locked=1; + } + else + { /* We don't need new info */ +#ifndef UNSAFE_LOCKING + if ((! cache_read || share->base.reclength > cache_length) && + share->tot_locks == 0) + { /* record not in cache */ + locked=1; + } +#else + info->tmp_lock_type=F_RDLCK; +#endif + } + } + if (filepos >= info->state->data_file_length) + { + DBUG_PRINT("test",("filepos: %ld (%ld) records: %ld del: %ld", + (long) filepos/share->base.reclength, (long) filepos, + (long) info->state->records, (long) info->state->del)); + fast_ma_writeinfo(info); + DBUG_RETURN(my_errno=HA_ERR_END_OF_FILE); + } + info->cur_row.lastpos= filepos; + info->cur_row.nextpos= filepos+share->base.pack_reclength; + + if (! 
cache_read) /* No caching */ + { + error= _ma_read_static_record(info, buf, filepos); + DBUG_RETURN(error); + } + + /* Read record through the cache */ + error=my_b_read(&info->rec_cache, buf, share->base.reclength); + if (info->s->base.pack_reclength != info->s->base.reclength && !error) + { + uchar tmp[8]; /* Skip fill bytes */ + error=my_b_read(&info->rec_cache, tmp, + info->s->base.pack_reclength - info->s->base.reclength); + } + if (locked) + _ma_writeinfo(info,0); /* Unlock keyfile */ + if (!error) + { + if (!buf[0]) + { /* Record is removed */ + DBUG_RETURN(my_errno=HA_ERR_RECORD_DELETED); + } + /* Found and may be updated */ + info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED; + DBUG_RETURN(0); + } + /* my_errno should be set if rec_cache.error == -1 */ + if (info->rec_cache.error != -1 || my_errno == 0) + _ma_set_fatal_error(info, HA_ERR_WRONG_IN_RECORD); + DBUG_RETURN(my_errno); /* Something wrong (EOF?) */ +} diff --git a/storage/maria/ma_test1.c b/storage/maria/ma_test1.c new file mode 100644 index 00000000..22f80ca2 --- /dev/null +++ b/storage/maria/ma_test1.c @@ -0,0 +1,929 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2020, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Testing of the basic functions of a MARIA table */ + +#include "maria_def.h" +#include <my_getopt.h> +#include <m_string.h> +#include "ma_control_file.h" +#include "ma_loghandler.h" +#include "ma_checkpoint.h" +#include "trnman.h" + +extern PAGECACHE *maria_log_pagecache; +extern const char *maria_data_root; + +#define MAX_REC_LENGTH 1024 + +static void usage(); + +static int rec_pointer_size=0, flags[50], testflag, checkpoint; +static int key_field=FIELD_SKIP_PRESPACE,extra_field=FIELD_SKIP_ENDSPACE; +static int key_type=HA_KEYTYPE_NUM; +static int create_flag=0; +static ulong blob_length; +static enum data_file_type record_type= DYNAMIC_RECORD; + +static uint insert_count, update_count, remove_count; +static uint pack_keys=0, pack_seg=0, key_length; +static uint unique_key=HA_NOSAME; +static uint die_in_middle_of_transaction; +static my_bool pagecacheing, null_fields, silent, skip_update, opt_unique; +static my_bool verbose, skip_delete, transactional; +static my_bool opt_versioning= 0; +static MARIA_COLUMNDEF recinfo[4]; +static MARIA_KEYDEF keyinfo[10]; +static HA_KEYSEG keyseg[10]; +static HA_KEYSEG uniqueseg[10]; + +static int run_test(const char *filename); +static void get_options(int argc, char *argv[]); +static void create_key(uchar *key,uint rownr); +static void create_record(uchar *record,uint rownr); +static void update_record(uchar *record); + +/* + These are here only for testing of recovery with undo. 
We are not + including maria_def.h here as this test is also to be an example of + how to use maria outside of the maria directory. NOTE(review): maria_def.h is in fact included at the top of this file, so this statement looks stale -- confirm +*/ + +extern int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index, + enum flush_type flush_type_for_data, + enum flush_type flush_type_for_index); +#define MARIA_FLUSH_DATA 1 + +/* Parse options, initialize maria, both page caches, the control file and the log (plus trnman/checkpoint when transactional), then exit with the result of run_test() */ +int main(int argc,char *argv[]) +{ + char buff[FN_REFLEN]; +#ifdef SAFE_MUTEX + safe_mutex_deadlock_detector= 1; +#endif + MY_INIT(argv[0]); + maria_data_root= "."; + get_options(argc,argv); + /* Maria requires that we always have a page cache */ + if (maria_init() || + (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0, + maria_block_size, 0, MY_WME) == 0) || + ma_control_file_open(TRUE, TRUE, TRUE) || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0) || + (transactional && (trnman_init(0) || ma_checkpoint_init(0)))) + { + fprintf(stderr, "Error in initialization\n"); + exit(1); + } + if (opt_versioning) + init_thr_lock(); + + exit(run_test(fn_format(buff, "test1", maria_data_root, "", MYF(0)))); +} + +/* Create the table 'filename', then insert, update, delete and read back rows, stopping early as directed by testflag and the row counters. Returns 0 on success and 1 on error; several paths call exit() directly. */ +static int run_test(const char *filename) +{ + MARIA_HA *file; + int i,j= 0,error,deleted,rec_length,uniques=0; + uint offset_to_key; + ha_rows found,row_count; + uchar record[MAX_REC_LENGTH],key[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH]; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + + if (die_in_middle_of_transaction) + null_fields= 1; + + bzero((char*) recinfo,sizeof(recinfo)); + bzero((char*) &create_info,sizeof(create_info)); + + /* First define 2 columns */ + create_info.null_bytes= 1; + recinfo[0].type= key_field; + recinfo[0].length= (key_field == FIELD_BLOB ? 
4+portable_sizeof_char_ptr : + key_length); + if (key_field == FIELD_VARCHAR) + recinfo[0].length+= HA_VARCHAR_PACKLENGTH(key_length); + recinfo[1].type=extra_field; + recinfo[1].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : 24); + if (extra_field == FIELD_VARCHAR) + recinfo[1].length+= HA_VARCHAR_PACKLENGTH(recinfo[1].length); + recinfo[1].null_bit= null_fields ? 2 : 0; + + if (opt_unique) + { + recinfo[2].type=FIELD_CHECK; + recinfo[2].length=MARIA_UNIQUE_HASH_LENGTH; + } + rec_length= recinfo[0].length + recinfo[1].length + recinfo[2].length + + create_info.null_bytes; + + if (key_type == HA_KEYTYPE_VARTEXT1 && + key_length > 255) + key_type= HA_KEYTYPE_VARTEXT2; + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].seg[0].type= key_type; + keyinfo[0].seg[0].flag= pack_seg; + keyinfo[0].seg[0].start=1; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language= default_charset_info->number; + if (pack_seg & HA_BLOB_PART) + { + keyinfo[0].seg[0].bit_start=4; /* Length of blob length */ + } + keyinfo[0].flag = (uint8) (pack_keys | unique_key); + + bzero((uchar*) flags,sizeof(flags)); + if (opt_unique) + { + uint start; + uniques=1; + bzero((char*) &uniquedef,sizeof(uniquedef)); + bzero((char*) uniqueseg,sizeof(uniqueseg)); + uniquedef.seg=uniqueseg; + uniquedef.keysegs=2; + + /* Make a unique over all columns (except first NULL fields) */ + for (i=0, start=1 ; i < 2 ; i++) + { + uniqueseg[i].start=start; + start+=recinfo[i].length; + uniqueseg[i].length=recinfo[i].length; + uniqueseg[i].language= default_charset_info->number; + } + uniqueseg[0].type= key_type; + uniqueseg[0].null_bit= null_fields ? 
2 : 0; + uniqueseg[1].type= HA_KEYTYPE_TEXT; + if (extra_field == FIELD_BLOB) + { + uniqueseg[1].length=0; /* The whole blob */ + uniqueseg[1].bit_start=4; /* long blob */ + uniqueseg[1].flag|= HA_BLOB_PART; + } + else if (extra_field == FIELD_VARCHAR) + { + uniqueseg[1].flag|= HA_VAR_LENGTH_PART; + uniqueseg[1].type= (HA_VARCHAR_PACKLENGTH(recinfo[1].length-1) == 1 ? + HA_KEYTYPE_VARTEXT1 : HA_KEYTYPE_VARTEXT2); + } + } + else + uniques=0; + + offset_to_key= MY_TEST(null_fields); + if (key_field == FIELD_BLOB || key_field == FIELD_VARCHAR) + offset_to_key+= 2; + + if (!silent) + printf("- Creating maria file\n"); + create_info.max_rows=(ulong) (rec_pointer_size ? + (1L << (rec_pointer_size*8))/40 : + 0); + create_info.transactional= transactional; + if (maria_create(filename, record_type, 1, keyinfo,2+opt_unique,recinfo, + uniques, &uniquedef, &create_info, + create_flag)) + goto err; + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED, 0))) + goto err; + if (!silent) + printf("- Writing key:s\n"); + + if (maria_begin(file)) + goto err; + if (opt_versioning) + maria_versioning(file, 1); + my_errno=0; + row_count=deleted=0; + for (i=49 ; i>=1 ; i-=2 ) + { + if (insert_count-- == 0) + { + if (testflag) + break; + maria_close(file); + exit(0); + } + j=i%25 +1; + create_record(record,j); + error=maria_write(file,record); + if (!error) + row_count++; + flags[j]=1; + if (verbose || error) + printf("J= %2d maria_write: %d errno: %d\n", j,error,my_errno); + } + + if (maria_commit(file) || maria_begin(file)) + goto err; + + if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (testflag == 1) + goto end; + + /* Insert 2 rows with null values */ + if (null_fields) + { + create_record(record,0); + error=maria_write(file,record); + if (!error) + row_count++; + if (verbose || error) + printf("J= NULL maria_write: %d errno: %d\n", error,my_errno); + error=maria_write(file,record); + if (!error) + row_count++; + if (verbose || error) + 
printf("J= NULL maria_write: %d errno: %d\n", error,my_errno); + flags[0]=2; + } + + if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (testflag == 2) + { + printf("Terminating after inserts\n"); + goto end; + } + + if (maria_commit(file) || maria_begin(file)) + goto err; + + if (!skip_update) + { + if (opt_unique) + { + if (!silent) + printf("- Checking unique constraint\n"); + create_record(record,j); /* Check last created row */ + if (!maria_write(file,record) || my_errno != HA_ERR_FOUND_DUPP_UNIQUE) + { + printf("unique check failed\n"); + } + } + if (!silent) + printf("- Updating rows\n"); + + create_key(key, j); + if ((maria_rkey(file, read_record, 0, key, + HA_WHOLE_KEY, HA_READ_KEY_EXACT))) + printf("Can't find last written row with maria_rkey\n"); + + /* First update the last written row, to force the file to be extended */ + if (maria_rsame(file,read_record,-1)) + { + printf("Can't find last row with maria_rsame\n"); + } + else + { + memcpy(record,read_record,rec_length); + update_record(record); + if (maria_update(file,read_record,record)) + { + printf("Can't update last row: %.*s\n", + keyinfo[0].seg[0].length,read_record+1); + } + } + + /* Read through all rows and update them */ + maria_scan_init(file); + + found=0; + while ((error= maria_scan(file,read_record)) == 0) + { + if (--update_count == 0) { maria_close(file); exit(0) ; } + memcpy(record,read_record,rec_length); + update_record(record); + if (maria_update(file,read_record,record)) + { + printf("Can't update row: %.*s, error: %d\n", + keyinfo[0].seg[0].length,record+1,my_errno); + } + found++; + } + if (found != row_count) + printf("Found %ld of %ld rows\n", (ulong) found, (ulong) row_count); + maria_scan_end(file); + } + + if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (testflag == 3) + { + printf("Terminating after updates\n"); + goto end; + } + if (!silent) + printf("- Reopening file\n"); + if (maria_commit(file)) + goto err; + 
if (maria_close(file)) + goto err; + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED, 0))) + goto err; + if (maria_begin(file)) + goto err; + if (opt_versioning) + maria_versioning(file, 1); + if (!skip_delete) + { + if (!silent) + printf("- Removing keys\n"); + + for (i=0 ; i <= 10 ; i++) + { + /* + If you want to debug the problem in ma_test_recovery with BLOBs + (see @todo there), you can break out of the loop after just one + delete, it is enough, like this: + if (i==1) break; + */ + /* testing */ + if (remove_count-- == 0) + { + fprintf(stderr, + "delete-rows number of rows deleted; Going down hard!\n"); + goto end; + } + j=i*2; + if (!flags[j]) + continue; + create_key(key,j); + my_errno=0; + if ((error = maria_rkey(file, read_record, 0, key, + HA_WHOLE_KEY, HA_READ_KEY_EXACT))) + { + if (verbose || (flags[j] >= 1 || + (error && my_errno != HA_ERR_KEY_NOT_FOUND))) + printf("key: '%.*s' maria_rkey: %3d errno: %3d\n", + (int) key_length,key+offset_to_key,error,my_errno); + } + else + { + error=maria_delete(file,read_record); + if (verbose || error) + printf("key: '%.*s' maria_delete: %3d errno: %3d\n", + (int) key_length, key+offset_to_key, error, my_errno); + if (! 
error) + { + deleted++; + flags[j]--; + } + } + } + } + + if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (testflag == 4) + { + printf("Terminating after deletes\n"); + goto end; + } + + if (!silent) + printf("- Reading rows with key\n"); + record[1]= 0; /* For nicer printf */ + + if (record_type == NO_RECORD) + maria_extra(file, HA_EXTRA_KEYREAD, 0); + + for (i=0 ; i <= 25 ; i++) + { + create_key(key,i); + my_errno=0; + error=maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT); + if (verbose || + (error == 0 && flags[i] == 0 && unique_key) || + (error && (flags[i] != 0 || my_errno != HA_ERR_KEY_NOT_FOUND))) + { + printf("key: '%.*s' maria_rkey: %3d errno: %3d record: %s\n", + (int) key_length,key+offset_to_key,error,my_errno,record+1); + } + } + if (record_type == NO_RECORD) + { + maria_extra(file, HA_EXTRA_NO_KEYREAD, 0); + goto end; + } + + if (!silent) + printf("- Reading rows with position\n"); + + if (maria_scan_init(file)) + { + fprintf(stderr, "maria_scan_init failed\n"); + goto err; + } + + for (i=1,found=0 ; i <= 30 ; i++) + { + my_errno=0; + if ((error= maria_scan(file, read_record)) == HA_ERR_END_OF_FILE) + { + if (found != row_count-deleted) + printf("Found only %ld of %ld rows\n", (ulong) found, + (ulong) (row_count - deleted)); + break; + } + if (!error) + found++; + if (verbose || (error != 0 && error != HA_ERR_RECORD_DELETED && + error != HA_ERR_END_OF_FILE)) + { + printf("pos: %2d maria_rrnd: %3d errno: %3d record: %s\n", + i-1,error,my_errno,read_record+1); + } + } + maria_scan_end(file); + +end: + if (die_in_middle_of_transaction) + { + /* As commit record is not done, UNDO entries need to be rolled back */ + switch (die_in_middle_of_transaction) { + case 1: + /* + Flush changed pages to disk. That will also flush log. Recovery + will skip REDOs and apply UNDOs. 
+ */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE); + break; + case 2: + /* + Just flush log. Pages are likely to not be on disk. Recovery will + then execute REDOs and UNDOs. + */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + case 3: + /* + Flush nothing. Pages and log are likely to not be on disk. Recovery + will then do nothing. + */ + break; + case 4: + /* + Flush changed data pages to disk. Changed index pages are not + flushed. Recovery will skip some REDOs and apply UNDOs. + */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE, + FLUSH_RELEASE); + /* + We have to flush log separately as the redo for the last key page + may not be flushed + */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + } + printf("Dying on request without maria_commit()/maria_close()\n"); + sf_leaking_memory= 1; + exit(0); + } + + if (maria_commit(file)) + goto err; + if (maria_close(file)) + goto err; + maria_end(); + my_uuid_end(); + my_end(MY_CHECK_ERROR); + + return (0); +err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ +} + +/* Write the printable key text for row 'rownr' into 'key', formatted according to the type, flag and length of keyinfo[0].seg[0] */ +static void create_key_part(uchar *key,uint rownr) +{ + if (!unique_key) + rownr&=7; /* Some identical keys */ + if (keyinfo[0].seg[0].type == HA_KEYTYPE_NUM) + { + sprintf((char*) key,"%*d",keyinfo[0].seg[0].length,rownr); + } + else if (keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT1 || + keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT2) + { /* Alpha record */ + /* Create a key that may be easily packed */ + bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B'); + sprintf((char*) key+keyinfo[0].seg[0].length-2,"%-2d",rownr); + if ((rownr & 7) == 0) + { + /* Change the key to force an unpack of the next key */ + bfill(key+3,keyinfo[0].seg[0].length-5,rownr < 10 ? 
'a' : 'b'); + } + } + else + { /* Alpha record */ + if (keyinfo[0].seg[0].flag & HA_SPACE_PACK) + sprintf((char*) key,"%-*d",keyinfo[0].seg[0].length,rownr); + else + { + /* Create a key that may be easily packed */ + bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B'); + sprintf((char*) key+keyinfo[0].seg[0].length-2,"%-2d",rownr); + if ((rownr & 7) == 0) + { + /* Change the key to force an unpack of the next key */ + key[1]= (rownr < 10 ? 'a' : 'b'); + } + } + } +} + +/* Build the search key for row 'rownr': a null byte when the key segment is nullable, a 2-byte length for blob/varchar segments, then the text from create_key_part() */ +static void create_key(uchar *key,uint rownr) +{ + if (keyinfo[0].seg[0].null_bit) + { + if (rownr == 0) + { + key[0]=1; /* null key */ + key[1]=0; /* For easy print of key */ + return; + } + *key++=0; + } + if (keyinfo[0].seg[0].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + size_t tmp; + create_key_part(key+2,rownr); + tmp=strlen((char*) key+2); + int2store(key,tmp); + } + else + create_key_part(key,rownr); +} + + +static uchar blob_key[MAX_REC_LENGTH]; +static uchar blob_record[MAX_REC_LENGTH+20*20]; + +/* Build the row image for 'rownr' in 'record' (MAX_REC_LENGTH bytes are cleared first); blob columns store a pointer to the static blob_key / blob_record buffers */ +static void create_record(uchar *record,uint rownr) +{ + uchar *pos; + bzero((char*) record,MAX_REC_LENGTH); + record[0]=1; /* delete marker */ + if (rownr == 0 && keyinfo[0].seg[0].null_bit) + record[0]|=keyinfo[0].seg[0].null_bit; /* Null key */ + + pos=record+1; + if (recinfo[0].type == FIELD_BLOB) + { + size_t tmp; + uchar *ptr; + create_key_part(blob_key,rownr); + tmp=strlen((char*) blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy(pos+4,&ptr,sizeof(char*)); + pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + size_t tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1); + create_key_part(pos+pack_length,rownr); + tmp= strlen((char*) pos+pack_length); + if (pack_length == 1) + *(uchar*) pos= (uchar) tmp; + else + int2store(pos,tmp); + pos+= recinfo[0].length; + } + else + { + create_key_part(pos,rownr); + pos+=recinfo[0].length; + } + if (recinfo[1].type == FIELD_BLOB) + { + size_t tmp; + uchar *ptr;; + sprintf((char*) blob_record,"... 
row: %d", rownr); + strappend((char*) blob_record,MY_MAX(MAX_REC_LENGTH-rownr,10),' '); + tmp=strlen((char*) blob_record); + int4store(pos,tmp); + ptr=blob_record; + memcpy(pos+4,&ptr,sizeof(char*)); + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + size_t tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1); + sprintf((char*) pos+pack_length, "... row: %d", rownr); + tmp= strlen((char*) pos+pack_length); + if (pack_length == 1) + *pos= (uchar) tmp; + else + int2store(pos,tmp); + } + else + { + sprintf((char*) pos,"... row: %d", rownr); + strappend((char*) pos,recinfo[1].length,' '); + } +} + +/* Change the row in place to test re-packing of rows and reallocation of keys: non-numeric key text is lower-cased and the second column is padded with '.' */ + +static void update_record(uchar *record) +{ + uchar *pos=record+1; + if (recinfo[0].type == FIELD_BLOB) + { + uchar *column,*ptr; + int length; + length=uint4korr(pos); /* Long blob */ + memcpy(&column,pos+4,sizeof(char*)); + memcpy(blob_key,column,length); /* Move old key */ + ptr=blob_key; + memcpy(pos+4,&ptr,sizeof(char*)); /* Store pointer to new key */ + if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM) + my_ci_casedn(default_charset_info, (char*) blob_key, length, + (char*) blob_key, length); + pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1); + uint length= pack_length == 1 ? 
(uint) *(uchar*) pos : uint2korr(pos); + my_ci_casedn(default_charset_info, (char*) pos + pack_length, length, + (char*) pos + pack_length, length); + pos+=recinfo[0].length; + } + else + { + if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM) + my_ci_casedn(default_charset_info, (char*) pos, keyinfo[0].seg[0].length, + (char*) pos, keyinfo[0].seg[0].length); + pos+=recinfo[0].length; + } + + if (recinfo[1].type == FIELD_BLOB) + { + uchar *column; + int length; + length=uint4korr(pos); + memcpy(&column,pos+4,sizeof(char*)); + memcpy(blob_record,column,length); + bfill(blob_record+length,20,'.'); /* Make it larger */ + length+=20; + int4store(pos,length); + column=blob_record; + memcpy(pos+4,&column,sizeof(char*)); + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + /* Second field is longer than 10 characters */ + uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1); + uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos); + pos= record+ recinfo[1].offset; + bfill(pos+pack_length+length,recinfo[1].length-length-pack_length,'.'); + length=recinfo[1].length-pack_length; + if (pack_length == 1) + *(uchar*) pos= (uchar) length; + else + int2store(pos,length); + } + else + { + bfill(pos+recinfo[1].length-10,10,'.'); + } +} + + +static struct my_option my_long_options[] = +{ + {"checkpoint", 'H', "Checkpoint at specified stage", (uchar**) &checkpoint, + (uchar**) &checkpoint, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"checksum", 'c', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, +#ifndef DBUG_OFF + {"debug", '#', "Undocumented", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"datadir", 'h', "Path to the database root.", (char**) &maria_data_root, + (char**) &maria_data_root, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"delete-rows", 'd', "Abort after this many rows has been deleted", + (uchar**) &remove_count, (uchar**) &remove_count, 0, GET_UINT, REQUIRED_ARG, + 1000, 0, 0, 0, 0, 0}, + {"help", '?', 
"Display help and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"insert-rows", 'i', "Undocumented", (uchar**) &insert_count, + (uchar**) &insert_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0}, + {"key-alpha", 'a', "Use a key of type HA_KEYTYPE_TEXT", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-binary-pack", 'B', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-blob", 'b', "Undocumented", + (uchar**) &blob_length, (uchar**) &blob_length, + 0, GET_ULONG, OPT_ARG, 0, 0, 0, 0, 0, 0}, + {"key-cache", 'K', "Undocumented", (uchar**) &pagecacheing, + (uchar**) &pagecacheing, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-length", 'k', "Undocumented", (uchar**) &key_length, + (uchar**) &key_length, 0, GET_UINT, REQUIRED_ARG, 6, 0, 0, 0, 0, 0}, + {"key-multiple", 'm', "Don't use unique keys", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-prefix_pack", 'P', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-space_pack", 'p', "Undocumented", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"key-varchar", 'w', "Test VARCHAR keys", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"null-fields", 'N', "Define fields with NULL", + (uchar**) &null_fields, (uchar**) &null_fields, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"row-fixed-size", 'S', "Fixed size records", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"rows-in-block", 'M', "Store rows in block format", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"rows-no-data", 'n', "Don't store any data, only keys", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"row-pointer-size", 'R', "Undocumented", (uchar**) &rec_pointer_size, + (uchar**) &rec_pointer_size, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"silent", 's', "Undocumented", + (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, + 0, 0}, + {"skip-delete", 'D', "Don't test deletes", (uchar**) &skip_delete, + (uchar**) &skip_delete, 
0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"skip-update", 'U', "Don't test updates", (uchar**) &skip_update, + (uchar**) &skip_update, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"testflag", 't', "Stop test at specified stage", (uchar**) &testflag, + (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"test-undo", 'A', + "Abort hard. Used for testing recovery with undo", + (uchar**) &die_in_middle_of_transaction, + (uchar**) &die_in_middle_of_transaction, + 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"transactional", 'T', + "Test in transactional mode. (Only works with block format)", + (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, + {"unique", 'E', "Check unique handling", (uchar**) &opt_unique, + (uchar**) &opt_unique, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"update-rows", 'u', "Max number of rows to update", (uchar**) &update_count, + (uchar**) &update_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0}, + {"verbose", 'v', "Be more verbose", (uchar**) &verbose, + (uchar**) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version number and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"versioning", 'C', "Use row versioning (only works with block format)", + (uchar**) &opt_versioning, (uchar**) &opt_versioning, 0, GET_BOOL, + NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static my_bool +get_one_option(const struct my_option *opt, + const char *argument __attribute__((unused)), + const char *filename __attribute__((unused))) +{ + switch(opt->id) { + case 'a': + key_type= HA_KEYTYPE_TEXT; + break; + case 'c': + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + break; + case 'R': /* Length of record pointer */ + if (rec_pointer_size > 3) + rec_pointer_size=0; + break; + case 'P': + pack_keys= HA_PACK_KEY; /* Use prefix compression */ + break; + case 'B': + pack_keys= HA_BINARY_PACK_KEY; /* 
Use binary compression */ + break; + case 'M': + record_type= BLOCK_RECORD; + break; + case 'n': + record_type= NO_RECORD; + break; + case 'S': + if (key_field == FIELD_VARCHAR) + { + create_flag=0; /* Static sized varchar */ + record_type= STATIC_RECORD; + } + else if (key_field != FIELD_BLOB) + { + key_field=FIELD_NORMAL; /* static-size record */ + extra_field=FIELD_NORMAL; + record_type= STATIC_RECORD; + } + break; + case 'p': + pack_keys=HA_PACK_KEY; /* Use prefix + space packing */ + pack_seg=HA_SPACE_PACK; + key_type=HA_KEYTYPE_TEXT; + break; + case 'm': + unique_key=0; + break; + case 'b': + key_field=FIELD_BLOB; /* blob key */ + extra_field= FIELD_BLOB; + pack_seg|= HA_BLOB_PART; + key_type= HA_KEYTYPE_VARTEXT1; + if (record_type == STATIC_RECORD) + record_type= DYNAMIC_RECORD; + break; + case 'k': + if (key_length < 4 || key_length > MARIA_MAX_KEY_LENGTH) + { + fprintf(stderr,"Wrong key length\n"); + exit(1); + } + break; + case 'w': + key_field=FIELD_VARCHAR; /* varchar keys */ + extra_field= FIELD_VARCHAR; + key_type= HA_KEYTYPE_VARTEXT1; + pack_seg|= HA_VAR_LENGTH_PART; + if (record_type == STATIC_RECORD) + record_type= DYNAMIC_RECORD; + break; + case 'K': /* Use key cacheing */ + pagecacheing=1; + break; + case 'V': + printf("test1 Ver 1.2 \n"); + exit(0); + case '#': + DBUG_PUSH(argument); + break; + case '?': + usage(); + exit(1); + } + return 0; +} + + +/* Read options */ + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option))) + exit(ho_error); + if (transactional) + record_type= BLOCK_RECORD; + if (record_type == NO_RECORD) + skip_update= skip_delete= 1; + + + return; +} /* get options */ + + +static void usage() +{ + printf("Usage: %s [options]\n\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} + +#include "ma_check_standalone.h" + diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c new file mode 
100644 index 00000000..66284653 --- /dev/null +++ b/storage/maria/ma_test2.c @@ -0,0 +1,1263 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Test of an ISAM database: big test */ + +#ifndef USE_MY_FUNC /* We want to be able to dbug this !! */ +#define USE_MY_FUNC +#endif +#include "maria_def.h" +#include "trnman.h" +#include <m_ctype.h> +#include <my_bit.h> +#include "ma_checkpoint.h" + +#define STANDARD_LENGTH 37 /* Leading part of a test record; main() uses STANDARD_LENGTH+60 bytes, plus 8 when a blob is used */ +#define MARIA_KEYS 6 /* Number of key definitions set up by main(); first dimension of glob_keyseg */ +#define MAX_PARTS 4 /* Key segments reserved per key; second dimension of glob_keyseg */ + +static void get_options(int argc, char *argv[]); +static uint rnd(uint max_value); +static void fix_length(uchar *record,uint length); +static void put_blob_in_record(uchar *blob_pos,char **blob_buffer, + ulong *length); +static void copy_key(MARIA_HA *info, uint inx, uchar *record, uchar *key); + +static int verbose= 0, testflag= 0, first_key= 0, async_io= 0, pagecacheing= 0; +static int write_cacheing= 0, do_locking= 0, rec_pointer_size= 0; +static int silent= 0, opt_quick_mode= 0, transactional= 0, skip_update= 0; +static int die_in_middle_of_transaction= 0, pack_fields= 1; +static int pack_seg= HA_SPACE_PACK, pack_type= HA_PACK_KEY, remove_count= -1; +static int create_flag= 0, srand_arg= 0, checkpoint= 0; +static my_bool opt_versioning= 0; +static uint use_blob= 0, update_count= 0; +static ulong pagecache_size=8192*32; +static
enum data_file_type record_type= DYNAMIC_RECORD; + +static uint keys=MARIA_KEYS,recant=1000; +static uint16 key1[1001],key3[5001]; +static uchar record[300],record2[300],key[100],key2[100]; +static uchar read_record[300],read_record2[300],read_record3[300]; +static HA_KEYSEG glob_keyseg[MARIA_KEYS][MAX_PARTS]; + + /* Test program */ + +int main(int argc, char *argv[]) +{ + uint i; + int j,n1,n2,n3,error,k; + uint write_count,update,dupp_keys,opt_delete,start,length,blob_pos, + reclength,ant,found_parts; + my_off_t lastpos; + ha_rows range_records,records; + MARIA_HA *file; + MARIA_KEYDEF keyinfo[10]; + MARIA_COLUMNDEF recinfo[10]; + MARIA_INFO info; + char *blob_buffer; + MARIA_CREATE_INFO create_info; + char filename[FN_REFLEN]; + page_range pages; + +#ifdef SAFE_MUTEX + safe_mutex_deadlock_detector= 1; +#endif + MY_INIT(argv[0]); + + maria_data_root= "."; + get_options(argc,argv); + fn_format(filename, "test2", maria_data_root, "", MYF(0)); + + if (! async_io) + my_disable_async_io=1; + + /* If we sync or not have no affect on this test */ + my_disable_sync= 1; + + /* Maria requires that we always have a page cache */ + if (maria_init() || + (init_pagecache(maria_pagecache, pagecache_size, 0, 0, + maria_block_size, 0, MY_WME) == 0) || + ma_control_file_open(TRUE, TRUE, TRUE) || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0) || + (transactional && (trnman_init(0) || ma_checkpoint_init(0)))) + { + fprintf(stderr, "Error in initialization"); + exit(1); + } + if (opt_versioning) + init_thr_lock(); + + reclength=STANDARD_LENGTH+60+(use_blob ? 
8 : 0); + blob_pos=STANDARD_LENGTH+60; + keyinfo[0].seg= &glob_keyseg[0][0]; + keyinfo[0].seg[0].start=0; + keyinfo[0].seg[0].length=6; + keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[0].seg[0].language= default_charset_info->number; + keyinfo[0].seg[0].flag=(uint8) pack_seg; + keyinfo[0].seg[0].null_bit=0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].keysegs=1; + keyinfo[0].flag = pack_type; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[1].seg= &glob_keyseg[1][0]; + keyinfo[1].seg[0].start=7; + keyinfo[1].seg[0].length=6; + keyinfo[1].seg[0].type=HA_KEYTYPE_BINARY; + keyinfo[1].seg[0].flag=0; + keyinfo[1].seg[0].null_bit=0; + keyinfo[1].seg[0].null_pos=0; + keyinfo[1].seg[1].start=0; /* two part key */ + keyinfo[1].seg[1].length=6; + keyinfo[1].seg[1].type=HA_KEYTYPE_NUM; + keyinfo[1].seg[1].flag=HA_REVERSE_SORT; + keyinfo[1].seg[1].null_bit=0; + keyinfo[1].seg[1].null_pos=0; + keyinfo[1].key_alg=HA_KEY_ALG_BTREE; + keyinfo[1].keysegs=2; + keyinfo[1].flag =0; + keyinfo[1].block_length= MARIA_MIN_KEY_BLOCK_LENGTH; /* Diff blocklength */ + keyinfo[2].seg= &glob_keyseg[2][0]; + keyinfo[2].seg[0].start=12; + keyinfo[2].seg[0].length=8; + keyinfo[2].seg[0].type=HA_KEYTYPE_BINARY; + keyinfo[2].seg[0].flag=HA_REVERSE_SORT; + keyinfo[2].seg[0].null_bit=0; + keyinfo[2].seg[0].null_pos=0; + keyinfo[2].key_alg=HA_KEY_ALG_BTREE; + keyinfo[2].keysegs=1; + keyinfo[2].flag =HA_NOSAME; + keyinfo[2].block_length= 0; /* Default block length */ + keyinfo[3].seg= &glob_keyseg[3][0]; + keyinfo[3].seg[0].start=0; + keyinfo[3].seg[0].length=reclength-(use_blob ? 
8 : 0); + keyinfo[3].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[3].seg[0].language=default_charset_info->number; + keyinfo[3].seg[0].flag=(uint8) pack_seg; + keyinfo[3].seg[0].null_bit=0; + keyinfo[3].seg[0].null_pos=0; + keyinfo[3].key_alg=HA_KEY_ALG_BTREE; + keyinfo[3].keysegs=1; + keyinfo[3].flag = pack_type; + keyinfo[3].block_length= 0; /* Default block length */ + keyinfo[4].seg= &glob_keyseg[4][0]; + keyinfo[4].seg[0].start=0; + keyinfo[4].seg[0].length=5; + keyinfo[4].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[4].seg[0].language=default_charset_info->number; + keyinfo[4].seg[0].flag=0; + keyinfo[4].seg[0].null_bit=0; + keyinfo[4].seg[0].null_pos=0; + keyinfo[4].key_alg=HA_KEY_ALG_BTREE; + keyinfo[4].keysegs=1; + keyinfo[4].flag = pack_type; + keyinfo[4].block_length= 0; /* Default block length */ + keyinfo[5].seg= &glob_keyseg[5][0]; + keyinfo[5].seg[0].start=0; + keyinfo[5].seg[0].length=4; + keyinfo[5].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[5].seg[0].language=default_charset_info->number; + keyinfo[5].seg[0].flag=pack_seg; + keyinfo[5].seg[0].null_bit=0; + keyinfo[5].seg[0].null_pos=0; + keyinfo[5].key_alg=HA_KEY_ALG_BTREE; + keyinfo[5].keysegs=1; + keyinfo[5].flag = pack_type; + keyinfo[5].block_length= 0; /* Default block length */ + + recinfo[0].type=pack_fields ? FIELD_SKIP_PRESPACE : 0; + recinfo[0].length=7; + recinfo[0].null_bit=0; + recinfo[0].null_pos=0; + recinfo[1].type=pack_fields ? FIELD_SKIP_PRESPACE : 0; + recinfo[1].length=5; + recinfo[1].null_bit=0; + recinfo[1].null_pos=0; + recinfo[2].type=pack_fields ? FIELD_SKIP_PRESPACE : 0; + recinfo[2].length=9; + recinfo[2].null_bit=0; + recinfo[2].null_pos=0; + recinfo[3].type=FIELD_NORMAL; + recinfo[3].length=STANDARD_LENGTH-7-5-9-4; + recinfo[3].null_bit=0; + recinfo[3].null_pos=0; + recinfo[4].type=pack_fields ? FIELD_SKIP_ZERO : 0; + recinfo[4].length=4; + recinfo[4].null_bit=0; + recinfo[4].null_pos=0; + recinfo[5].type=pack_fields ? 
FIELD_SKIP_ENDSPACE : 0; + recinfo[5].length=60; + recinfo[5].null_bit=0; + recinfo[5].null_pos=0; + if (use_blob) + { + recinfo[6].type=FIELD_BLOB; + recinfo[6].length=4+portable_sizeof_char_ptr; + recinfo[6].null_bit=0; + recinfo[6].null_pos=0; + } + + write_count=update=dupp_keys=opt_delete=0; + blob_buffer=0; + + for (i=1000 ; i>0 ; i--) key1[i]=0; + for (i=5000 ; i>0 ; i--) key3[i]=0; + + if (!silent) + printf("- Creating maria-file\n"); + file= 0; + bzero((char*) &create_info,sizeof(create_info)); + create_info.max_rows=(ha_rows) (rec_pointer_size ? + (1L << (rec_pointer_size*8))/ + reclength : 0); + create_info.reloc_rows=(ha_rows) 100; + create_info.transactional= transactional; + if (maria_create(filename, record_type, keys,&keyinfo[first_key], + use_blob ? 7 : 6, &recinfo[0], + 0,(MARIA_UNIQUEDEF*) 0, + &create_info,create_flag)) + goto err; + if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED, 0))) + goto err; + maria_begin(file); + if (opt_versioning) + maria_versioning(file, 1); + if (testflag == 1) + goto end; + if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + if (!silent) + printf("- Writing key:s\n"); + if (do_locking) + maria_lock_database(file,F_WRLCK); + if (write_cacheing) + maria_extra(file,HA_EXTRA_WRITE_CACHE,0); + if (opt_quick_mode) + maria_extra(file,HA_EXTRA_QUICK,0); + + for (i=0 ; i < recant ; i++) + { + ulong blob_length; + n1=rnd(1000); n2=rnd(100); n3=rnd(5000); + sprintf((char*) record,"%6d:%4d:%8d:Pos: %4d ",n1,n2,n3,write_count); + int4store(record+STANDARD_LENGTH-4,(long) i); + fix_length(record,(uint) STANDARD_LENGTH+rnd(60)); + put_blob_in_record(record+blob_pos,&blob_buffer, &blob_length); + DBUG_PRINT("test",("record: %d blob_length: %lu", i, blob_length)); + + if (maria_write(file,record)) + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0) + { + printf("Error: %d in write at record: %d\n",my_errno,i); + goto err; + } + if (verbose) printf(" Double key: %d at record# 
%d\n", n3, i); + } + else + { + if (key3[n3] == 1 && first_key <3 && first_key+keys >= 3) + { + printf("Error: Didn't get error when writing second key: '%8d'\n",n3); + goto err2; + } + write_count++; key1[n1]++; key3[n3]=1; + } + + /* Check if we can find key without flushing database */ + if (i % 10 == 0) + { + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + if (!j) + for (j=999 ; j>0 && key1[j] == 0 ; j--) ; + sprintf((char*) key,"%6d",j); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + { + printf("Test in loop: Can't find key: \"%s\"\n",key); + goto err; + } + } + } + if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (write_cacheing) + { + if (maria_extra(file,HA_EXTRA_NO_CACHE,0)) + { + puts("got error from maria_extra(HA_EXTRA_NO_CACHE)"); + goto err; + } + } + + if (testflag == 2) + goto end; + +#ifdef REMOVE_WHEN_WE_HAVE_RESIZE + if (pagecacheing) + resize_pagecache(maria_pagecache, maria_block_size, + pagecache_size * 2, 0, 0); +#endif + if (!silent) + printf("- Delete\n"); + if (srand_arg) + srand(srand_arg); + if (!update_count) + update_count= recant/10; + + for (i=0 ; i < update_count ; i++) + { + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + if (j != 0) + { + sprintf((char*) key,"%6d",j); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + { + printf("can't find key1: \"%s\"\n",key); + goto err; + } + if (bcmp(read_record+keyinfo[0].seg[0].start, + key, keyinfo[0].seg[0].length)) + { + printf("Found wrong record when searching for key: \"%s\"\n",key); + goto err2; + } + if (opt_delete == (uint) remove_count) /* While testing */ + goto end; + if (maria_delete(file,read_record)) + { + printf("error: %d; can't delete record: \"%s\"\n", my_errno,read_record); + goto err; + } + opt_delete++; + key1[atoi((char*) read_record+keyinfo[0].seg[0].start)]--; + key3[atoi((char*) read_record+keyinfo[2].seg[0].start)]=0; + } + else + { + puts("Warning: Skipping 
delete test because no dupplicate keys"); + break; + } + } + if (testflag == 3) + goto end; + if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (!silent) + printf("- Update\n"); + if (srand_arg) + srand(srand_arg); + if (!update_count) + update_count= recant/10; + + for (i=0 ; i < update_count ; i++) + { + n1=rnd(1000); n2=rnd(100); n3=rnd(5000); + sprintf((char*) record2,"%6d:%4d:%8d:XXX: %4d ",n1,n2,n3,update); + int4store(record2+STANDARD_LENGTH-4,(long) i); + fix_length(record2,(uint) STANDARD_LENGTH+rnd(60)); + + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + if (j != 0) + { + sprintf((char*) key,"%6d",j); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + { + printf("can't find key1: \"%s\"\n", (char*) key); + goto err; + } + if (bcmp(read_record+keyinfo[0].seg[0].start, + key, keyinfo[0].seg[0].length)) + { + printf("Found wrong record when searching for key: \"%s\"; Found \"%.*s\"\n", + key, keyinfo[0].seg[0].length, + read_record+keyinfo[0].seg[0].start); + goto err2; + } + if (use_blob) + { + ulong blob_length; + if (i & 1) + put_blob_in_record(record2+blob_pos,&blob_buffer, &blob_length); + else + bmove(record2+blob_pos, read_record+blob_pos, 4 + sizeof(char*)); + } + if (skip_update) + continue; + if (maria_update(file,read_record,record2)) + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0) + { + printf("error: %d; can't update:\nFrom: \"%s\"\nTo: \"%s\"\n", + my_errno,read_record,record2); + goto err; + } + if (verbose) + printf("Double key when tried to update:\nFrom: \"%s\"\nTo: \"%s\"\n",record,record2); + } + else + { + key1[atoi((char*) read_record+keyinfo[0].seg[0].start)]--; + key3[atoi((char*) read_record+keyinfo[2].seg[0].start)]=0; + key1[n1]++; key3[n3]=1; + update++; + } + } + } + if (testflag == 4) + goto end; + if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + for (i=999, dupp_keys=j=0 ; i>0 ; i--) + { + if (key1[i] > 
dupp_keys) + { + dupp_keys=key1[i]; j=i; + } + } + sprintf((char*) key,"%6d",j); + start=keyinfo[0].seg[0].start; + length=keyinfo[0].seg[0].length; + if (dupp_keys) + { + if (!silent) + printf("- Same key: first - next -> last - prev -> first\n"); + DBUG_PRINT("progpos",("first - next -> last - prev -> first")); + if (verbose) printf(" Using key: \"%s\" Keys: %d\n",key,dupp_keys); + + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + if (maria_rsame(file,read_record2,-1)) + goto err; + if (memcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame didn't find same record\n"); + goto err2; + } + info.recpos=maria_position(file); + if (maria_rfirst(file,read_record2,0) || + maria_rsame_with_pos(file,read_record2,0,info.recpos) || + memcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame_with_pos didn't find same record\n"); + goto err2; + } + { + int skr; + info.recpos= maria_position(file); + skr= maria_rnext(file,read_record2,0); + if ((skr && my_errno != HA_ERR_END_OF_FILE) || + maria_rprev(file,read_record2,0) || + memcmp(read_record,read_record2,reclength) != 0 || + info.recpos != maria_position(file)) + { + printf("maria_rsame_with_pos lost position\n"); + goto err; + } + } + ant=1; + while (maria_rnext(file,read_record2,0) == 0 && + memcmp(read_record2+start,key,length) == 0) ant++; + if (ant != dupp_keys) + { + printf("next: Found: %d keys of %d\n",ant,dupp_keys); + goto err2; + } + ant=0; + while (maria_rprev(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys) + { + printf("prev: Found: %d records of %d\n",ant,dupp_keys); + goto err2; + } + + /* Check of maria_rnext_same */ + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + ant=1; + while (!maria_rnext_same(file,read_record3) && ant < dupp_keys+10) + ant++; + if (ant != dupp_keys || my_errno != HA_ERR_END_OF_FILE) + { + printf("maria_rnext_same: Found: 
%d records of %d\n",ant,dupp_keys); + goto err2; + } + } + + if (!silent) + printf("- All keys: first - next -> last - prev -> first\n"); + DBUG_PRINT("progpos",("All keys: first - next -> last - prev -> first")); + ant=1; + if (maria_rfirst(file,read_record,0)) + { + printf("Can't find first record\n"); + goto err; + } + while ((error=maria_rnext(file,read_record3,0)) == 0 && ant < write_count+10) + ant++; + if (ant != write_count - opt_delete || error != HA_ERR_END_OF_FILE) + { + printf("next: I found: %d records of %d (error: %d)\n", + ant, write_count - opt_delete, error); + goto err; + } + if (maria_rlast(file,read_record2,0) || + bcmp(read_record2,read_record3,reclength)) + { + printf("Can't find last record\n"); + DBUG_DUMP("record2", read_record2, reclength); + DBUG_DUMP("record3", read_record3, reclength); + goto err2; + } + ant=1; + while (maria_rprev(file,read_record3,0) == 0 && ant < write_count+10) + ant++; + if (ant != write_count - opt_delete) + { + printf("prev: I found: %d records of %d\n",ant,write_count); + goto err2; + } + if (bcmp(read_record,read_record3,reclength)) + { + printf("Can't find first record\n"); + goto err2; + } + + if (!silent) + printf("- Test if: Read first - next - prev - prev - next == first\n"); + DBUG_PRINT("progpos",("- Read first - next - prev - prev - next == first")); + if (maria_rfirst(file,read_record,0) || + maria_rnext(file,read_record3,0) || + maria_rprev(file,read_record3,0) || + maria_rprev(file,read_record3,0) == 0 || + maria_rnext(file,read_record3,0)) + goto err; + if (bcmp(read_record,read_record3,reclength) != 0) + printf("Can't find first record\n"); + + if (!silent) + printf("- Test if: Read last - prev - next - next - prev == last\n"); + DBUG_PRINT("progpos",("Read last - prev - next - next - prev == last")); + if (maria_rlast(file,read_record2,0) || + maria_rprev(file,read_record3,0) || + maria_rnext(file,read_record3,0) || + maria_rnext(file,read_record3,0) == 0 || + maria_rprev(file,read_record3,0)) + 
goto err; + if (bcmp(read_record2,read_record3,reclength)) + printf("Can't find last record\n"); +#ifdef NOT_ANYMORE + if (!silent) + puts("- Test read key-part"); + strmov(key2,key); + for(i=strlen(key2) ; i-- > 1 ;) + { + key2[i]=0; + + /* The following row is just to catch some bugs in the key code */ + bzero((char*) file->lastkey,file->s->base.max_key_length*2); + if (maria_rkey(file,read_record,0,key2,(uint) i,HA_READ_PREFIX)) + goto err; + if (bcmp(read_record+start,key,(uint) i)) + { + puts("Didn't find right record"); + goto err2; + } + } +#endif + if (dupp_keys > 2) + { + if (!silent) + printf("- Read key (first) - next - delete - next -> last\n"); + DBUG_PRINT("progpos",("first - next - delete - next -> last")); + if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + if (maria_rnext(file,read_record3,0)) goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=1; + while (maria_rnext(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-1) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-1); + goto err2; + } + } + if (dupp_keys>4) + { + if (!silent) + printf("- Read last of key - prev - delete - prev -> first\n"); + DBUG_PRINT("progpos",("last - prev - delete - prev -> first")); + if (maria_rprev(file,read_record3,0)) goto err; + if (maria_rprev(file,read_record3,0)) goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=1; + while (maria_rprev(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-2) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-2); + goto err2; + } + } + if (dupp_keys > 6) + { + if (!silent) + printf("- Read first - delete - next -> last\n"); + DBUG_PRINT("progpos",("first - delete - next -> last")); + if (maria_rkey(file,read_record3,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT)) + goto err; + if 
(maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=1; + if (maria_rnext(file,read_record,0)) + goto err; /* Skall finnas poster */ + while (maria_rnext(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-3) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-3); + goto err2; + } + + if (!silent) + printf("- Read last - delete - prev -> first\n"); + DBUG_PRINT("progpos",("last - delete - prev -> first")); + if (maria_rprev(file,read_record3,0)) goto err; + if (maria_delete(file,read_record3)) goto err; + opt_delete++; + ant=0; + while (maria_rprev(file,read_record3,0) == 0 && + bcmp(read_record3+start,key,length) == 0) ant++; + if (ant != dupp_keys-4) + { + printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-4); + goto err2; + } + } + + if (!silent) + puts("- Test if: Read rrnd - same"); + DBUG_PRINT("progpos",("Read rrnd - same")); + maria_scan_init(file); + for (i=0 ; i < write_count ; i++) + { + int tmp; + if ((tmp= maria_scan(file,read_record)) && + tmp != HA_ERR_END_OF_FILE && + tmp != HA_ERR_RECORD_DELETED) + { + printf("Got error %d when scanning table\n", tmp); + break; + } + if (!tmp) + { + /* Remember position to last found row */ + info.recpos= maria_position(file); + bmove(read_record2,read_record,reclength); + } + } + maria_scan_end(file); + if (i != write_count && i != write_count - opt_delete) + { + printf("Found wrong number of rows while scanning table\n"); + goto err2; + } + + if (maria_rsame_with_pos(file,read_record,0,info.recpos)) + goto err; + if (bcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame_with_pos didn't find same record\n"); + goto err2; + } + + for (i=MY_MIN(2,keys) ; i-- > 0 ;) + { + if (maria_rsame(file,read_record2,(int) i)) goto err; + if (bcmp(read_record,read_record2,reclength) != 0) + { + printf("maria_rsame didn't find same record\n"); + goto err2; + } + } + if (!silent) + puts("- Test maria_records_in_range"); + 
maria_status(file,&info,HA_STATUS_VARIABLE); + for (i=0 ; i < info.keys ; i++) + { + key_range min_key, max_key; + if (maria_rfirst(file,read_record,(int) i) || + maria_rlast(file,read_record2,(int) i)) + goto err; + copy_key(file,(uint) i, read_record, key); + copy_key(file,(uint) i, read_record2, key2); + min_key.key= key; + min_key.keypart_map= HA_WHOLE_KEY; + min_key.flag= HA_READ_KEY_EXACT; + max_key.key= key2; + max_key.keypart_map= HA_WHOLE_KEY; + max_key.flag= HA_READ_AFTER_KEY; + + range_records= maria_records_in_range(file,(int) i, &min_key, &max_key, + &pages); + if (range_records < info.records*8/10 || + range_records > info.records*12/10) + { + printf("maria_records_range returned %ld; Should be about %ld\n", + (long) range_records,(long) info.records); + goto err2; + } + if (verbose) + { + printf("maria_records_range returned %ld; Exact is %ld (diff: %4.2g %%)\n", + (long) range_records, (long) info.records, + labs((long) range_records - (long) info.records)*100.0/ + info.records); + } + } + for (i=0 ; i < 5 ; i++) + { + for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ; + for (k=rnd(1000)+1 ; k>0 && key1[k] == 0 ; k--) ; + if (j != 0 && k != 0) + { + key_range min_key, max_key; + if (j > k) + swap_variables(int, j, k); + sprintf((char*) key,"%6d",j); + sprintf((char*) key2,"%6d",k); + + min_key.key= key; + min_key.keypart_map= HA_WHOLE_KEY; + min_key.flag= HA_READ_AFTER_KEY; + max_key.key= key2; + max_key.keypart_map= HA_WHOLE_KEY; + max_key.flag= HA_READ_BEFORE_KEY; + range_records= maria_records_in_range(file, 0, &min_key, &max_key, + &pages); + records=0; + for (j++ ; j < k ; j++) + records+=key1[j]; + if ((long) range_records < (long) records*6/10-2 || + (long) range_records > (long) records*14/10+2) + { + printf("maria_records_range for key: %d returned %lu; Should be about %lu\n", + i, (ulong) range_records, (ulong) records); + goto err2; + } + if (verbose && records) + { + printf("maria_records_range returned %lu; Exact is %lu (diff: %4.2g 
%%)\n", + (ulong) range_records, (ulong) records, + labs((long) range_records-(long) records)*100.0/records); + + } + } + } + + if (!silent) + printf("- maria_info\n"); + maria_status(file,&info,HA_STATUS_VARIABLE | HA_STATUS_CONST); + if (info.records != write_count-opt_delete || + info.deleted > opt_delete + update || info.keys != keys) + { + puts("Wrong info from maria_info"); + printf("Got: records: %lu delete: %lu i_keys: %d\n", + (ulong) info.records, (ulong) info.deleted, info.keys); + goto err2; + } + if (verbose) + { + char buff[80]; + get_date(buff,3,info.create_time); + printf("info: Created %s\n",buff); + get_date(buff,3,info.check_time); + printf("info: checked %s\n",buff); + get_date(buff,3,info.update_time); + printf("info: Modified %s\n",buff); + } + + maria_panic(HA_PANIC_WRITE); + maria_panic(HA_PANIC_READ); + if (maria_is_changed(file)) + puts("Warning: maria_is_changed reported that datafile was changed"); + + if (!silent) + printf("- maria_extra(CACHE) + maria_rrnd.... + maria_extra(NO_CACHE)\n"); + if (maria_reset(file) || maria_extra(file,HA_EXTRA_CACHE,0)) + { + if (do_locking || (!use_blob && !pack_fields)) + { + puts("got error from maria_extra(HA_EXTRA_CACHE)"); + goto err; + } + } + ant=0; + maria_scan_init(file); + while ((error= maria_scan(file,record)) != HA_ERR_END_OF_FILE && + ant < write_count + 10) + ant+= error ? 0 : 1; + maria_scan_end(file); + if (ant != write_count-opt_delete) + { + printf("scan with cache: I can only find: %d records of %d\n", + ant,write_count-opt_delete); + maria_scan_end(file); + goto err2; + } + if (maria_extra(file,HA_EXTRA_NO_CACHE,0)) + { + puts("got error from maria_extra(HA_EXTRA_NO_CACHE)"); + maria_scan_end(file); + goto err; + } + maria_scan_end(file); + + ant=0; + maria_scan_init(file); + while ((error=maria_scan(file,record)) != HA_ERR_END_OF_FILE && + ant < write_count + 10) + ant+= error ? 
0 : 1; + if (ant != write_count-opt_delete) + { + printf("scan with cache: I can only find: %d records of %d\n", + ant,write_count-opt_delete); + maria_scan_end(file); + goto err2; + } + maria_scan_end(file); + + if (testflag == 5) + goto end; + if (checkpoint == 5 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + + if (!silent) + printf("- Removing keys\n"); + DBUG_PRINT("progpos",("Removing keys")); + lastpos = HA_OFFSET_ERROR; + /* DBUG_POP(); */ + maria_reset(file); + found_parts=0; + maria_scan_init(file); + while ((error= maria_scan(file,read_record)) != HA_ERR_END_OF_FILE) + { + info.recpos=maria_position(file); + if (lastpos >= info.recpos && lastpos != HA_OFFSET_ERROR) + { + printf("maria_rrnd didn't advance filepointer; old: %ld, new: %ld\n", + (long) lastpos, (long) info.recpos); + goto err2; + } + lastpos=info.recpos; + if (error == 0) + { + if (opt_delete == (uint) remove_count) /* While testing */ + goto end; + if (rnd(2) == 1 && maria_rsame(file,read_record,-1)) + { + printf("can't find record %lx\n",(long) info.recpos); + goto err; + } + if (use_blob) + { + ulong blob_length,pos; + uchar *ptr; + memcpy(&ptr, read_record+blob_pos+4, sizeof(ptr)); + blob_length= uint4korr(read_record+blob_pos); + for (pos=0 ; pos < blob_length ; pos++) + { + if (ptr[pos] != (uchar) (blob_length+pos)) + { + printf("Found blob with wrong info at %ld\n",(long) lastpos); + maria_scan_end(file); + my_errno= 0; + goto err2; + } + } + } + if (maria_delete(file,read_record)) + { + printf("can't delete record: %6.6s, delete_count: %d\n", + read_record, opt_delete); + maria_scan_end(file); + goto err; + } + opt_delete++; + } + else + found_parts++; + } + if (my_errno != HA_ERR_END_OF_FILE && my_errno != HA_ERR_RECORD_DELETED) + printf("error: %d from maria_rrnd\n",my_errno); + if (write_count != opt_delete) + { + printf("Deleted only %d of %d records (%d parts)\n",opt_delete,write_count, + found_parts); + maria_scan_end(file); + goto err2; + } + if (testflag == 
6) + goto end; + if (checkpoint == 6 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE)) + goto err; + +end: + maria_scan_end(file); + if (die_in_middle_of_transaction) + { + /* As commit record is not done, UNDO entries needs to be rolled back */ + switch (die_in_middle_of_transaction) { + case 1: + /* + Flush changed data and index pages go to disk + That will also flush log. Recovery will skip REDOs and apply UNDOs. + */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, + FLUSH_RELEASE, FLUSH_RELEASE); + break; + case 2: + /* + Just flush log. Pages are likely to not be on disk. Recovery will + then execute REDOs and UNDOs. + */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + case 3: + /* + Flush nothing. Pages and log are likely to not be on disk. Recovery + will then do nothing. + */ + break; + case 4: + /* + Flush changed data pages go to disk. Changed index pages are not + flushed. Recovery will skip some REDOs and apply UNDOs. + */ + _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE, + FLUSH_RELEASE); + /* + We have to flush log separately as the redo for the last key page + may not be flushed + */ + if (translog_flush(file->trn->undo_lsn)) + goto err; + break; + } + printf("Dying on request without maria_commit()/maria_close()\n"); + sf_leaking_memory= 1; /* no memory leak reports here */ + exit(0); + } + if (maria_commit(file)) + goto err; + if (maria_close(file)) + { + file= 0; + goto err; + } + file= 0; + maria_panic(HA_PANIC_CLOSE); /* Should close log */ + if (!silent) + { + printf("\nFollowing test have been made:\n"); + printf("Write records: %d\nUpdate records: %d\nSame-key-read: %d\nDelete records: %d\n", write_count,update,dupp_keys,opt_delete); + if (rec_pointer_size) + printf("Record pointer size: %d\n",rec_pointer_size); + printf("maria_block_size: %lu\n", maria_block_size); + if (write_cacheing) + puts("Key cache resized"); + if (write_cacheing) + puts("Write cacheing used"); + if (write_cacheing) 
+ puts("quick mode"); + if (async_io && do_locking) + puts("Asyncron io with locking used"); + else if (do_locking) + puts("Locking used"); + if (use_blob) + puts("blobs used"); + printf("key cache status: \n\ +blocks used:%10lu\n\ +not flushed:%10lu\n\ +w_requests: %10lu\n\ +writes: %10lu\n\ +r_requests: %10lu\n\ +reads: %10lu\n", + (ulong) maria_pagecache->blocks_used, + (ulong) maria_pagecache->global_blocks_changed, + (ulong) maria_pagecache->global_cache_w_requests, + (ulong) maria_pagecache->global_cache_write, + (ulong) maria_pagecache->global_cache_r_requests, + (ulong) maria_pagecache->global_cache_read); + } + maria_end(); + my_free(blob_buffer); + my_uuid_end(); + my_end(silent ? MY_CHECK_ERROR : MY_CHECK_ERROR | MY_GIVE_INFO); + return(0); +err: + printf("got error: %d when using MARIA-database\n",my_errno); +err2: + if (file) + { + if (maria_commit(file)) + printf("got error: %d when using MARIA-database\n",my_errno); + maria_close(file); + } + maria_end(); + my_uuid_end(); + my_end(0); + return(1); +} /* main */ + + +/* Read options */ + +static void get_options(int argc, char **argv) +{ + char *pos,*progname; + + progname= argv[0]; + + while (--argc >0 && *(pos = *(++argv)) == '-' ) { + switch(*++pos) { + case 'B': + pack_type= HA_BINARY_PACK_KEY; + break; + case 'b': + use_blob= 1000; + if (*++pos) + use_blob= atol(pos); + break; + case 'K': /* Use key cacheing */ + pagecacheing=1; + if (*++pos) + pagecache_size=atol(pos); + break; + case 'W': /* Use write cacheing */ + write_cacheing=1; + if (*++pos) + my_default_record_cache_size=atoi(pos); + break; + case 'd': + remove_count= atoi(++pos); + break; + case 'i': + if (*++pos) + srand(srand_arg= atoi(pos)); + break; + case 'L': + do_locking=1; + break; + case 'a': /* use asyncron io */ + async_io=1; + if (*++pos) + my_default_record_cache_size=atoi(pos); + break; + case 'v': /* verbose */ + verbose=1; + break; + case 'm': /* records */ + if ((recant=atoi(++pos)) < 10 && testflag > 2) + { + 
fprintf(stderr,"record count must be >= 10 (if testflag > 2)\n"); + exit(1); + } + if (recant <= 1) + { + fprintf(stderr,"record count must be >= 2\n"); + exit(1); + } + break; + case 'e': /* maria_block_length */ + case 'E': + if ((maria_block_size= atoi(++pos)) < MARIA_MIN_KEY_BLOCK_LENGTH || + maria_block_size > MARIA_MAX_KEY_BLOCK_LENGTH) + { + fprintf(stderr,"Wrong maria_block_length\n"); + exit(1); + } + maria_block_size= my_round_up_to_next_power(maria_block_size); + break; + case 'f': + if ((first_key=atoi(++pos)) < 0 || first_key >= MARIA_KEYS) + first_key=0; + break; + case 'H': + checkpoint= atoi(++pos); + break; + case 'h': + maria_data_root= ++pos; + break; + case 'k': + if ((keys=(uint) atoi(++pos)) < 1 || + keys > (uint) (MARIA_KEYS-first_key)) + keys=MARIA_KEYS-first_key; + break; + case 'M': + record_type= BLOCK_RECORD; + break; + case 'P': + pack_type=0; /* Don't use DIFF_LENGTH */ + pack_seg=0; + break; + case 'R': /* Length of record pointer */ + rec_pointer_size=atoi(++pos); + if (rec_pointer_size > 7) + rec_pointer_size=0; + break; + case 'S': + pack_fields=0; /* Static-length-records */ + record_type= STATIC_RECORD; + break; + case 's': + silent=1; + break; + case 't': + testflag=atoi(++pos); /* testmod */ + break; + case 'T': + transactional= 1; + break; + case 'A': + die_in_middle_of_transaction= atoi(++pos); + break; + case 'u': + update_count=atoi(++pos); + if (!update_count) + skip_update= 1; + break; + case 'q': + opt_quick_mode=1; + break; + case 'c': + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + break; + case 'D': + create_flag|=HA_CREATE_DELAY_KEY_WRITE; + break; + case 'g': + skip_update= TRUE; + break; + case 'C': + opt_versioning= 1; + break; + case '?': + case 'I': + case 'V': + printf("%s Ver 1.2 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE); + puts("By Monty, for testing Maria\n"); + printf("Usage: %s [-?AbBcCDIKLPRqSsTVWltv] [-k#] [-f#] [-m#] [-e#] [-E#] [-t#]\n", + progname); + exit(0); + case '#': + 
DBUG_PUSH (++pos); + break; + default: + printf("Illegal option: '%c'\n",*pos); + break; + } + } + return; +} /* get options */ + + /* Get a random value 0 <= x <= n */ + +static uint rnd(uint max_value) +{ + return (uint) ((rand() & 32767)/32767.0*max_value); +} /* rnd */ + + + /* Create a variable length record */ + +static void fix_length(uchar *rec, uint length) +{ + bmove(rec+STANDARD_LENGTH, + "0123456789012345678901234567890123456789012345678901234567890", + length-STANDARD_LENGTH); + strfill((char*) rec+length,STANDARD_LENGTH+60-length,' '); +} /* fix_length */ + + +/* Put maybe a blob in record */ + +static int first_entry; + +static void put_blob_in_record(uchar *blob_pos, char **blob_buffer, + ulong *blob_length) +{ + ulong i,length; + *blob_length= 0; + if (use_blob) + { + if (! *blob_buffer && + !(*blob_buffer=my_malloc(PSI_NOT_INSTRUMENTED, (uint) use_blob,MYF(MY_WME)))) + { + use_blob= 0; + return; + } + if (rnd(10) == 0) + { + if (first_entry++ == 0) + { + /* Ensure we have at least one blob of max length in file */ + length= use_blob; + } + else + length=rnd(use_blob); + for (i=0 ; i < length ; i++) + (*blob_buffer)[i]=(char) (length+i); + int4store(blob_pos,length); + memcpy(blob_pos+4, blob_buffer, sizeof(char*)); + *blob_length= length; + } + else + { + int4store(blob_pos,0); + } + } + return; +} + + +static void copy_key(MARIA_HA *info,uint inx,uchar *rec,uchar *key_buff) +{ + HA_KEYSEG *keyseg; + + for (keyseg=info->s->keyinfo[inx].seg ; keyseg->type ; keyseg++) + { + memcpy(key_buff,rec+keyseg->start,(size_t) keyseg->length); + key_buff+=keyseg->length; + } + return; +} + +#include "ma_check_standalone.h" + diff --git a/storage/maria/ma_test3.c b/storage/maria/ma_test3.c new file mode 100644 index 00000000..411a5f6c --- /dev/null +++ b/storage/maria/ma_test3.c @@ -0,0 +1,498 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms 
of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Test av locking */ + +#ifndef _WIN32 /*no fork() in Windows*/ + +#include "maria_def.h" +#include <sys/types.h> +#ifdef HAVE_SYS_WAIT_H +# include <sys/wait.h> +#endif +#ifndef WEXITSTATUS +# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8) +#endif +#ifndef WIFEXITED +# define WIFEXITED(stat_val) (((stat_val) & 255) == 0) +#endif + + +#if defined(HAVE_LRAND48) +#define rnd(X) (lrand48() % X) +#define rnd_init(X) srand48(X) +#else +#define rnd(X) (random() % X) +#define rnd_init(X) srandom(X) +#endif + + +const char *filename= "test3"; +uint tests=10,forks=10,pagecacheing=0; + +static void get_options(int argc, char *argv[]); +void start_test(int id); +int test_read(MARIA_HA *,int),test_write(MARIA_HA *,int,int), + test_update(MARIA_HA *,int,int),test_rrnd(MARIA_HA *,int); + +struct record { + uchar id[8]; + uchar nr[4]; + uchar text[10]; +} record; + + +int main(int argc,char **argv) +{ + int status,wait_ret; + uint i=0; + MARIA_KEYDEF keyinfo[10]; + MARIA_COLUMNDEF recinfo[10]; + HA_KEYSEG keyseg[10][2]; + MY_INIT(argv[0]); + get_options(argc,argv); + + fprintf(stderr, "WARNING! this program is to test 'external locking'" + " (when several processes share a table through file locking)" + " which is not supported by Maria at all; expect errors." 
+ " We may soon remove this program.\n"); + maria_init(); + bzero((char*) keyinfo,sizeof(keyinfo)); + bzero((char*) recinfo,sizeof(recinfo)); + bzero((char*) keyseg,sizeof(keyseg)); + keyinfo[0].seg= &keyseg[0][0]; + keyinfo[0].seg[0].start=0; + keyinfo[0].seg[0].length=8; + keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT; + keyinfo[0].seg[0].flag=HA_SPACE_PACK; + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].keysegs=1; + keyinfo[0].flag = (uint8) HA_PACK_KEY; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[1].seg= &keyseg[1][0]; + keyinfo[1].seg[0].start=8; + keyinfo[1].seg[0].length=4; /* Long is always 4 in maria */ + keyinfo[1].seg[0].type=HA_KEYTYPE_LONG_INT; + keyinfo[1].seg[0].flag=0; + keyinfo[1].key_alg=HA_KEY_ALG_BTREE; + keyinfo[1].keysegs=1; + keyinfo[1].flag =HA_NOSAME; + keyinfo[1].block_length= 0; /* Default block length */ + + recinfo[0].type=0; + recinfo[0].length=sizeof(record.id); + recinfo[1].type=0; + recinfo[1].length=sizeof(record.nr); + recinfo[2].type=0; + recinfo[2].length=sizeof(record.text); + + puts("- Creating maria-file"); + my_delete(filename,MYF(0)); /* Remove old locks under gdb */ + if (maria_create(filename,BLOCK_RECORD, 2, &keyinfo[0],2,&recinfo[0],0, + (MARIA_UNIQUEDEF*) 0, (MARIA_CREATE_INFO*) 0,0)) + exit(1); + + rnd_init(0); + printf("- Starting %d processes\n",forks); fflush(stdout); + for (i=0 ; i < forks; i++) + { + if (!fork()) + { + start_test(i+1); + sleep(1); + return 0; + } + (void)rnd(1); + } + + for (i=0 ; i < forks ; i++) + while ((wait_ret=wait(&status)) && wait_ret == -1); + maria_end(); + return 0; +} + + +static void get_options(int argc, char **argv) +{ + char *pos,*progname; + + progname= argv[0]; + + while (--argc >0 && *(pos = *(++argv)) == '-' ) { + switch(*++pos) { + case 'f': + forks=atoi(++pos); + break; + case 't': + tests=atoi(++pos); + break; + case 'K': /* Use key cacheing */ + pagecacheing=1; + break; + case 'A': /* All flags */ + pagecacheing=1; + break; + case '?': + case 'I': + 
case 'V': + printf("%s Ver 1.0 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE); + puts("By Monty, for your professional use\n"); + puts("Test av locking with threads\n"); + printf("Usage: %s [-?lKA] [-f#] [-t#]\n",progname); + exit(0); + case '#': + DBUG_PUSH (++pos); + break; + default: + printf("Illegal option: '%c'\n",*pos); + break; + } + } + return; +} + + +void start_test(int id) +{ + uint i; + int error,lock_type; + MARIA_INFO isam_info; + MARIA_HA *file,*file1,*file2=0,*lock; + + if (!(file1=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED,0)) || + !(file2=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED,0))) + { + fprintf(stderr,"Can't open isam-file: %s\n",filename); + exit(1); + } + if (pagecacheing && rnd(2) == 0) + init_pagecache(maria_pagecache, 65536L, 0, 0, MARIA_KEY_BLOCK_LENGTH, 0, + MY_WME); + printf("Process %d, pid: %ld\n",id,(long) getpid()); fflush(stdout); + + for (error=i=0 ; i < tests && !error; i++) + { + file= (rnd(2) == 1) ? file1 : file2; + lock=0 ; lock_type=0; + if (rnd(10) == 0) + { + if (maria_lock_database(lock=(rnd(2) ? file1 : file2), + lock_type=(rnd(2) == 0 ? F_RDLCK : F_WRLCK))) + { + fprintf(stderr,"%2d: start: Can't lock table %d\n",id,my_errno); + error=1; + break; + } + } + switch (rnd(4)) { + case 0: error=test_read(file,id); break; + case 1: error=test_rrnd(file,id); break; + case 2: error=test_write(file,id,lock_type); break; + case 3: error=test_update(file,id,lock_type); break; + } + if (lock) + maria_lock_database(lock,F_UNLCK); + } + if (!error) + { + maria_status(file1,&isam_info,HA_STATUS_VARIABLE); + printf("%2d: End of test. 
Records: %ld Deleted: %ld\n", + id,(long) isam_info.records, (long) isam_info.deleted); + fflush(stdout); + } + + maria_close(file1); + maria_close(file2); + if (error) + { + printf("%2d: Aborted\n",id); fflush(stdout); + exit(1); + } +} + + +int test_read(MARIA_HA *file,int id) +{ + uint i,lock,found,next,prev; + ulong find; + + lock=0; + if (rnd(2) == 0) + { + lock=1; + if (maria_lock_database(file,F_RDLCK)) + { + fprintf(stderr,"%2d: Can't lock table %d\n",id,my_errno); + return 1; + } + } + + found=next=prev=0; + for (i=0 ; i < 100 ; i++) + { + find=rnd(100000); + if (!maria_rkey(file,record.id,1,(uchar*) &find, HA_WHOLE_KEY, + HA_READ_KEY_EXACT)) + found++; + else + { + if (my_errno != HA_ERR_KEY_NOT_FOUND) + { + fprintf(stderr,"%2d: Got error %d from read in read\n",id,my_errno); + return 1; + } + else if (!maria_rnext(file,record.id,1)) + next++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in read\n",id,my_errno); + return 1; + } + else if (!maria_rprev(file,record.id,1)) + prev++; + else + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in read\n", + id,my_errno); + return 1; + } + } + } + } + } + if (lock) + { + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"%2d: Can't unlock table\n",id); + return 1; + } + } + printf("%2d: read: found: %5d next: %5d prev: %5d\n", + id,found,next,prev); + fflush(stdout); + return 0; +} + + +int test_rrnd(MARIA_HA *file,int id) +{ + uint count,lock; + + lock=0; + if (rnd(2) == 0) + { + lock=1; + if (maria_lock_database(file,F_RDLCK)) + { + fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno); + maria_close(file); + return 1; + } + if (rnd(2) == 0) + maria_extra(file,HA_EXTRA_CACHE,0); + } + + count=0; + if (maria_rrnd(file,record.id,0L)) + { + if (my_errno == HA_ERR_END_OF_FILE) + goto end; + fprintf(stderr,"%2d: Can't read first record (%d)\n",id,my_errno); + return 1; + } + for (count=1 ; 
!maria_rrnd(file,record.id,HA_OFFSET_ERROR) ;count++) ; + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rrnd\n",id,my_errno); + return 1; + } + +end: + if (lock) + { + maria_extra(file,HA_EXTRA_NO_CACHE,0); + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"%2d: Can't unlock table\n",id); + exit(0); + } + } + printf("%2d: rrnd: %5d\n",id,count); fflush(stdout); + return 0; +} + + +int test_write(MARIA_HA *file,int id,int lock_type) +{ + uint i,tries,count,lock; + + lock=0; + if (rnd(2) == 0 || lock_type == F_RDLCK) + { + lock=1; + if (maria_lock_database(file,F_WRLCK)) + { + if (lock_type == F_RDLCK && my_errno == EDEADLK) + { + printf("%2d: write: deadlock\n",id); fflush(stdout); + return 0; + } + fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno); + maria_close(file); + return 1; + } + if (rnd(2) == 0) + maria_extra(file,HA_EXTRA_WRITE_CACHE,0); + } + + my_snprintf((char*) record.id, sizeof(record.id), "%7ld", (long) getpid()); + strnmov((char*) record.text,"Testing...", sizeof(record.text)); + + tries=(uint) rnd(100)+10; + for (i=count=0 ; i < tries ; i++) + { + uint32 tmp=rnd(80000)+20000; + int4store(record.nr,tmp); + if (!maria_write(file,record.id)) + count++; + else + { + if (my_errno != HA_ERR_FOUND_DUPP_KEY) + { + fprintf(stderr,"%2d: Got error %d (errno %d) from write\n",id,my_errno, + errno); + return 1; + } + } + } + if (lock) + { + maria_extra(file,HA_EXTRA_NO_CACHE,0); + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"%2d: Can't unlock table\n",id); + exit(0); + } + } + printf("%2d: write: %5d\n",id,count); fflush(stdout); + return 0; +} + + +int test_update(MARIA_HA *file,int id,int lock_type) +{ + uint i,lock,update; + uint32 tmp; + char find[4]; + struct record new_record; + + lock=0; + if (rnd(2) == 0 || lock_type == F_RDLCK) + { + lock=1; + if (maria_lock_database(file,F_WRLCK)) + { + if (lock_type == F_RDLCK && my_errno == EDEADLK) + { + printf("%2d: write: deadlock\n",id); 
fflush(stdout); + return 0; + } + fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno); + return 1; + } + } + bzero((char*) &new_record,sizeof(new_record)); + strmov((char*) new_record.text,"Updated"); + + update=0; + for (i=0 ; i < 100 ; i++) + { + tmp=rnd(100000); + int4store(find,tmp); + if (maria_rkey(file,record.id,1,(uchar*) find, HA_WHOLE_KEY, + HA_READ_KEY_EXACT)) + { + if (my_errno != HA_ERR_KEY_NOT_FOUND) + { + fprintf(stderr,"%2d: Got error %d from read in update\n",id,my_errno); + return 1; + } + else if (maria_rnext(file,record.id,1)) + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in update\n", + id,my_errno); + return 1; + } + else if (maria_rprev(file,record.id,1)) + { + if (my_errno != HA_ERR_END_OF_FILE) + { + fprintf(stderr,"%2d: Got error %d from rnext in update\n", + id,my_errno); + return 1; + } + continue; + } + } + } + memcpy(new_record.id,record.id,sizeof(record.id)); + tmp=rnd(20000)+40000; + int4store(new_record.nr,tmp); + if (!maria_update(file,record.id,new_record.id)) + update++; + else + { + if (my_errno != HA_ERR_RECORD_CHANGED && + my_errno != HA_ERR_RECORD_DELETED && + my_errno != HA_ERR_FOUND_DUPP_KEY) + { + fprintf(stderr,"%2d: Got error %d from update\n",id,my_errno); + return 1; + } + } + } + if (lock) + { + if (maria_lock_database(file,F_UNLCK)) + { + fprintf(stderr,"Can't unlock table,id, error%d\n",my_errno); + return 1; + } + } + printf("%2d: update: %5d\n",id,update); fflush(stdout); + return 0; +} + +#include "ma_check_standalone.h" + +#else /* _WIN32 */ + +#include <stdio.h> + +int main() +{ + fprintf(stderr,"this test has not been ported to Windows\n"); + return 0; +} + +#endif /* _WIN32 */ + diff --git a/storage/maria/ma_test_all.res b/storage/maria/ma_test_all.res new file mode 100644 index 00000000..586aaf68 --- /dev/null +++ b/storage/maria/ma_test_all.res @@ -0,0 +1,14 @@ +Running tests with dynamic row format +Running tests with static row format +Running tests 
#!/bin/sh
#
# This test is good for finding bugs in the redo/undo handling and in
# blob handling.
#
# For every row count from 15 to 5000: run ma_test2 so that it dies in the
# middle of a transaction, then apply the log and check the table three
# times (the last time after removing the table files).
#

set -e
a=15
while test "$a" -le 5000
do
  echo "$a"
  rm -f maria_log*
  ma_test2 -s -L -K -W -P -M -T -c -b32768 -t4 -A1 -m"$a" > /dev/null
  # '>&' is a bash extension; use the portable form under /bin/sh
  maria_read_log -a -s > /dev/null 2>&1
  maria_chk -es test2
  maria_read_log -a -s > /dev/null 2>&1
  maria_chk -es test2
  rm test2.MA?
  maria_read_log -a -s > /dev/null 2>&1
  maria_chk -es test2
  a=$((a+1))
done
".exe" : ""); +my $client_exe_path= "../client/release"; +# we use -f, sometimes -x is unexpectedly false in Cygwin +if ( ! -f "$client_exe_path/mysql$suffix" ) + { + $client_exe_path= "../client/relwithdebinfo"; + if ( ! -f "$client_exe_path/mysql$suffix" ) + { + $client_exe_path= "../client/debug"; + if ( ! -f "$client_exe_path/mysql$suffix" ) + { + $client_exe_path= "../client"; + if ( ! -f "$client_exe_path/mysql$suffix" ) + { + die("Cannot find 'mysql' executable\n"); + } + } + } + } + +print "starting mysqld\n"; +$server_cmd= $base_server_cmd . " --start-and-exit 2>&1"; +@cmd_output=`$server_cmd`; +die if $?; +my $master_port= (grep (/Using MASTER_MYPORT .*= (\d+)$/, @cmd_output))[0]; +$master_port =~ s/.*= //; +chomp $master_port; +die unless $master_port > 0; + +my $client_cmd= "$client_exe_path/mysql -u root -h 127.0.0.1 -P $master_port test < $sql_name"; + +open(FILE, ">", $sql_name) or die; + +# To exhibit BUG#36578 with -d, we don't create an index if -d. This is +# because the presence of an index will cause repair-by-sort to be used, +# where sort_get_next_record() is only called inside +#_ma_create_index_by_sort(), so the latter function fails and in this +# case retry_repair is set, so bug does not happen. Whereas without +# an index, repair-with-key-cache is called, which calls +# sort_get_next_record() whose failure itself does not cause a retry. + +print FILE "create table t1 (a varchar(1000)". + ($corrupt_index ? ", index(a)" : "") .") engine=aria;\n"; +print FILE <<EOF; +insert into t1 values("ThursdayMorningsMarket"); +# If Recovery executes REDO_INDEX_NEW_PAGE it will overwrite our +# intentional corruption; we make Recovery skip this record by bumping +# create_rename_lsn using OPTIMIZE TABLE. This also makes sure to put +# the pages on disk, so that we can corrupt them. 
+optimize table t1; +# mark table open, so that --aria-recover repairs it +insert into t1 select concat(a,'b') from t1 limit 1; +EOF +close FILE; + +print "creating table\n"; +`$client_cmd`; +die if $?; + +print "killing mysqld hard\n"; +kill_server(9); + +print "ruining " . + ($corrupt_index ? "first page of keys" : "bitmap page") . + " in table to test aria-recover\n"; +open(FILE, "+<", "./var/master-data/test/t1.$corrupt_file") or die; +$whatever= ("\xAB" x 100); +sysseek (FILE, $corrupt_index ? 8192 : (8192-100-100), 0) or die; +syswrite (FILE, $whatever) or die; +close FILE; + +print "ruining log to make recovery fail; mysqld should fail the $force_after first restarts\n"; +open(FILE, "+<", "./var/tmp/aria_log.00000001") or die; +$whatever= ("\xAB" x 8192); +sysseek (FILE, 99, 0) or die; +syswrite (FILE, $whatever) or die; +close FILE; + +$server_cmd= $base_server_cmd . " --start-dirty 2>&1"; +for($i= 1; $i <= $force_after; $i= $i + 1) + { + print "mysqld restart number $i... "; + unlink($error_log_name) or die; + `$server_cmd`; + # mysqld should return 1 when can't read log + die unless (($? >> 8) == 1); + open(FILE, "<", $error_log_name) or die; + @cmd_output= <FILE>; + close FILE; + die unless grep(/\[ERROR\] mysqld(.exe)*: Aria engine: log initialization failed/, @cmd_output); + die unless grep(/\[ERROR\] Plugin 'Aria' init function returned error./, @cmd_output); + print "failed - ok\n"; + } + +print "mysqld restart number $i... 
"; +unlink($error_log_name) or die; +@cmd_output=`$server_cmd`; +die if $?; +open(FILE, "<", $error_log_name) or die; +@cmd_output= <FILE>; +close FILE; +die unless grep(/\[Warning\] mysqld(.exe)*: Aria engine: removed all logs after [\d]+ consecutive failures of recovery from logs/, @cmd_output); +die unless grep(/\[ERROR\] mysqld(.exe)*: File '.*tmp.aria_log.00000001' not found \(Errcode: 2\)/, @cmd_output); +print "success - ok\n"; + +open(FILE, ">", $sql_name) or die; +print FILE <<EOF; +set global aria_recover=normal; +insert into t1 values('aaa'); +EOF +close FILE; + +# verify corruption has not yet been noticed +open(FILE, "<", $error_log_name) or die; +@cmd_output= <FILE>; +close FILE; +die if grep(/$corrupt_message/, @cmd_output); + +print "inserting in table\n"; +`$client_cmd`; +die if $?; +print "table is usable - ok\n"; + +open(FILE, "<", $error_log_name) or die; +@cmd_output= <FILE>; +close FILE; +die unless grep(/$corrupt_message/, @cmd_output); +die unless grep(/\[Warning\] Recovering table: '..test.t1'/, @cmd_output); +print "was corrupted and automatically repaired - ok\n"; + +# remove our traces +kill_server(15); + +print "TEST ALL OK\n"; + +# kills mysqld with signal given in parameter +sub kill_server + { + my ($sig)= @_; + my $wait_count= 0; + my $kill_cmd; + my @kill_output; + open(FILE, "<", $server_pid_name) or die; + @cmd_output= <FILE>; + close FILE; + $server_pid= $cmd_output[0]; + chomp $server_pid; + die unless $server_pid > 0; + if ($iswindows) + { + # On Windows, server_pid_name is not the "main" process id + # so perl's kill() does not see this process id. + # But taskkill works, though only with /F ("-9"-style kill). 
#!/bin/sh

# Remove comment from next line if this script fails and you need more
# information of what's going on

# This file is deprecated and has been replaced with ma_test_recovery.pl

# "$@" (quoted) passes every argument on unchanged, also those that
# contain spaces or wildcard characters
unittest/ma_test_recovery.pl "$@"
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef _ma_trnman_h +#define _ma_trnman_h + +/** + Sets table's trn and prints debug information + Links table into new_trn->used_instances + + @param tbl MARIA_HA of table + @param newtrn what to put into tbl->trn +*/ + +static inline void _ma_set_trn_for_table(MARIA_HA *tbl, TRN *newtrn) +{ + DBUG_PRINT("info",("table: %p trn: %p -> %p", + tbl, tbl->trn, newtrn)); + + /* check that we are not calling this twice in a row */ + DBUG_ASSERT(newtrn->used_instances != (void*) tbl); + + tbl->trn= newtrn; + /* Link into used list */ + if (newtrn->used_instances) + ((MARIA_HA*) newtrn->used_instances)->trn_prev= &tbl->trn_next; + tbl->trn_next= (MARIA_HA*) newtrn->used_instances; + tbl->trn_prev= (MARIA_HA**) &newtrn->used_instances; + newtrn->used_instances= tbl; +} + + +/* + Same as _ma_set_trn_for_table(), but don't link table into used_instance list + Used when we want to temporary set trn for a table in extra() +*/ + +static inline void _ma_set_tmp_trn_for_table(MARIA_HA *tbl, TRN *newtrn) +{ + DBUG_PRINT("info",("table: %p trn: %p -> %p", + tbl, tbl->trn, newtrn)); + tbl->trn= newtrn; + tbl->trn_prev= 0; + tbl->trn_next= 0; /* To avoid assert in ha_maria::close() */ +} + + +/* + Reset TRN in table +*/ + +static inline void _ma_reset_trn_for_table(MARIA_HA *tbl) +{ + DBUG_PRINT("info",("table: %p trn: %p -> NULL", tbl, tbl->trn)); + + /* The following is only false if tbl->trn == &dummy_transaction_object */ + if (tbl->trn_prev) + { + if (tbl->trn_next) + tbl->trn_next->trn_prev= tbl->trn_prev; + *tbl->trn_prev= tbl->trn_next; + tbl->trn_prev= 0; + tbl->trn_next= 0; + } + tbl->trn= 0; +} + + +/* + Take over the used_instances link from a trn object + Reset the link in the trn object +*/ + +static inline void relink_trn_used_instances(MARIA_HA 
**used_tables, TRN *trn) +{ + if (likely(*used_tables= (MARIA_HA*) trn->used_instances)) + { + /* Check that first back link is correct */ + DBUG_ASSERT((*used_tables)->trn_prev == (MARIA_HA **)&trn->used_instances); + + /* Fix back link to point to new base for the list */ + (*used_tables)->trn_prev= used_tables; + trn->used_instances= 0; + } +} + +/** + When we want to check a table, we verify that the transaction ids of rows + and keys are not bigger than the biggest id generated by Maria so far, which + is returned by the function below. + + @note If control file is not open, 0 may be returned; to not confuse + this with a valid max trid of 0, the caller should notice that it failed to + open the control file (ma_control_file_inited() can serve for that). +*/ + +static inline TrID max_trid_in_system(void) +{ + TrID id= trnman_get_max_trid(); /* 0 if transac manager not initialized */ + /* 'id' may be far bigger, if last shutdown is old */ + return MY_MAX(id, max_trid_in_control_file); +} + +#endif /* _ma_trnman_h */ diff --git a/storage/maria/ma_unique.c b/storage/maria/ma_unique.c new file mode 100644 index 00000000..5e7925b9 --- /dev/null +++ b/storage/maria/ma_unique.c @@ -0,0 +1,271 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2020, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Functions to check if a row is unique */ + +#include "maria_def.h" +#include <m_ctype.h> + +/** + Check if there exist a row with the same hash + + @notes + This function is not versioning safe. For the moment this is not a problem + as it's only used for internal temporary tables in MySQL for which there + isn't any versioning information. +*/ + +my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, + ha_checksum unique_hash, my_off_t disk_pos) +{ + my_off_t lastpos=info->cur_row.lastpos; + MARIA_KEYDEF *keyinfo= &info->s->keyinfo[def->key]; + uchar *key_buff= info->lastkey_buff2; + MARIA_KEY key; + int error= 0; + DBUG_ENTER("_ma_check_unique"); + DBUG_PRINT("enter",("unique_hash: %lu", (ulong) unique_hash)); + + /* We need to store the hash value as a key in the record, breaking const */ + maria_unique_store(record+keyinfo->seg->start, unique_hash); + /* Can't be spatial so it's ok to call _ma_make_key directly here */ + _ma_make_key(info, &key, def->key, key_buff, record, 0, 0); + + /* The above changed info->lastkey_buff2. Inform maria_rnext_same(). 
*/ + info->update&= ~HA_STATE_RNEXT_SAME; + + /* Setup that unique key is active key */ + info->last_key.keyinfo= keyinfo; + + /* any key pointer in data is destroyed */ + info->lastinx= ~0; + + DBUG_ASSERT(key.data_length == MARIA_UNIQUE_HASH_LENGTH); + if (_ma_search(info, &key, SEARCH_FIND | SEARCH_SAVE_BUFF, + info->s->state.key_root[def->key])) + { + info->page_changed=1; /* Can't optimize read next */ + info->cur_row.lastpos= lastpos; + goto end; + } + + for (;;) + { + if (info->cur_row.lastpos != disk_pos && + !(*info->s->compare_unique)(info,def,record,info->cur_row.lastpos)) + { + my_errno=HA_ERR_FOUND_DUPP_UNIQUE; + info->errkey= (int) def->key; + info->dup_key_pos= info->cur_row.lastpos; + info->page_changed= 1; /* Can't optimize read next */ + info->cur_row.lastpos= lastpos; + DBUG_PRINT("info",("Found duplicate")); + error= 1; /* Found identical */ + goto end; + } + DBUG_ASSERT(info->last_key.data_length == MARIA_UNIQUE_HASH_LENGTH); + if (_ma_search_next(info, &info->last_key, SEARCH_BIGGER, + info->s->state.key_root[def->key]) || + bcmp(info->last_key.data, key_buff, MARIA_UNIQUE_HASH_LENGTH)) + { + info->page_changed= 1; /* Can't optimize read next */ + info->cur_row.lastpos= lastpos; + break; /* end of tree */ + } + } + +end: + DBUG_RETURN(error); +} + + +/* + Calculate a hash for a row + + TODO + Add support for bit fields +*/ + +ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *record) +{ + const uchar *pos, *end; + ha_checksum crc= 0; + ulong seed1=0, seed2= 4; + HA_KEYSEG *keyseg; + + for (keyseg=def->seg ; keyseg < def->end ; keyseg++) + { + enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type; + uint length=keyseg->length; + + if (keyseg->null_bit) + { + if (record[keyseg->null_pos] & keyseg->null_bit) + { + /* + Change crc in a way different from an empty string or 0. 
+ (This is an optimisation; The code will work even if this isn't + done) + */ + crc=((crc << 8) + 511+ + (crc >> (8*sizeof(ha_checksum)-8))); + continue; + } + } + pos= record+keyseg->start; + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= keyseg->bit_start; + uint tmp_length= (pack_length == 1 ? (uint) *pos : + uint2korr(pos)); + pos+= pack_length; /* Skip VARCHAR length */ + set_if_smaller(length,tmp_length); + } + else if (keyseg->flag & HA_BLOB_PART) + { + uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos); + memcpy((void*) &pos,pos+keyseg->bit_start,sizeof(char*)); + if (!length || length > tmp_length) + length=tmp_length; /* The whole blob */ + } + end= pos+length; + if (type == HA_KEYTYPE_TEXT || type == HA_KEYTYPE_VARTEXT1 || + type == HA_KEYTYPE_VARTEXT2) + { + my_ci_hash_sort(keyseg->charset, + (const uchar*) pos, length, + &seed1, &seed2); + crc+= seed1; + } + else + { + my_hash_sort_bin((CHARSET_INFO*) 0, pos, (size_t) (end-pos), + &seed1, &seed2); + crc+= seed1; + } + } + return crc; +} + + +/* + compare unique key for two rows + + TODO + Add support for bit fields + + RETURN + 0 if both rows have equal unique value + 1 Rows are different +*/ + +my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b, + my_bool null_are_equal) +{ + const uchar *pos_a, *pos_b, *end; + HA_KEYSEG *keyseg; + + for (keyseg=def->seg ; keyseg < def->end ; keyseg++) + { + enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type; + uint a_length, b_length; + a_length= b_length= keyseg->length; + + /* If part is NULL it's regarded as different */ + if (keyseg->null_bit) + { + uint tmp; + if ((tmp=(a[keyseg->null_pos] & keyseg->null_bit)) != + (uint) (b[keyseg->null_pos] & keyseg->null_bit)) + return 1; + if (tmp) + { + if (!null_are_equal) + return 1; + continue; + } + } + pos_a= a+keyseg->start; + pos_b= b+keyseg->start; + if (keyseg->flag & HA_VAR_LENGTH_PART) + { + uint pack_length= keyseg->bit_start; + if (pack_length == 
1) + { + a_length= (uint) *pos_a++; + b_length= (uint) *pos_b++; + } + else + { + a_length= uint2korr(pos_a); + b_length= uint2korr(pos_b); + pos_a+= 2; /* Skip VARCHAR length */ + pos_b+= 2; + } + set_if_smaller(a_length, keyseg->length); /* Safety */ + set_if_smaller(b_length, keyseg->length); /* safety */ + } + else if (keyseg->flag & HA_BLOB_PART) + { + /* Only compare 'length' characters if length != 0 */ + a_length= _ma_calc_blob_length(keyseg->bit_start,pos_a); + b_length= _ma_calc_blob_length(keyseg->bit_start,pos_b); + /* Check that a and b are of equal length */ + if (keyseg->length) + { + /* + This is used in some cases when we are not interested in comparing + the whole length of the blob. + */ + set_if_smaller(a_length, keyseg->length); + set_if_smaller(b_length, keyseg->length); + } + memcpy((void*) &pos_a, pos_a+keyseg->bit_start, sizeof(char*)); + memcpy((void*) &pos_b, pos_b+keyseg->bit_start, sizeof(char*)); + } + if (type == HA_KEYTYPE_TEXT/* the CHAR data type*/) + { + if (ha_compare_char_fixed(keyseg->charset, + pos_a, a_length, + pos_b, b_length, + keyseg->length / keyseg->charset->mbmaxlen, + FALSE/*b_is_prefix*/)) + return 1; + } + else if (type == HA_KEYTYPE_VARTEXT1 || + type == HA_KEYTYPE_VARTEXT2) + { + if (ha_compare_char_varying(keyseg->charset, + pos_a, a_length, + pos_b, b_length, + FALSE/*b_is_prefix*/)) + return 1; + } + else + { + if (a_length != b_length) + return 1; + end= pos_a+a_length; + while (pos_a != end) + { + if (*pos_a++ != *pos_b++) + return 1; + } + } + } + return 0; +} diff --git a/storage/maria/ma_update.c b/storage/maria/ma_update.c new file mode 100644 index 00000000..60ab4452 --- /dev/null +++ b/storage/maria/ma_update.c @@ -0,0 +1,253 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "ma_fulltext.h" +#include "ma_rt_index.h" +#include "trnman.h" + +/** + Update an old row in a MARIA table +*/ + +int maria_update(register MARIA_HA *info, const uchar *oldrec, + const uchar *newrec) +{ + int flag,key_changed,save_errno; + reg3 my_off_t pos; + uint i; + uchar old_key_buff[MARIA_MAX_KEY_BUFF], *UNINIT_VAR(new_key_buff); + my_bool auto_key_changed= 0; + ulonglong UNINIT_VAR(changed); + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_update"); + + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage", + _ma_print_error(info, HA_ERR_CRASHED, 0); + DBUG_RETURN(my_errno= HA_ERR_CRASHED);); + if (!(info->update & HA_STATE_AKTIV)) + { + DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND); + } + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + if (share->state.state.key_file_length >= share->base.margin_key_file_length) + { + DBUG_RETURN(my_errno=HA_ERR_INDEX_FILE_FULL); + } + pos= info->cur_row.lastpos; + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + + if ((*share->compare_record)(info,oldrec)) + { + save_errno= my_errno; + DBUG_PRINT("warning", ("Got error from compare record")); + goto err_end; /* Record has changed */ + } + + /* Calculate and check all unique constraints */ + key_changed=0; + for (i=0 ; i < share->state.header.uniques ; i++) + { + MARIA_UNIQUEDEF *def=share->uniqueinfo+i; + if (_ma_unique_comp(def, newrec, oldrec,1) && + _ma_check_unique(info, def, newrec, _ma_unique_hash(def, newrec), + pos)) + { + 
save_errno=my_errno; + goto err_end; + } + } + + if (_ma_mark_file_changed(share)) + { + save_errno=my_errno; + goto err_end; + } + + /* Ensure we don't try to restore auto_increment if it doesn't change */ + info->last_auto_increment= ~(ulonglong) 0; + + /* Check which keys changed from the original row */ + + new_key_buff= info->lastkey_buff2; + changed=0; + for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++) + { + if (maria_is_key_active(share->state.key_map, i)) + { + if (keyinfo->flag & HA_FULLTEXT ) + { + if (_ma_ft_cmp(info,i,oldrec, newrec)) + { + if ((int) i == info->lastinx) + { + /* + We are changeing the index we are reading on. Mark that + the index data has changed and we need to do a full search + when doing read-next + */ + key_changed|=HA_STATE_WRITTEN; + } + changed|=((ulonglong) 1 << i); + if (_ma_ft_update(info,i,old_key_buff,oldrec,newrec,pos)) + goto err; + } + } + else + { + MARIA_KEY new_key, old_key; + + (*keyinfo->make_key)(info,&new_key, i, new_key_buff, newrec, + pos, info->trn->trid); + (*keyinfo->make_key)(info,&old_key, i, old_key_buff, + oldrec, pos, info->cur_row.trid); + + /* The above changed info->lastkey2. Inform maria_rnext_same(). */ + info->update&= ~HA_STATE_RNEXT_SAME; + + if (new_key.data_length != old_key.data_length || + memcmp(old_key.data, new_key.data, new_key.data_length)) + { + if ((int) i == info->lastinx) + key_changed|=HA_STATE_WRITTEN; /* Mark that keyfile changed */ + changed|=((ulonglong) 1 << i); + keyinfo->version++; + if (keyinfo->ck_delete(info,&old_key)) + goto err; + if (keyinfo->ck_insert(info,&new_key)) + goto err; + if (share->base.auto_key == i+1) + auto_key_changed=1; + } + } + } + } + + if (share->calc_checksum) + { + /* + We can't use the row based checksum as this doesn't have enough + precision (one byte, while the table's is more bytes). + At least _ma_check_unique() modifies the 'newrec' record, so checksum + has to be computed _after_ it. 
Nobody apparently modifies 'oldrec'. + We need to pass the old row's checksum down to (*update_record)(), we do + this via info->new_row.checksum (not intuitive but existing code + mandated that cur_row is the new row). + If (*update_record)() fails, table will be marked corrupted so no need + to revert the live checksum change. + */ + info->cur_row.checksum= (*share->calc_checksum)(info, newrec); + info->new_row.checksum= (*share->calc_checksum)(info, oldrec); + info->state->checksum+= info->cur_row.checksum - info->new_row.checksum; + } + + if ((*share->update_record)(info, pos, oldrec, newrec)) + goto err; + + if (auto_key_changed & !share->now_transactional) + { + const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg; + const uchar *key= newrec + keyseg->start; + set_if_bigger(share->state.auto_increment, + ma_retrieve_auto_increment(key, keyseg->type)); + } + + /* + We can't yet have HA_STATE_AKTIV here, as block_record dosn't support it + */ + info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED | key_changed); + info->row_changes++; + share->state.changed|= STATE_NOT_MOVABLE | STATE_NOT_ZEROFILLED; + info->state->changed= 1; + + /* + Every Maria function that updates Maria table must end with + call to _ma_writeinfo(). If operation (second param of + _ma_writeinfo()) is not 0 it sets share->changed to 1, that is + flags that data has changed. If operation is 0, this function + equals to no-op in this case. + + ma_update() must always pass !0 value as operation, since even if + there is no index change there could be data change. + */ + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); + if (info->invalidator != 0) + { + DBUG_PRINT("info", ("invalidator... 
'%s' (update)", + share->open_file_name.str)); + (*info->invalidator)(share->open_file_name.str); + info->invalidator=0; + } + DBUG_RETURN(0); + +err: + DBUG_PRINT("error",("key: %d errno: %d",i,my_errno)); + save_errno= my_errno; + DBUG_ASSERT(save_errno); + if (!save_errno) + save_errno= HA_ERR_INTERNAL_ERROR; /* Should never happen */ + + if (my_errno == HA_ERR_FOUND_DUPP_KEY || my_errno == HA_ERR_OUT_OF_MEM || + my_errno == HA_ERR_RECORD_FILE_FULL) + { + info->errkey= (int) i; + flag=0; + do + { + if (((ulonglong) 1 << i) & changed) + { + if (share->keyinfo[i].flag & HA_FULLTEXT) + { + if ((flag++ && _ma_ft_del(info,i,new_key_buff,newrec,pos)) || + _ma_ft_add(info,i,old_key_buff,oldrec,pos)) + { + _ma_set_fatal_error(info, my_errno); + break; + } + } + else + { + MARIA_KEY new_key, old_key; + (*share->keyinfo[i].make_key)(info, &new_key, i, new_key_buff, + newrec, pos, + info->trn->trid); + (*share->keyinfo[i].make_key)(info, &old_key, i, old_key_buff, + oldrec, pos, info->cur_row.trid); + if ((flag++ && _ma_ck_delete(info, &new_key)) || + _ma_ck_write(info, &old_key)) + { + _ma_set_fatal_error(info, my_errno); + break; + } + } + } + } while (i-- != 0); + } + else + _ma_set_fatal_error(info, save_errno); + + info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_ROW_CHANGED | + key_changed); + + err_end: + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); + if (save_errno == HA_ERR_KEY_NOT_FOUND) + _ma_set_fatal_error(info, HA_ERR_CRASHED); + DBUG_RETURN(my_errno=save_errno); +} /* maria_update */ diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c new file mode 100644 index 00000000..95cc1203 --- /dev/null +++ b/storage/maria/ma_write.c @@ -0,0 +1,2503 @@ +/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (C) 2008-2009 Sun Microsystems, Inc. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Write a row to a MARIA table */ + +#include "ma_fulltext.h" +#include "ma_rt_index.h" +#include "trnman.h" +#include "ma_key_recover.h" +#include "ma_blockrec.h" + + /* Functions declared in this file */ + +static int w_search(MARIA_HA *info, uint32 comp_flag, + MARIA_KEY *key, my_off_t page, + MARIA_PAGE *father_page, uchar *father_keypos, + my_bool insert_last); +static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_KEY *key, MARIA_PAGE *curr_page, + MARIA_PAGE *father_page, + uchar *father_key_pos, MARIA_KEY_PARAM *s_temp); +static uchar *_ma_find_last_pos(MARIA_KEY *int_key, + MARIA_PAGE *page, uchar **after_key); +static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key); +static my_bool _ma_ck_write_btree(register MARIA_HA *info, MARIA_KEY *key); +static my_bool _ma_ck_write_btree_with_log(MARIA_HA *, MARIA_KEY *, my_off_t *, + uint32); +static my_bool _ma_log_split(MARIA_PAGE *page, uint org_length, + uint new_length, + const uchar *key_pos, + uint key_length, int move_length, + enum en_key_op prefix_or_suffix, + const uchar *data, uint data_length, + uint changed_length); +static my_bool _ma_log_del_prefix(MARIA_PAGE *page, + uint org_length, uint new_length, + const uchar *key_pos, uint key_length, + int move_length); +static my_bool _ma_log_key_middle(MARIA_PAGE 
*page, + uint new_length, + uint data_added_first, + uint data_changed_first, + uint data_deleted_last, + const uchar *key_pos, + uint key_length, int move_length); + +/* + @brief Default handler for returing position to new row + + @note + This is only called for non transactional tables and not for block format + which is why we use info->state here. +*/ + +MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, + const uchar *record + __attribute__((unused))) +{ + return ((info->s->state.dellink != HA_OFFSET_ERROR && + !info->append_insert_at_end) ? + info->s->state.dellink : + info->state->data_file_length); +} + +my_bool _ma_write_abort_default(MARIA_HA *info __attribute__((unused))) +{ + return 0; +} + + +/* Write new record to a table */ + +int maria_write(MARIA_HA *info, const uchar *record) +{ + MARIA_SHARE *share= info->s; + uint i; + int save_errno; + MARIA_RECORD_POS filepos, oldpos= info->cur_row.lastpos; + uchar *buff; + my_bool lock_tree= share->lock_key_trees; + my_bool fatal_error; + MARIA_KEYDEF *keyinfo; + DBUG_ENTER("maria_write"); + DBUG_PRINT("enter",("index_file: %d data_file: %d", + share->kfile.file, info->dfile.file)); + + DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage", + _ma_print_error(info, HA_ERR_CRASHED, 0); + DBUG_RETURN(my_errno= HA_ERR_CRASHED);); + if (share->options & HA_OPTION_READ_ONLY_DATA) + { + DBUG_RETURN(my_errno=EACCES); + } + if (_ma_readinfo(info,F_WRLCK,1)) + DBUG_RETURN(my_errno); + + if ((share->state.changed & STATE_DATA_FILE_FULL) || + (share->base.reloc == (ha_rows) 1 && + share->base.records == (ha_rows) 1 && + share->state.state.records == (ha_rows) 1)) + { /* System file */ + my_errno=HA_ERR_RECORD_FILE_FULL; + goto err2; + } + if (share->state.state.key_file_length >= share->base.margin_key_file_length) + { + my_errno=HA_ERR_INDEX_FILE_FULL; + goto err2; + } + if (_ma_mark_file_changed(share)) + goto err2; + + /* Calculate and check all unique constraints */ + + if (share->state.header.uniques) + { + 
for (i=0 ; i < share->state.header.uniques ; i++) + { + MARIA_UNIQUEDEF *def= share->uniqueinfo + i; + ha_checksum unique_hash= _ma_unique_hash(share->uniqueinfo+i,record); + if (maria_is_key_active(share->state.key_map, def->key)) + { + if (_ma_check_unique(info, def, record, + unique_hash, HA_OFFSET_ERROR)) + goto err2; + } + else + maria_unique_store(record+ share->keyinfo[def->key].seg->start, + unique_hash); + } + } + + /* Ensure we don't try to restore auto_increment if it doesn't change */ + info->last_auto_increment= ~(ulonglong) 0; + + if ((info->opt_flag & OPT_NO_ROWS)) + filepos= HA_OFFSET_ERROR; + else + { + /* + This may either calculate a record or, or write the record and return + the record id + */ + if ((filepos= (*share->write_record_init)(info, record)) == + HA_OFFSET_ERROR) + goto err2; + } + + /* Write all keys to indextree */ + buff= info->lastkey_buff2; + for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++) + { + MARIA_KEY int_key; + if (maria_is_key_active(share->state.key_map, i)) + { + my_bool local_lock_tree= (lock_tree && + !(info->bulk_insert && + is_tree_inited(&info->bulk_insert[i]))); + if (local_lock_tree) + { + mysql_rwlock_wrlock(&keyinfo->root_lock); + keyinfo->version++; + } + if (keyinfo->flag & HA_FULLTEXT ) + { + if (_ma_ft_add(info,i, buff,record,filepos)) + { + if (local_lock_tree) + mysql_rwlock_unlock(&keyinfo->root_lock); + DBUG_PRINT("error",("Got error: %d on write",my_errno)); + goto err; + } + } + else + { + while (keyinfo->ck_insert(info, + (*keyinfo->make_key)(info, &int_key, i, + buff, record, filepos, + info->trn->trid))) + { + TRN *blocker; + DBUG_PRINT("error",("Got error: %d on write",my_errno)); + /* + explicit check to filter out temp tables, they aren't + transactional and don't have a proper TRN so the code + below doesn't work for them. + Also, filter out non-thread maria use, and table modified in + the same transaction. + At last, filter out non-dup-unique errors. 
+ */ + if (!local_lock_tree) + goto err; + if (info->dup_key_trid == info->trn->trid || + my_errno != HA_ERR_FOUND_DUPP_KEY) + { + mysql_rwlock_unlock(&keyinfo->root_lock); + goto err; + } + /* Different TrIDs: table must be transactional */ + DBUG_ASSERT(share->base.born_transactional); + /* + If transactions are disabled, and dup_key_trid is different from + our TrID, it must be ALTER TABLE with dup_key_trid==0 (no + transaction). ALTER TABLE does have MARIA_HA::TRN not dummy but + puts TrID=0 in rows/keys. + */ + DBUG_ASSERT(share->now_transactional || + (info->dup_key_trid == 0)); + blocker= trnman_trid_to_trn(info->trn, info->dup_key_trid); + /* + if blocker TRN was not found, it means that the conflicting + transaction was committed long time ago. It could not be + aborted, as it would have to wait on the key tree lock + to remove the conflicting key it has inserted. + */ + if (!blocker || blocker->commit_trid != ~(TrID)0) + { /* committed */ + if (blocker) + mysql_mutex_unlock(& blocker->state_lock); + mysql_rwlock_unlock(&keyinfo->root_lock); + goto err; + } + mysql_rwlock_unlock(&keyinfo->root_lock); + { + /* running. now we wait */ + WT_RESOURCE_ID rc; + int res; + PSI_stage_info old_stage_info; + + rc.type= &ma_rc_dup_unique; + /* TODO savepoint id when we'll have them */ + rc.value= (intptr)blocker; + res= wt_thd_will_wait_for(info->trn->wt, blocker->wt, & rc); + if (res != WT_OK) + { + mysql_mutex_unlock(& blocker->state_lock); + my_errno= HA_ERR_LOCK_DEADLOCK; + goto err; + } + proc_info_hook(0, &stage_waiting_for_a_resource, &old_stage_info, + __func__, __FILE__, __LINE__); + res= wt_thd_cond_timedwait(info->trn->wt, & blocker->state_lock); + proc_info_hook(0, &old_stage_info, 0, __func__, __FILE__, __LINE__); + + mysql_mutex_unlock(& blocker->state_lock); + if (res != WT_OK) + { + my_errno= res == WT_TIMEOUT ? 
HA_ERR_LOCK_WAIT_TIMEOUT + : HA_ERR_LOCK_DEADLOCK; + goto err; + } + } + mysql_rwlock_wrlock(&keyinfo->root_lock); +#ifndef MARIA_CANNOT_ROLLBACK + keyinfo->version++; +#endif + } + } + + /* The above changed info->lastkey2. Inform maria_rnext_same(). */ + info->update&= ~HA_STATE_RNEXT_SAME; + + if (local_lock_tree) + mysql_rwlock_unlock(&keyinfo->root_lock); + } + } + if (share->calc_write_checksum) + info->cur_row.checksum= (*share->calc_write_checksum)(info,record); + if (filepos != HA_OFFSET_ERROR) + { + if ((*share->write_record)(info,record)) + goto err; + info->state->checksum+= info->cur_row.checksum; + } + if (!share->now_transactional) + { + if (share->base.auto_key != 0) + { + const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg; + const uchar *key= record + keyseg->start; + set_if_bigger(share->state.auto_increment, + ma_retrieve_auto_increment(key, keyseg->type)); + } + } + info->state->records++; + info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_WRITTEN | + HA_STATE_ROW_CHANGED); + info->row_changes++; + share->state.changed|= STATE_NOT_MOVABLE | STATE_NOT_ZEROFILLED; + info->state->changed= 1; + + info->cur_row.lastpos= oldpos; + _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); + if (info->invalidator != 0) + { + DBUG_PRINT("info", ("invalidator... '%s' (update)", + share->open_file_name.str)); + (*info->invalidator)(share->open_file_name.str); + info->invalidator=0; + } + + /* + Update status of the table. We need to do so after each row write + for the log tables, as we want the new row to become visible to + other threads as soon as possible. We don't lock mutex here + (as it is required by pthread memory visibility rules) as (1) it's + not critical to use outdated share->is_log_table value (2) locking + mutex here for every write is too expensive. 
+ */ + if (share->is_log_table) + _ma_update_status((void*) info); + + DBUG_RETURN(0); + +err: + save_errno= my_errno; + fatal_error= 0; + if (my_errno == HA_ERR_FOUND_DUPP_KEY || + my_errno == HA_ERR_RECORD_FILE_FULL || + my_errno == HA_ERR_LOCK_DEADLOCK || + my_errno == HA_ERR_LOCK_WAIT_TIMEOUT || + my_errno == HA_ERR_NULL_IN_SPATIAL || + my_errno == HA_ERR_OUT_OF_MEM) + { + info->errkey= i < share->base.keys ? (int) i : -1; + /* + We delete keys in the reverse order of insertion. This is the order that + a rollback would do and is important for CLR_ENDs generated by + _ma_ft|ck_delete() and write_record_abort() to work (with any other + order they would cause wrong jumps in the chain). + */ + while ( i-- > 0) + { + if (maria_is_key_active(share->state.key_map, i)) + { + my_bool local_lock_tree= (lock_tree && + !(info->bulk_insert && + is_tree_inited(&info->bulk_insert[i]))); + keyinfo= share->keyinfo + i; + if (local_lock_tree) + mysql_rwlock_wrlock(&keyinfo->root_lock); + /** + @todo RECOVERY BUG + The key deletes below should generate CLR_ENDs + */ + if (keyinfo->flag & HA_FULLTEXT) + { + if (_ma_ft_del(info,i,buff,record,filepos)) + { + fatal_error= 1; + if (local_lock_tree) + mysql_rwlock_unlock(&keyinfo->root_lock); + break; + } + } + else + { + MARIA_KEY key; + if (keyinfo->ck_delete(info, + (*keyinfo->make_key)(info, &key, i, buff, + record, + filepos, + info->trn->trid))) + { + fatal_error= 1; + if (local_lock_tree) + mysql_rwlock_unlock(&keyinfo->root_lock); + break; + } + } + if (local_lock_tree) + mysql_rwlock_unlock(&keyinfo->root_lock); + } + } + } + else + fatal_error= 1; + + if (filepos != HA_OFFSET_ERROR) + { + if ((*share->write_record_abort)(info)) + fatal_error= 1; + } + + if (info->bulk_insert) + { + uint j; + for (j=0 ; j < share->base.keys ; j++) + maria_flush_bulk_insert(info, j); + } + + if (fatal_error) + _ma_set_fatal_error(info, HA_ERR_CRASHED); + + info->update= (HA_STATE_CHANGED | HA_STATE_WRITTEN | HA_STATE_ROW_CHANGED); + 
my_errno=save_errno; +err2: + save_errno=my_errno; + DBUG_ASSERT(save_errno); + if (!save_errno) + save_errno= HA_ERR_INTERNAL_ERROR; /* Should never happen */ + DBUG_PRINT("error", ("got error: %d", save_errno)); + _ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE); + DBUG_RETURN(my_errno=save_errno); +} /* maria_write */ + + +/* + Write one key to btree + + TODO + Remove this function and have bulk insert change keyinfo->ck_insert + to point to the right function +*/ + +my_bool _ma_ck_write(MARIA_HA *info, MARIA_KEY *key) +{ + DBUG_ENTER("_ma_ck_write"); + + if (info->bulk_insert && + is_tree_inited(&info->bulk_insert[key->keyinfo->key_nr])) + { + DBUG_RETURN(_ma_ck_write_tree(info, key)); + } + DBUG_RETURN(_ma_ck_write_btree(info, key)); +} /* _ma_ck_write */ + + +/********************************************************************** + Insert key into btree (normal case) +**********************************************************************/ + +static my_bool _ma_ck_write_btree(MARIA_HA *info, MARIA_KEY *key) +{ + my_bool error; + MARIA_KEYDEF *keyinfo= key->keyinfo; + my_off_t *root= &info->s->state.key_root[keyinfo->key_nr]; + DBUG_ENTER("_ma_ck_write_btree"); + + error= _ma_ck_write_btree_with_log(info, key, root, + keyinfo->write_comp_flag | key->flag); + if (info->ft1_to_ft2) + { + if (!error) + error= _ma_ft_convert_to_ft2(info, key); + delete_dynamic(info->ft1_to_ft2); + my_free(info->ft1_to_ft2); + info->ft1_to_ft2=0; + } + DBUG_RETURN(error); +} /* _ma_ck_write_btree */ + + +/** + @brief Write a key to the b-tree + + @retval 1 error + @retval 0 ok +*/ + +static my_bool _ma_ck_write_btree_with_log(MARIA_HA *info, MARIA_KEY *key, + my_off_t *root, uint32 comp_flag) +{ + MARIA_SHARE *share= info->s; + LSN lsn= LSN_IMPOSSIBLE; + int error; + my_off_t new_root= *root; + uchar key_buff[MARIA_MAX_KEY_BUFF]; + MARIA_KEY org_key; /* Set/used when now_transactional=TRUE */ + my_bool transactional= share->now_transactional; + DBUG_ENTER("_ma_ck_write_btree_with_log"); 
+ + LINT_INIT_STRUCT(org_key); + + if (transactional) + { + /* Save original value as the key may change */ + org_key= *key; + memcpy(key_buff, key->data, key->data_length + key->ref_length); + } + + error= _ma_ck_real_write_btree(info, key, &new_root, comp_flag); + if (!error && transactional) + { + /* Log the original value */ + *key= org_key; + key->data= key_buff; + error= _ma_write_undo_key_insert(info, key, root, new_root, &lsn); + } + else + { + *root= new_root; + _ma_fast_unlock_key_del(info); + } + _ma_unpin_all_pages_and_finalize_row(info, lsn); + + DBUG_RETURN(error != 0); +} /* _ma_ck_write_btree_with_log */ + + +/** + @brief Write a key to the b-tree + + @retval 1 error + @retval 0 ok +*/ + +my_bool _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEY *key, my_off_t *root, + uint32 comp_flag) +{ + int error; + DBUG_ENTER("_ma_ck_real_write_btree"); + + /* key_length parameter is used only if comp_flag is SEARCH_FIND */ + if (*root == HA_OFFSET_ERROR || + (error= w_search(info, comp_flag, key, *root, (MARIA_PAGE *) 0, + (uchar*) 0, 1)) > 0) + error= _ma_enlarge_root(info, key, root); + DBUG_RETURN(error != 0); +} /* _ma_ck_real_write_btree */ + + +/** + @brief Make a new root with key as only pointer + + @retval 1 error + @retval 0 ok +*/ + +my_bool _ma_enlarge_root(MARIA_HA *info, MARIA_KEY *key, my_off_t *root) +{ + uint t_length, nod_flag; + MARIA_KEY_PARAM s_temp; + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + my_bool res= 0; + DBUG_ENTER("_ma_enlarge_root"); + + page.info= info; + page.keyinfo= keyinfo; + page.buff= info->buff; + page.flag= 0; + + nod_flag= (*root != HA_OFFSET_ERROR) ? 
share->base.key_reflength : 0; + /* Store pointer to prev page if nod */ + _ma_kpointer(info, page.buff + share->keypage_header, *root); + t_length= (*keyinfo->pack_key)(key, nod_flag, (uchar*) 0, + (uchar*) 0, (uchar*) 0, &s_temp); + page.size= share->keypage_header + t_length + nod_flag; + + bzero(page.buff, share->keypage_header); + _ma_store_keynr(share, page.buff, keyinfo->key_nr); + if (nod_flag) + page.flag|= KEYPAGE_FLAG_ISNOD; + if (key->flag & (SEARCH_USER_KEY_HAS_TRANSID | SEARCH_PAGE_KEY_HAS_TRANSID)) + page.flag|= KEYPAGE_FLAG_HAS_TRANSID; + (*keyinfo->store_key)(keyinfo, page.buff + share->keypage_header + + nod_flag, &s_temp); + + /* Mark that info->buff was used */ + info->keyread_buff_used= info->page_changed= 1; + if ((page.pos= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) == + HA_OFFSET_ERROR) + DBUG_RETURN(1); + *root= page.pos; + + page_store_info(share, &page); + + /* + Clear unitialized part of page to avoid valgrind/purify warnings + and to get a clean page that is easier to compress and compare with + pages generated with redo + */ + bzero(page.buff + page.size, share->block_size - page.size); + + if (share->now_transactional && _ma_log_new(&page, 1)) + res= 1; + + if (_ma_write_keypage(&page, page_link->write_lock, + PAGECACHE_PRIORITY_HIGH)) + res= 1; + + DBUG_RETURN(res); +} /* _ma_enlarge_root */ + + +/* + Search after a position for a key and store it there + + TODO: + Change this to use pagecache directly instead of creating a copy + of the page. To do this, we must however change write-key-on-page + algorithm to not overwrite the buffer but instead store any overflow + key in a separate buffer. 
+ + @return + @retval -1 error + @retval 0 ok + @retval > 0 Key should be stored in higher tree +*/ + +static int w_search(register MARIA_HA *info, uint32 comp_flag, MARIA_KEY *key, + my_off_t page_pos, + MARIA_PAGE *father_page, uchar *father_keypos, + my_bool insert_last) +{ + int error,flag; + uchar *temp_buff,*keypos,*keybuff; + my_bool was_last_key, buff_alloced; + my_off_t next_page, dup_key_pos; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + MARIA_PAGE page; + DBUG_ENTER("w_search"); + DBUG_PRINT("enter", ("page: %lu", (ulong) (page_pos/keyinfo->block_length))); + + alloc_on_stack(*info->stack_end_ptr, temp_buff, buff_alloced, + (keyinfo->block_length + keyinfo->max_store_length*3)); + if (!temp_buff) + DBUG_RETURN(1); + + keybuff= temp_buff + (keyinfo->block_length + keyinfo->max_store_length*2); + + if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, temp_buff, 0)) + goto err; + + flag= (*keyinfo->bin_search)(key, &page, comp_flag, &keypos, + keybuff, &was_last_key); + if (flag == 0) + { + MARIA_KEY tmp_key; + /* get position to record with duplicated key */ + + tmp_key.keyinfo= keyinfo; + tmp_key.data= keybuff; + + if ((*keyinfo->get_key)(&tmp_key, page.flag, page.node, &keypos)) + dup_key_pos= _ma_row_pos_from_key(&tmp_key); + else + dup_key_pos= HA_OFFSET_ERROR; + + if (keyinfo->flag & HA_FULLTEXT) + { + uint off; + int subkeys; + + get_key_full_length_rdonly(off, keybuff); + subkeys=ft_sintXkorr(keybuff+off); + comp_flag=SEARCH_SAME; + if (subkeys >= 0) + { + /* normal word, one-level tree structure */ + flag=(*keyinfo->bin_search)(key, &page, comp_flag, + &keypos, keybuff, &was_last_key); + } + else + { + /* popular word. two-level tree. 
going down */ + my_off_t root= dup_key_pos; + MARIA_KEY subkey; + get_key_full_length_rdonly(off, key->data); + subkey.keyinfo= keyinfo= &share->ft2_keyinfo; + subkey.data= key->data + off; + subkey.data_length= key->data_length - off; + subkey.ref_length= key->ref_length; + subkey.flag= key->flag; + + /* we'll modify key entry 'in vivo' */ + keypos-= keyinfo->keylength + page.node; + error= _ma_ck_real_write_btree(info, &subkey, &root, comp_flag); + _ma_dpointer(share, keypos+HA_FT_WLEN, root); + subkeys--; /* should there be underflow protection ? */ + DBUG_ASSERT(subkeys < 0); + ft_intXstore(keypos, subkeys); + if (!error) + { + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + stack_alloc_free(temp_buff, buff_alloced); + DBUG_RETURN(error); + } + } + else /* not HA_FULLTEXT, normal HA_NOSAME key */ + { + /* + TODO + When the index will support true versioning - with multiple + identical values in the UNIQUE index, invisible to each other - + the following should be changed to "continue inserting keys, at the + end (of the row or statement) wait". We need to wait on *all* + unique conflicts at once, not one-at-a-time, because we need to + know all blockers in advance, otherwise we'll have incomplete wait-for + graph. + */ + /* + transaction that has inserted the conflicting key may be in progress. + the caller will wait for it to be committed or aborted. + */ + info->dup_key_trid= _ma_trid_from_key(&tmp_key); + info->dup_key_pos= dup_key_pos; + my_errno= HA_ERR_FOUND_DUPP_KEY; + DBUG_PRINT("warning", + ("Duplicate key. dup_key_trid: %lu pos %lu visible: %d", + (ulong) info->dup_key_trid, + (ulong) info->dup_key_pos, + info->trn ? 
trnman_can_read_from(info->trn, + info->dup_key_trid) : 2)); + goto err; + } + } + if (flag == MARIA_FOUND_WRONG_KEY) + { + my_errno= HA_ERR_CRASHED; + goto err; + } + if (!was_last_key) + insert_last=0; + next_page= _ma_kpos(page.node, keypos); + if (next_page == HA_OFFSET_ERROR || + (error= w_search(info, comp_flag, key, next_page, + &page, keypos, insert_last)) > 0) + { + error= _ma_insert(info, key, &page, keypos, keybuff, + father_page, father_keypos, insert_last); + if (error < 0) + goto err; + page_mark_changed(info, &page); + if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS)) + goto err; + } + stack_alloc_free(temp_buff, buff_alloced); + DBUG_RETURN(error); +err: + stack_alloc_free(temp_buff, buff_alloced); + DBUG_PRINT("exit",("Error: %d",my_errno)); + DBUG_RETURN(-1); +} /* w_search */ + + +/* + Insert new key. + + SYNOPSIS + _ma_insert() + info Open table information. + keyinfo Key definition information. + key New key + anc_page Key page (beginning) + key_pos Position in key page where to insert. + key_buff Copy of previous key if keys where packed. + father_page position of parent key page in file. + father_key_pos position in parent key page for balancing. + insert_last If to append at end of page. + + DESCRIPTION + Insert new key at right of key_pos. + Note that caller must save anc_buff + + This function writes log records for all changed pages + (Including anc_buff and father page) + + RETURN + < 0 Error. 
+ 0 OK + 1 If key contains key to upper level (from balance page) + 2 If key contains key to upper level (from split space) +*/ + +int _ma_insert(register MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *anc_page, uchar *key_pos, uchar *key_buff, + MARIA_PAGE *father_page, uchar *father_key_pos, + my_bool insert_last) +{ + uint a_length, nod_flag, org_anc_length; + int t_length; + uchar *endpos, *prev_key, *anc_buff; + MARIA_KEY_PARAM s_temp; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + DBUG_ENTER("_ma_insert"); + DBUG_PRINT("enter",("key_pos:%p", key_pos)); + DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key);); + + /* + Note that anc_page->size can be bigger then block_size in case of + delete key that caused increase of page length + */ + org_anc_length= a_length= anc_page->size; + nod_flag= anc_page->node; + + anc_buff= anc_page->buff; + endpos= anc_buff+ a_length; + prev_key= (key_pos == anc_buff + share->keypage_header + nod_flag ? + (uchar*) 0 : key_buff); + t_length= (*keyinfo->pack_key)(key, nod_flag, + (key_pos == endpos ? 
(uchar*) 0 : key_pos), + prev_key, prev_key, &s_temp); +#ifndef DBUG_OFF + if (prev_key && (keyinfo->flag & (HA_BINARY_PACK_KEY | HA_PACK_KEY))) + { + DBUG_DUMP("prev_key", prev_key, _ma_keylength(keyinfo,prev_key)); + } + if (keyinfo->flag & HA_PACK_KEY) + { + DBUG_PRINT("test",("t_length: %d ref_len: %d", + t_length,s_temp.ref_length)); + DBUG_PRINT("test",("n_ref_len: %d n_length: %d key_pos: %p", + s_temp.n_ref_length, s_temp.n_length, s_temp.key)); + } +#endif + if (t_length > 0) + { + if (t_length >= keyinfo->maxlength*2+MARIA_INDEX_OVERHEAD_SIZE) + { + _ma_set_fatal_error(info, HA_ERR_CRASHED); + DBUG_RETURN(-1); + } + bmove_upp(endpos+t_length, endpos, (uint) (endpos-key_pos)); + } + else + { + if (-t_length >= keyinfo->maxlength*2+MARIA_INDEX_OVERHEAD_SIZE) + { + _ma_set_fatal_error(info, HA_ERR_CRASHED); + DBUG_RETURN(-1); + } + bmove(key_pos,key_pos-t_length,(uint) (endpos-key_pos)+t_length); + } + (*keyinfo->store_key)(keyinfo,key_pos,&s_temp); + a_length+=t_length; + + if (key->flag & (SEARCH_USER_KEY_HAS_TRANSID | SEARCH_PAGE_KEY_HAS_TRANSID)) + _ma_mark_page_with_transid(share, anc_page); + + anc_page->size= a_length; + page_store_size(share, anc_page); + + /* + Check if the new key fits totally into the the page + (anc_buff is big enough to contain a full page + one key) + */ + if (a_length <= share->max_index_block_size) + { + if (share->max_index_block_size - a_length < 32 && + (keyinfo->flag & HA_FULLTEXT) && key_pos == endpos && + share->base.key_reflength <= share->rec_reflength && + share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) + { + /* + Normal word. One-level tree. Page is almost full. + Let's consider converting. 
+ We'll compare 'key' and the first key at anc_buff + */ + const uchar *a= key->data; + const uchar *b= anc_buff + share->keypage_header + nod_flag; + uint alen, blen, ft2len= share->ft2_keyinfo.keylength; + /* the very first key on the page is always unpacked */ + DBUG_ASSERT((*b & 128) == 0); +#if HA_FT_MAXLEN >= 127 + blen= mi_uint2korr(b); b+=2; + When you enable this code, as part of the MyISAM->Maria merge of +ChangeSet@1.2562, 2008-04-09 07:41:40+02:00, serg@janus.mylan +9 -0 + restore ft2 functionality, fix bugs. + Then this will enable two-level fulltext index, which is not totally + recoverable yet. + So remove this text and inform Guilhem so that he fixes the issue. +#else + blen= *b++; +#endif + get_key_length(alen,a); + DBUG_ASSERT(info->ft1_to_ft2==0); + if (alen == blen && + ha_compare_char_varying(keyinfo->seg->charset, + a, alen, + b, blen, + FALSE/*b_is_prefix*/) == 0) + { + /* Yup. converting */ + info->ft1_to_ft2=(DYNAMIC_ARRAY *) + my_malloc(PSI_INSTRUMENT_ME, sizeof(DYNAMIC_ARRAY), MYF(MY_WME)); + my_init_dynamic_array(PSI_INSTRUMENT_ME, info->ft1_to_ft2, ft2len, 300, + 50, MYF(0)); + + /* + Now, adding all keys from the page to dynarray + if the page is a leaf (if not keys will be deleted later) + */ + if (!nod_flag) + { + /* + Let's leave the first key on the page, though, because + we cannot easily dispatch an empty page here + */ + b+=blen+ft2len+2; + for (a=anc_buff+a_length ; b < a ; b+=ft2len+2) + insert_dynamic(info->ft1_to_ft2, b); + + /* fixing the page's length - it contains only one key now */ + anc_page->size= share->keypage_header + blen + ft2len + 2; + page_store_size(share, anc_page); + } + /* the rest will be done when we're back from recursion */ + } + } + else + { + if (share->now_transactional && + _ma_log_add(anc_page, org_anc_length, + key_pos, s_temp.changed_length, t_length, 1, + KEY_OP_DEBUG_LOG_ADD_1)) + DBUG_RETURN(-1); + } + DBUG_RETURN(0); /* There is room on page */ + } + /* Page is full */ + if (nod_flag) + 
insert_last=0; + /* + TODO: + Remove 'born_transactional' here. + The only reason for having it here is that the current + _ma_balance_page_ can't handle variable length keys. + */ + if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && + father_page && !insert_last && !info->quick_mode && + !info->s->base.born_transactional) + { + s_temp.key_pos= key_pos; + page_mark_changed(info, father_page); + DBUG_RETURN(_ma_balance_page(info, keyinfo, key, anc_page, + father_page, father_key_pos, + &s_temp)); + } + DBUG_RETURN(_ma_split_page(info, key, anc_page, + MY_MIN(org_anc_length, + info->s->max_index_block_size), + key_pos, s_temp.changed_length, t_length, + key_buff, insert_last)); +} /* _ma_insert */ + + +/** + @brief split a full page in two and assign emerging item to key + + @fn _ma_split_page() + info Maria handler + keyinfo Key handler + key Buffer for middle key + split_page Page that should be split + org_split_length Original length of split_page before key was inserted + inserted_key_pos Address in buffer where key was inserted + changed_length Number of bytes changed at 'inserted_key_pos' + move_length Number of bytes buffer was moved when key was inserted + key_buff Key buffer to use for temporary storage of key + insert_last_key If we are insert key on rightmost key page + + @note + split_buff is not stored on disk (caller has to do this) + + @return + @retval 2 ok (Middle key up from _ma_insert()) + @retval -1 error +*/ + +int _ma_split_page(MARIA_HA *info, MARIA_KEY *key, MARIA_PAGE *split_page, + uint org_split_length, + uchar *inserted_key_pos, uint changed_length, + int move_length, + uchar *key_buff, my_bool insert_last_key) +{ + uint keynr; + uint length,a_length,key_ref_length,t_length,nod_flag,key_length; + uint page_length, split_length, page_flag; + uchar *key_pos, *pos, *UNINIT_VAR(after_key); + MARIA_KEY_PARAM s_temp; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF 
*keyinfo= key->keyinfo; + MARIA_KEY tmp_key; + MARIA_PAGE new_page; + int res; + DBUG_ENTER("_ma_split_page"); + + DBUG_DUMP("buff", split_page->buff, split_page->size); + + info->page_changed=1; /* Info->buff is used */ + info->keyread_buff_used=1; + page_flag= split_page->flag; + nod_flag= split_page->node; + key_ref_length= share->keypage_header + nod_flag; + + new_page.info= info; + new_page.buff= info->buff; + new_page.keyinfo= keyinfo; + + tmp_key.data= key_buff; + tmp_key.keyinfo= keyinfo; + if (insert_last_key) + key_pos= _ma_find_last_pos(&tmp_key, split_page, &after_key); + else + key_pos= _ma_find_half_pos(&tmp_key, split_page, &after_key); + if (!key_pos) + DBUG_RETURN(-1); + + key_length= tmp_key.data_length + tmp_key.ref_length; + split_length= (uint) (key_pos - split_page->buff); + a_length= split_page->size; + split_page->size= split_length; + page_store_size(share, split_page); + + key_pos=after_key; + if (nod_flag) + { + DBUG_PRINT("test",("Splitting nod")); + pos=key_pos-nod_flag; + memcpy(new_page.buff + share->keypage_header, pos, (size_t) nod_flag); + } + + /* Move middle item to key and pointer to new page */ + if ((new_page.pos= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) == + HA_OFFSET_ERROR) + DBUG_RETURN(-1); + + _ma_copy_key(key, &tmp_key); + _ma_kpointer(info, key->data + key_length, new_page.pos); + + /* Store new page */ + if (!(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &key_pos)) + DBUG_RETURN(-1); + + t_length=(*keyinfo->pack_key)(&tmp_key, nod_flag, (uchar *) 0, + (uchar*) 0, (uchar*) 0, &s_temp); + length=(uint) ((split_page->buff + a_length) - key_pos); + memcpy(new_page.buff + key_ref_length + t_length, key_pos, + (size_t) length); + (*keyinfo->store_key)(keyinfo,new_page.buff+key_ref_length,&s_temp); + page_length= length + t_length + key_ref_length; + + bzero(new_page.buff, share->keypage_header); + /* Copy KEYFLAG_FLAG_ISNODE and KEYPAGE_FLAG_HAS_TRANSID from parent page */ + new_page.flag= page_flag; + 
new_page.size= page_length; + page_store_info(share, &new_page); + + /* Copy key number */ + keynr= _ma_get_keynr(share, split_page->buff); + _ma_store_keynr(share, new_page.buff, keynr); + + res= 2; /* Middle key up */ + if (share->now_transactional && _ma_log_new(&new_page, 0)) + res= -1; + + /* + Clear unitialized part of page to avoid valgrind/purify warnings + and to get a clean page that is easier to compress and compare with + pages generated with redo + */ + bzero(new_page.buff + page_length, share->block_size - page_length); + + if (_ma_write_keypage(&new_page, page_link->write_lock, + DFLT_INIT_HITS)) + res= -1; + + /* Save changes to split pages */ + if (share->now_transactional && + _ma_log_split(split_page, org_split_length, split_length, + inserted_key_pos, changed_length, move_length, + KEY_OP_NONE, (uchar*) 0, 0, 0)) + res= -1; + + DBUG_DUMP_KEY("middle_key", key); + DBUG_RETURN(res); +} /* _ma_split_page */ + + +/* + Calculate how to much to move to split a page in two + + Returns pointer to start of key. + key will contain the key. 
+ after_key will contain the position to where the next key starts +*/ + +uchar *_ma_find_half_pos(MARIA_KEY *key, MARIA_PAGE *ma_page, + uchar **after_key) +{ + uint keys, length, key_ref_length, page_flag, nod_flag; + uchar *page, *end, *lastpos; + MARIA_HA *info= ma_page->info; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + DBUG_ENTER("_ma_find_half_pos"); + + nod_flag= ma_page->node; + key_ref_length= share->keypage_header + nod_flag; + page_flag= ma_page->flag; + length= ma_page->size - key_ref_length; + page= ma_page->buff+ key_ref_length; /* Point to first key */ + + if (!(keyinfo->flag & + (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY | + HA_BINARY_PACK_KEY)) && !(page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + key_ref_length= keyinfo->keylength+nod_flag; + key->data_length= keyinfo->keylength - info->s->rec_reflength; + key->ref_length= info->s->rec_reflength; + key->flag= 0; + keys=length/(key_ref_length*2); + end=page+keys*key_ref_length; + *after_key=end+key_ref_length; + memcpy(key->data, end, key_ref_length); + DBUG_RETURN(end); + } + + end=page+length/2-key_ref_length; /* This is aprox. half */ + key->data[0]= 0; /* Safety */ + do + { + lastpos=page; + if (!(length= (*keyinfo->get_key)(key, page_flag, nod_flag, &page))) + DBUG_RETURN(0); + } while (page < end); + *after_key= page; + DBUG_PRINT("exit",("returns: %p page: %p half: %p", + lastpos, page, end)); + DBUG_RETURN(lastpos); +} /* _ma_find_half_pos */ + + +/** + Find second to last key on leaf page + + @notes + Used to split buffer at last key. In this case the next to last + key will be moved to parent page and last key will be on it's own page. 
+ + @TODO + Add one argument for 'last key value' to get_key so that one can + do the loop without having to copy the found key the whole time + + @return + @retval Pointer to the start of the key before the last key + @retval int_key will contain the last key +*/ + +static uchar *_ma_find_last_pos(MARIA_KEY *int_key, MARIA_PAGE *ma_page, + uchar **after_key) +{ + uint keys, length, key_ref_length, page_flag; + uchar *page, *end, *lastpos, *prevpos; + uchar key_buff[MARIA_MAX_KEY_BUFF]; + MARIA_HA *info= ma_page->info; + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= int_key->keyinfo; + MARIA_KEY tmp_key; + DBUG_ENTER("_ma_find_last_pos"); + + key_ref_length= share->keypage_header; + page_flag= ma_page->flag; + length= ma_page->size - key_ref_length; + page= ma_page->buff + key_ref_length; + + if (!(keyinfo->flag & + (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY | + HA_BINARY_PACK_KEY)) && !(page_flag & KEYPAGE_FLAG_HAS_TRANSID)) + { + keys= length / keyinfo->keylength - 2; + length= keyinfo->keylength; + int_key->data_length= length - info->s->rec_reflength; + int_key->ref_length= info->s->rec_reflength; + int_key->flag= 0; + end=page+keys*length; + *after_key=end+length; + memcpy(int_key->data, end, length); + DBUG_RETURN(end); + } + + end=page+length-key_ref_length; + lastpos=page; + tmp_key.data= key_buff; + tmp_key.keyinfo= int_key->keyinfo; + key_buff[0]= 0; /* Safety */ + + /* We know that there are at least 2 keys on the page */ + + if (!(length=(*keyinfo->get_key)(&tmp_key, page_flag, 0, &page))) + { + _ma_set_fatal_error(info, HA_ERR_CRASHED); + DBUG_RETURN(0); + } + + do + { + prevpos=lastpos; lastpos=page; + int_key->data_length= tmp_key.data_length; + int_key->ref_length= tmp_key.ref_length; + int_key->flag= tmp_key.flag; + memcpy(int_key->data, key_buff, length); /* previous key */ + if (!(length=(*keyinfo->get_key)(&tmp_key, page_flag, 0, &page))) + { + _ma_set_fatal_error(info, HA_ERR_CRASHED); + DBUG_RETURN(0); + } + } while (page < 
end); + + *after_key=lastpos; + DBUG_PRINT("exit",("returns: %p page: %p end: %p", + prevpos,page,end)); + DBUG_RETURN(prevpos); +} /* _ma_find_last_pos */ + + +/** + @brief Balance page with static size keys with page on right/left + + @param key Middle key will be stored here + + @notes + Father_buff will always be changed + Caller must handle saving of curr_buff + + @return + @retval 0 Balance was done (father buff is saved) + @retval 1 Middle key up (father buff is not saved) + @retval -1 Error +*/ + +static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + MARIA_KEY *key, MARIA_PAGE *curr_page, + MARIA_PAGE *father_page, + uchar *father_key_pos, MARIA_KEY_PARAM *s_temp) +{ + MARIA_PINNED_PAGE tmp_page_link, *new_page_link= &tmp_page_link; + MARIA_SHARE *share= info->s; + my_bool right, buff_alloced; + uint k_length,father_length,father_keylength,nod_flag,curr_keylength; + uint right_length,left_length,new_right_length,new_left_length,extra_length; + uint keys, tmp_length, extra_buff_length; + uchar *pos, *extra_buff, *parting_key; + uchar *tmp_part_key; + MARIA_PAGE next_page, extra_page, *left_page, *right_page; + DBUG_ENTER("_ma_balance_page"); + + alloc_on_stack(*info->stack_end_ptr, tmp_part_key, buff_alloced, + keyinfo->max_store_length); + if (!tmp_part_key) + DBUG_RETURN(-1); + + k_length= keyinfo->keylength; + father_length= father_page->size; + father_keylength= k_length + share->base.key_reflength; + nod_flag= curr_page->node; + curr_keylength= k_length+nod_flag; + info->page_changed=1; + + if ((father_key_pos != father_page->buff+father_length && + (info->state->records & 1)) || + father_key_pos == father_page->buff+ share->keypage_header + + share->base.key_reflength) + { + right=1; + next_page.pos= _ma_kpos(share->base.key_reflength, + father_key_pos+father_keylength); + left_page= curr_page; + right_page= &next_page; + DBUG_PRINT("info", ("use right page: %lu", + (ulong) (next_page.pos / keyinfo->block_length))); + } + else + { + 
right=0; + father_key_pos-=father_keylength; + next_page.pos= _ma_kpos(share->base.key_reflength,father_key_pos); + left_page= &next_page; + right_page= curr_page; + DBUG_PRINT("info", ("use left page: %lu", + (ulong) (next_page.pos / keyinfo->block_length))); + } /* father_key_pos ptr to parting key */ + + if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos, + PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, info->buff, 0)) + goto err; + page_mark_changed(info, &next_page); + DBUG_DUMP("next", next_page.buff, next_page.size); + + /* Test if there is room to share keys */ + left_length= left_page->size; + right_length= right_page->size; + keys= ((left_length+right_length-share->keypage_header*2-nod_flag*2)/ + curr_keylength); + + if ((right ? right_length : left_length) + curr_keylength <= + share->max_index_block_size) + { + /* Enough space to hold all keys in the two buffers ; Balance bufferts */ + new_left_length= share->keypage_header+nod_flag+(keys/2)*curr_keylength; + new_right_length=share->keypage_header+nod_flag+(((keys+1)/2)* + curr_keylength); + left_page->size= new_left_length; + page_store_size(share, left_page); + right_page->size= new_right_length; + page_store_size(share, right_page); + + DBUG_PRINT("info", ("left_length: %u -> %u right_length: %u -> %u", + left_length, new_left_length, + right_length, new_right_length)); + if (left_length < new_left_length) + { + uint length; + DBUG_PRINT("info", ("move keys to end of buff")); + + /* Move keys right_page -> left_page */ + pos= left_page->buff+left_length; + memcpy(pos,father_key_pos, (size_t) k_length); + memcpy(pos+k_length, right_page->buff + share->keypage_header, + (size_t) (length=new_left_length - left_length - k_length)); + pos= right_page->buff + share->keypage_header + length; + memcpy(father_key_pos, pos, (size_t) k_length); + bmove(right_page->buff + share->keypage_header, + pos + k_length, new_right_length - share->keypage_header); + + if (share->now_transactional) + { + if (right) + 
{ + /* + Log changes to page on left + The original page is on the left and stored in left_page->buff + We have on the page the newly inserted key and data + from buff added last on the page + */ + if (_ma_log_split(curr_page, + left_length - s_temp->move_length, + new_left_length, + s_temp->key_pos, s_temp->changed_length, + s_temp->move_length, + KEY_OP_ADD_SUFFIX, + curr_page->buff + left_length, + new_left_length - left_length, + new_left_length - left_length+ k_length)) + goto err; + /* + Log changes to page on right + This contains the original data with some keys deleted from + start of page + */ + if (_ma_log_prefix(&next_page, 0, + ((int) new_right_length - (int) right_length), + KEY_OP_DEBUG_LOG_PREFIX_3)) + goto err; + } + else + { + /* + Log changes to page on right (the original page) which is in buff + Data is removed from start of page + The inserted key may be in buff or moved to curr_buff + */ + if (_ma_log_del_prefix(curr_page, + right_length - s_temp->changed_length, + new_right_length, + s_temp->key_pos, s_temp->changed_length, + s_temp->move_length)) + goto err; + /* + Log changes to page on left, which has new data added last + */ + if (_ma_log_suffix(&next_page, left_length, new_left_length)) + goto err; + } + } + } + else + { + uint length; + DBUG_PRINT("info", ("move keys to start of right_page")); + + bmove_upp(right_page->buff + new_right_length, + right_page->buff + right_length, + right_length - share->keypage_header); + length= new_right_length -right_length - k_length; + memcpy(right_page->buff + share->keypage_header + length, father_key_pos, + (size_t) k_length); + pos= left_page->buff + new_left_length; + memcpy(father_key_pos, pos, (size_t) k_length); + memcpy(right_page->buff + share->keypage_header, pos+k_length, + (size_t) length); + + if (share->now_transactional) + { + if (right) + { + /* + Log changes to page on left + The original page is on the left and stored in curr_buff + The page is shortened from end and the key may 
be on the page + */ + if (_ma_log_split(curr_page, + left_length - s_temp->move_length, + new_left_length, + s_temp->key_pos, s_temp->changed_length, + s_temp->move_length, + KEY_OP_NONE, (uchar*) 0, 0, 0)) + goto err; + /* + Log changes to page on right + This contains the original data, with some data from cur_buff + added first + */ + if (_ma_log_prefix(&next_page, + (uint) (new_right_length - right_length), + (int) (new_right_length - right_length), + KEY_OP_DEBUG_LOG_PREFIX_4)) + goto err; + } + else + { + /* + Log changes to page on right (the original page) which is in buff + We have on the page the newly inserted key and data + from buff added first on the page + */ + uint diff_length= new_right_length - right_length; + if (_ma_log_split(curr_page, + left_length - s_temp->move_length, + new_right_length, + s_temp->key_pos + diff_length, + s_temp->changed_length, + s_temp->move_length, + KEY_OP_ADD_PREFIX, + curr_page->buff + share->keypage_header, + diff_length, diff_length + k_length)) + goto err; + /* + Log changes to page on left, which is shortened from end + */ + if (_ma_log_suffix(&next_page, left_length, new_left_length)) + goto err; + } + } + } + + /* Log changes to father (one level up) page */ + + if (share->now_transactional && + _ma_log_change(father_page, father_key_pos, k_length, + KEY_OP_DEBUG_FATHER_CHANGED_1)) + goto err; + + /* + next_page_link->changed is marked as true above and fathers + page_link->changed is marked as true in caller + */ + if (_ma_write_keypage(&next_page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS) || + _ma_write_keypage(father_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS)) + goto err; + stack_alloc_free(tmp_part_key, buff_alloced); + DBUG_RETURN(0); + } + + /* left_page and right_page are full, lets split and make new nod */ + + extra_buff= info->buff+share->base.max_key_block_length; + new_left_length= new_right_length= (share->keypage_header + nod_flag + + (keys+1) / 3 * curr_keylength); + 
extra_page.info= info; + extra_page.keyinfo= keyinfo; + extra_page.buff= extra_buff; + + /* + 5 is the minum number of keys we can have here. This comes from + the fact that each full page can store at least 2 keys and in this case + we have a 'split' key, ie 2+2+1 = 5 + */ + if (keys == 5) /* Too few keys to balance */ + new_left_length-=curr_keylength; + extra_length= (nod_flag + left_length + right_length - + new_left_length - new_right_length - curr_keylength); + extra_buff_length= extra_length + share->keypage_header; + DBUG_PRINT("info",("left_length: %d right_length: %d new_left_length: %d new_right_length: %d extra_length: %d", + left_length, right_length, + new_left_length, new_right_length, + extra_length)); + + left_page->size= new_left_length; + page_store_size(share, left_page); + right_page->size= new_right_length; + page_store_size(share, right_page); + + bzero(extra_buff, share->keypage_header); + extra_page.flag= nod_flag ? KEYPAGE_FLAG_ISNOD : 0; + extra_page.size= extra_buff_length; + page_store_info(share, &extra_page); + + /* Copy key number */ + _ma_store_keynr(share, extra_buff, keyinfo->key_nr); + + /* move first largest keys to new page */ + pos= right_page->buff + right_length-extra_length; + memcpy(extra_buff + share->keypage_header, pos, extra_length); + /* Zero old data from buffer */ + bzero(extra_buff + extra_buff_length, + share->block_size - extra_buff_length); + + /* Save new parting key between buff and extra_buff */ + memcpy(tmp_part_key, pos-k_length,k_length); + /* Make place for new keys */ + bmove_upp(right_page->buff + new_right_length, pos - k_length, + right_length - extra_length - k_length - share->keypage_header); + /* Copy keys from left page */ + pos= left_page->buff + new_left_length; + memcpy(right_page->buff + share->keypage_header, pos + k_length, + (size_t) (tmp_length= left_length - new_left_length - k_length)); + /* Copy old parting key */ + parting_key= right_page->buff + share->keypage_header + tmp_length; + 
memcpy(parting_key, father_key_pos, (size_t) k_length); + + /* Move new parting keys up to caller */ + memcpy((right ? key->data : father_key_pos),pos,(size_t) k_length); + memcpy((right ? father_key_pos : key->data),tmp_part_key, k_length); + + if ((extra_page.pos= _ma_new(info, DFLT_INIT_HITS, &new_page_link)) + == HA_OFFSET_ERROR) + goto err; + _ma_kpointer(info,key->data+k_length, extra_page.pos); + /* This is safe as long we are using not keys with transid */ + key->data_length= k_length - info->s->rec_reflength; + key->ref_length= info->s->rec_reflength; + + if (right) + { + /* + Page order according to key values: + orignal_page (curr_page = left_page), next_page (buff), extra_buff + + Move page positions so that we store data in extra_page where + next_page was and next_page will be stored at the new position + */ + swap_variables(my_off_t, extra_page.pos, next_page.pos); + } + + if (share->now_transactional) + { + if (right) + { + /* + left_page is shortened, + right_page is getting new keys at start and shortened from end. 
+ extra_page is new page + + Note that extra_page (largest key parts) will be stored at the + place of the original 'right' page (next_page) and right page + will be stored at the new page position + + This makes the log entries smaller as right_page contains all + data to generate the data extra_buff + */ + + /* + Log changes to page on left (page shortened page at end) + */ + if (_ma_log_split(curr_page, + left_length - s_temp->move_length, new_left_length, + s_temp->key_pos, s_temp->changed_length, + s_temp->move_length, + KEY_OP_NONE, (uchar*) 0, 0, 0)) + goto err; + /* + Log changes to right page (stored at next page) + This contains the last 'extra_buff' from 'buff' + */ + if (_ma_log_prefix(&extra_page, + 0, (int) (extra_buff_length - right_length), + KEY_OP_DEBUG_LOG_PREFIX_5)) + goto err; + + /* + Log changes to middle page, which is stored at the new page + position + */ + if (_ma_log_new(&next_page, 0)) + goto err; + } + else + { + /* + Log changes to page on right (the original page) which is in buff + This contains the original data, with some data from curr_buff + added first and shortened at end + */ + int data_added_first= left_length - new_left_length; + if (_ma_log_key_middle(right_page, + new_right_length, + data_added_first, + data_added_first, + extra_length, + s_temp->key_pos, + s_temp->changed_length, + s_temp->move_length)) + goto err; + + /* Log changes to page on left, which is shortened from end */ + if (_ma_log_suffix(left_page, left_length, new_left_length)) + goto err; + + /* Log change to rightmost (new) page */ + if (_ma_log_new(&extra_page, 0)) + goto err; + } + + /* Log changes to father (one level up) page */ + if (share->now_transactional && + _ma_log_change(father_page, father_key_pos, k_length, + KEY_OP_DEBUG_FATHER_CHANGED_2)) + goto err; + } + + if (_ma_write_keypage(&next_page, + (right ? new_page_link->write_lock : + PAGECACHE_LOCK_LEFT_WRITELOCKED), + DFLT_INIT_HITS) || + _ma_write_keypage(&extra_page, + (!right ? 
new_page_link->write_lock : + PAGECACHE_LOCK_LEFT_WRITELOCKED), + DFLT_INIT_HITS)) + goto err; + + stack_alloc_free(tmp_part_key, buff_alloced); + DBUG_RETURN(1); /* Middle key up */ + +err: + stack_alloc_free(tmp_part_key, buff_alloced); + DBUG_RETURN(-1); +} /* _ma_balance_page */ + + +/********************************************************************** + * Bulk insert code * + **********************************************************************/ + +typedef struct { + MARIA_HA *info; + uint keynr; +} bulk_insert_param; + + +static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key) +{ + my_bool error; + uint keynr= key->keyinfo->key_nr; + DBUG_ENTER("_ma_ck_write_tree"); + + /* Store ref_length as this is always constant */ + info->bulk_insert_ref_length= key->ref_length; + error= tree_insert(&info->bulk_insert[keynr], key->data, + key->data_length + key->ref_length, + info->bulk_insert[keynr].custom_arg) == 0; + DBUG_RETURN(error); +} /* _ma_ck_write_tree */ + + +/* typeof(_ma_keys_compare)=qsort_cmp2 */ + +static int keys_compare(bulk_insert_param *param, uchar *key1, uchar *key2) +{ + uint not_used[2]; + return ha_key_cmp(param->info->s->keyinfo[param->keynr].seg, + key1, key2, USE_WHOLE_KEY, SEARCH_SAME, + not_used); +} + + +static int keys_free(void* key_arg, TREE_FREE mode, void *param_arg) +{ + /* + Probably I can use info->lastkey here, but I'm not sure, + and to be safe I'd better use local lastkey. 
+ */ + bulk_insert_param *param= (bulk_insert_param*)param_arg; + MARIA_SHARE *share= param->info->s; + uchar lastkey[MARIA_MAX_KEY_BUFF], *key= (uchar*)key_arg; + uint keylen; + MARIA_KEYDEF *keyinfo= share->keyinfo + param->keynr; + MARIA_KEY tmp_key; + + switch (mode) { + case free_init: + if (share->lock_key_trees) + { + mysql_rwlock_wrlock(&keyinfo->root_lock); + keyinfo->version++; + } + return 0; + case free_free: + /* Note: keylen doesn't contain transid lengths */ + keylen= _ma_keylength(keyinfo, key); + tmp_key.data= lastkey; + tmp_key.keyinfo= keyinfo; + tmp_key.data_length= keylen - share->rec_reflength; + tmp_key.ref_length= param->info->bulk_insert_ref_length; + tmp_key.flag= (param->info->bulk_insert_ref_length == + share->rec_reflength ? 0 : SEARCH_USER_KEY_HAS_TRANSID); + /* + We have to copy key as ma_ck_write_btree may need the buffer for + copying middle key up if tree is growing + */ + memcpy(lastkey, key, tmp_key.data_length + tmp_key.ref_length); + _ma_ck_write_btree(param->info, &tmp_key); + return 0; + case free_end: + if (share->lock_key_trees) + mysql_rwlock_unlock(&keyinfo->root_lock); + return 0; + } + return 0; +} + + +int maria_init_bulk_insert(MARIA_HA *info, size_t cache_size, ha_rows rows) +{ + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *key=share->keyinfo; + bulk_insert_param *params; + uint i, num_keys, total_keylength; + ulonglong key_map; + DBUG_ENTER("_ma_init_bulk_insert"); + DBUG_PRINT("enter",("cache_size: %lu", (ulong) cache_size)); + + DBUG_ASSERT(!info->bulk_insert && + (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT)); + + maria_clear_all_keys_active(key_map); + for (i=total_keylength=num_keys=0 ; i < share->base.keys ; i++) + { + if (! 
(key[i].flag & HA_NOSAME) && (share->base.auto_key != i + 1) && + maria_is_key_active(share->state.key_map, i)) + { + num_keys++; + maria_set_key_active(key_map, i); + total_keylength+=key[i].maxlength+TREE_ELEMENT_EXTRA_SIZE; + } + } + + if (num_keys==0 || + num_keys * (size_t) MARIA_MIN_SIZE_BULK_INSERT_TREE > cache_size) + DBUG_RETURN(0); + + if (rows && rows*total_keylength < cache_size) + cache_size= (size_t)rows; + else + cache_size/=total_keylength*16; + + info->bulk_insert=(TREE *) + my_malloc(PSI_INSTRUMENT_ME, (sizeof(TREE)*share->base.keys+ + sizeof(bulk_insert_param)*num_keys),MYF(0)); + + if (!info->bulk_insert) + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + + params=(bulk_insert_param *)(info->bulk_insert+share->base.keys); + for (i=0 ; i < share->base.keys ; i++) + { + if (maria_is_key_active(key_map, i)) + { + params->info=info; + params->keynr=i; + /* Only allocate a 16'th of the buffer at a time */ + init_tree(&info->bulk_insert[i], + cache_size * key[i].maxlength, + cache_size * key[i].maxlength, 0, + (qsort_cmp2) keys_compare, keys_free, (void *)params++, MYF(0)); + } + else + info->bulk_insert[i].root=0; + } + + DBUG_RETURN(0); +} + +void maria_flush_bulk_insert(MARIA_HA *info, uint inx) +{ + if (info->bulk_insert) + { + if (is_tree_inited(&info->bulk_insert[inx])) + reset_tree(&info->bulk_insert[inx]); + } +} + + +int maria_end_bulk_insert(MARIA_HA *info, my_bool abort) +{ + int first_error= 0; + DBUG_ENTER("maria_end_bulk_insert"); + if (info->bulk_insert) + { + uint i; + for (i=0 ; i < info->s->base.keys ; i++) + { + if (is_tree_inited(&info->bulk_insert[i])) + { + int error; + if (info->s->deleting) + reset_free_element(&info->bulk_insert[i]); + if ((error= delete_tree(&info->bulk_insert[i], abort))) + { + first_error= first_error ? 
first_error : error; + abort= 1; + } + } + } + my_free(info->bulk_insert); + info->bulk_insert= 0; + } + DBUG_RETURN(first_error); +} + + +/**************************************************************************** + Dedicated functions that generate log entries +****************************************************************************/ + + +int _ma_write_undo_key_insert(MARIA_HA *info, const MARIA_KEY *key, + my_off_t *root, my_off_t new_root, LSN *res_lsn) +{ + MARIA_SHARE *share= info->s; + MARIA_KEYDEF *keyinfo= key->keyinfo; + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + + KEY_NR_STORE_SIZE]; + const uchar *key_value; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + struct st_msg_to_write_hook_for_undo_key msg; + uint key_length; + + /* Save if we need to write a clr record */ + lsn_store(log_data, info->trn->undo_lsn); + key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, + keyinfo->key_nr); + key_length= key->data_length + key->ref_length; + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key->data; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length; + + msg.root= root; + msg.value= new_root; + msg.auto_increment= 0; + key_value= key->data; + if (share->base.auto_key == ((uint) keyinfo->key_nr + 1)) + { + const HA_KEYSEG *keyseg= keyinfo->seg; + uchar reversed[MARIA_MAX_KEY_BUFF]; + if (keyseg->flag & HA_SWAP_KEY) + { + /* We put key from log record to "data record" packing format... */ + const uchar *key_ptr= key->data, *key_end= key->data + keyseg->length; + uchar *to= reversed + keyseg->length; + do + { + *--to= *key_ptr++; + } while (key_ptr != key_end); + key_value= to; + } + /* ... so that we can read it with: */ + msg.auto_increment= + ma_retrieve_auto_increment(key_value, keyseg->type); + /* and write_hook_for_undo_key_insert() will pick this. 
*/ + } + + return translog_write_record(res_lsn, LOGREC_UNDO_KEY_INSERT, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + 0].length + + key_length, + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data + LSN_STORE_SIZE, &msg) ? -1 : 0; +} + + +/** + @brief Log creation of new page + + @note + We don't have to store the page_length into the log entry as we can + calculate this from the length of the log entry + + @retval 1 error + @retval 0 ok +*/ + +my_bool _ma_log_new(MARIA_PAGE *ma_page, my_bool root_page) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + +1]; + uint page_length; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + MARIA_HA *info= ma_page->info; + MARIA_SHARE *share= info->s; + my_off_t page= ma_page->pos / share->block_size; + DBUG_ENTER("_ma_log_new"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + DBUG_ASSERT(share->now_transactional); + + /* Store address of new root page */ + page_store(log_data + FILEID_STORE_SIZE, page); + + /* Store link to next unused page */ + if (info->key_del_used == 2) + page= 0; /* key_del not changed */ + else + page= ((share->key_del_current == HA_OFFSET_ERROR) ? 
IMPOSSIBLE_PAGE_NO : + share->key_del_current / share->block_size); + + page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page); + key_nr_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE*2, + ma_page->keyinfo->key_nr); + log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE*2 + KEY_NR_STORE_SIZE]= + (uchar) root_page; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + page_length= ma_page->size - LSN_STORE_SIZE; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= ma_page->buff + LSN_STORE_SIZE; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= page_length; + + /* Remember new page length for future log entries for same page */ + ma_page->org_size= ma_page->size; + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX_NEW_PAGE, + info->trn, info, + (translog_size_t) + (sizeof(log_data) + page_length), + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/** + @brief + Log when some part of the key page changes + + @param ma_page Key page that was changed + @param key_pos Start of the changed area inside ma_page->buff + @param length Number of changed bytes at key_pos + @param debug_marker Only logged if EXTRA_DEBUG_KEY_CHANGES is defined + + @note + The page length must not have changed (org_size == size is asserted) + + @retval 0 ok + @retval 1 error +*/ + +my_bool _ma_log_change(MARIA_PAGE *ma_page, const uchar *key_pos, uint length, + enum en_key_debug debug_marker __attribute__((unused))) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 6 + 7], *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + uint offset= (uint) (key_pos - ma_page->buff), translog_parts; + MARIA_HA *info= ma_page->info; + my_off_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_change"); + DBUG_PRINT("enter", ("page: %lu length: %u", (ulong) page, length)); + + DBUG_ASSERT(info->s->now_transactional); + DBUG_ASSERT(offset + length <= ma_page->size); + DBUG_ASSERT(ma_page->org_size == ma_page->size); + + /* Store page number of the changed page */ + page= ma_page->pos / info->s->block_size; + page_store(log_data + FILEID_STORE_SIZE, page); + log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + (*log_pos++)= KEY_OP_DEBUG; + 
(*log_pos++)= debug_marker; +#endif + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos[3]= KEY_OP_CHANGE; + int2store(log_pos+4, length); + log_pos+= 6; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (log_pos - log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length; + translog_parts= 2; + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &length, &translog_parts); + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) (log_pos - log_data) + length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/** + @brief Write log entry for page splitting + + @fn _ma_log_split() + @param + ma_page Page that is changed + org_length Original length of page. Can be bigger than block_size + for block that overflowed + new_length New length of page + key_pos Where key is inserted on page (may be 0 if no key) + key_length Number of bytes changed at key_pos + move_length Number of bytes moved at key_pos to make room for key + prefix_or_suffix KEY_OP_NONE Ignored + KEY_OP_ADD_PREFIX Add data to start of page + KEY_OP_ADD_SUFFIX Add data to end of page + data What data was added + data_length Number of bytes added first or last + changed_length Number of bytes changed first or last. 
+ + @note + Write log entry for page that has got a key added to the page under + one and only one of the following senarios: + - Page is shortened from end + - Data is added to end of page + - Data added at front of page +*/ + +static my_bool _ma_log_split(MARIA_PAGE *ma_page, + uint org_length, uint new_length, + const uchar *key_pos, uint key_length, + int move_length, enum en_key_op prefix_or_suffix, + const uchar *data, uint data_length, + uint changed_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 3+3+3+3+3+2 +7]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6]; + uint offset= (uint) (key_pos - ma_page->buff); + uint translog_parts, extra_length; + MARIA_HA *info= ma_page->info; + my_off_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_split"); + DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u", + (ulong) page, org_length, new_length)); + + DBUG_ASSERT(changed_length >= data_length); + DBUG_ASSERT(org_length <= info->s->max_index_block_size); + DBUG_ASSERT(new_length == ma_page->size); + DBUG_ASSERT(org_length == ma_page->org_size); + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + (*log_pos++)= KEY_OP_DEBUG; + (*log_pos++)= KEY_OP_DEBUG_LOG_SPLIT; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= _ma_get_keypage_flag(info->s, ma_page->buff); + + if (new_length <= offset || !key_pos) + { + /* + Page was split before inserted key. 
Write redo entry where + we just cut current page at page_length + */ + uint length_offset= org_length - new_length; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, length_offset); + log_pos+= 3; + translog_parts= 1; + extra_length= 0; + DBUG_ASSERT(data_length == 0); + } + else + { + /* Key was added to page which was split after the inserted key */ + uint max_key_length; + + /* + Handle case when split happened directly after the newly inserted key. + */ + max_key_length= new_length - offset; + extra_length= MY_MIN(key_length, max_key_length); + if (offset + move_length > new_length) + { + /* This is true when move_length includes changes for next packed key */ + move_length= new_length - offset; + } + + if ((int) new_length < (int) (org_length + move_length + data_length)) + { + /* Shorten page */ + uint diff= org_length + move_length + data_length - new_length; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos + 1, diff); + log_pos+= 3; + DBUG_ASSERT(data_length == 0); /* Page is shortened */ + DBUG_ASSERT(offset <= org_length - diff); + } + else + { + DBUG_ASSERT(new_length == org_length + move_length + data_length); + DBUG_ASSERT(offset <= org_length); + } + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + + if (move_length) + { + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + } + + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, extra_length); + log_pos+= 3; + + /* Point to original inserted key data */ + if (prefix_or_suffix == KEY_OP_ADD_PREFIX) + key_pos+= data_length; + + translog_parts= 2; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extra_length; + } + + if (data_length) + { + /* Add prefix or suffix: operation code followed by 2 byte data_length */ + log_pos[0]= prefix_or_suffix; + int2store(log_pos+1, data_length); + log_pos+= 3; + if (prefix_or_suffix == KEY_OP_ADD_PREFIX) + { + /* + log_pos was already moved past data_length, so changed_length goes + at log_pos itself. This gives the same KEY_OP_ADD_PREFIX layout as + _ma_log_key_middle() writes (2 bytes added, 2 bytes changed). + */ + int2store(log_pos, changed_length); + log_pos+= 2; + data_length= changed_length; + 
} + log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].str= data; + log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].length= data_length; + translog_parts++; + extra_length+= data_length; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entries for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief + Write log entry for page that has got a key added to the page + and page is shortened from start of page + + @fn _ma_log_del_prefix() + @param ma_page Page that is changed (handler, page position and + page buffer are taken from this) + @param org_length Length of buffer when read + @param new_length Final length + @param key_pos Where on page buffer key was added. 
This is position + before prefix was removed + @param key_length How many bytes was changed at 'key_pos' + @param move_length How many bytes was moved up when key was added + + @return + @retval 0 ok + @retval 1 error +*/ + +static my_bool _ma_log_del_prefix(MARIA_PAGE *ma_page, + uint org_length, uint new_length, + const uchar *key_pos, uint key_length, + int move_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 12 + 7]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + uint offset= (uint) (key_pos - ma_page->buff); + uint diff_length= org_length + move_length - new_length; + uint translog_parts, extra_length; + MARIA_HA *info= ma_page->info; + my_off_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_del_prefix"); + DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u", + (ulong) page, org_length, new_length)); + + DBUG_ASSERT((int) diff_length > 0); + DBUG_ASSERT(ma_page->org_size == org_length); + DBUG_ASSERT(ma_page->size == new_length); + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + translog_parts= 1; + extra_length= 0; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + *log_pos++= KEY_OP_DEBUG; + *log_pos++= KEY_OP_DEBUG_LOG_DEL_PREFIX; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= _ma_get_keypage_flag(info->s, ma_page->buff); + + if (offset < diff_length + info->s->keypage_header) + { + /* + Key is not anymore on page. 
Move data down, but take into account that + the original page had grown with 'move_length bytes' + */ + DBUG_ASSERT(offset + key_length <= diff_length + info->s->keypage_header); + + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, diff_length - move_length); + log_pos+= 3; + } + else + { + /* + Correct position to key, as data before key has been delete and key + has thus been moved down + */ + offset-= diff_length; + key_pos-= diff_length; + + /* Move data down */ + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, diff_length); + log_pos+= 3; + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + + if (move_length) + { + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + } + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, key_length); + log_pos+= 3; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length; + translog_parts= 2; + extra_length= key_length; + } + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief + Write log entry for page that has got data added first and + data deleted last. 
Old changed key may be part of page +*/ + +static my_bool _ma_log_key_middle(MARIA_PAGE *ma_page, + uint new_length, + uint data_added_first, + uint data_changed_first, + uint data_deleted_last, + const uchar *key_pos, + uint key_length, int move_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 3+5+3+3+3 + 7]; + uchar *log_pos; + LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6]; + uint key_offset; + uint translog_parts, extra_length; + MARIA_HA *info= ma_page->info; + my_off_t page= ma_page->pos / info->s->block_size; + DBUG_ENTER("_ma_log_key_middle"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + DBUG_ASSERT(ma_page->size == new_length); + + /* new place of key after changes */ + key_pos+= data_added_first; + key_offset= (uint) (key_pos - ma_page->buff); + if (key_offset < new_length) + { + /* key is on page; Calculate how much of the key is there */ + uint max_key_length= new_length - key_offset; + if (max_key_length < key_length) + { + /* Key is last on page */ + key_length= max_key_length; + move_length= 0; + } + /* + Take into account that new data was added as part of original key + that also needs to be removed from page + */ + data_deleted_last+= move_length; + } + + /* First log changes to page */ + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + +#ifdef EXTRA_DEBUG_KEY_CHANGES + *log_pos++= KEY_OP_DEBUG; + *log_pos++= KEY_OP_DEBUG_LOG_MIDDLE; +#endif + + /* Store keypage_flag */ + *log_pos++= KEY_OP_SET_PAGEFLAG; + *log_pos++= _ma_get_keypage_flag(info->s, ma_page->buff); + + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, data_deleted_last); + log_pos+= 3; + + log_pos[0]= KEY_OP_ADD_PREFIX; + int2store(log_pos+1, data_added_first); + int2store(log_pos+3, data_changed_first); + log_pos+= 5; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + 
log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (ma_page->buff + + info->s->keypage_header); + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first; + translog_parts= 2; + extra_length= data_changed_first; + + /* If changed key is on page, log those changes too */ + + if (key_offset < new_length) + { + uchar *start_log_pos= log_pos; + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, key_offset); + log_pos+= 3; + if (move_length) + { + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + } + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, key_length); + log_pos+= 3; + + log_array[TRANSLOG_INTERNAL_PARTS + 2].str= start_log_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 2].length= (uint) (log_pos - + start_log_pos); + + log_array[TRANSLOG_INTERNAL_PARTS + 3].str= key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 3].length= key_length; + translog_parts+=2; + extra_length+= (uint) (log_array[TRANSLOG_INTERNAL_PARTS + 2].length + + key_length); + } + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + (log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length), + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +#ifdef NOT_NEEDED + +/** + @brief + Write log entry for page that has got data added first and + data deleted last +*/ + +static my_bool _ma_log_middle(MARIA_PAGE *ma_page, + uint data_added_first, uint data_changed_first, + uint data_deleted_last) +{ + LSN lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 3 + 5 + 7], *log_pos; + MARIA_HA *info= ma_page->info; + my_off_t page= ma_page->page / info->s->block_size; + uint translog_parts, 
extra_length; + DBUG_ENTER("_ma_log_middle"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + DBUG_ASSERT(ma_page->org_size + data_added_first - data_deleted_last == + ma_page->size); + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, data_deleted_last); + log_pos+= 3; + + log_pos[0]= KEY_OP_ADD_PREFIX; + int2store(log_pos+1, data_added_first); + int2store(log_pos+3, data_changed_first); + log_pos+= 5; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= ((char*) buff + + info->s->keypage_header); + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first; + translog_parts= 2; + extra_length= data_changed_first; + + _ma_log_key_changes(ma_page, + log_array + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_pos, &extra_length, &translog_parts); + /* Remember new page length for future log entires for same page */ + ma_page->org_size= ma_page->size; + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + (translog_size_t) + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} +#endif diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h new file mode 100644 index 00000000..91e1b10a --- /dev/null +++ b/storage/maria/maria_def.h @@ -0,0 +1,1765 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + Copyright (c) 2009, 2020, MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* This file is included by all internal maria files */ + +#ifndef MARIA_DEF_INCLUDED +#define MARIA_DEF_INCLUDED + +#include <my_global.h> + +#ifdef EMBEDDED_LIBRARY +#undef WITH_S3_STORAGE_ENGINE +#endif + +#include "maria.h" /* Structs & some defines */ +#include "ma_pagecache.h" +#include <myisampack.h> /* packing of keys */ +#include <my_tree.h> +#include <my_bitmap.h> +#include <my_pthread.h> +#include <thr_lock.h> +#include <hash.h> +#include "ma_loghandler.h" +#include "ma_control_file.h" +#include "ma_state.h" +#include <waiting_threads.h> +#include <mysql/psi/mysql_file.h> + +#define MARIA_CANNOT_ROLLBACK + +C_MODE_START + +/* + Limit max keys according to HA_MAX_POSSIBLE_KEY; See myisamchk.h for details +*/ + +#if MAX_INDEXES > HA_MAX_POSSIBLE_KEY +#define MARIA_MAX_KEY HA_MAX_POSSIBLE_KEY /* Max allowed keys */ +#else +#define MARIA_MAX_KEY MAX_INDEXES /* Max allowed keys */ +#endif + +#define MARIA_NAME_IEXT ".MAI" +#define MARIA_NAME_DEXT ".MAD" +/* Max extra space to use when sorting keys */ +#define MARIA_MAX_TEMP_LENGTH (2*1024L*1024L*1024L) +/* Possible values for maria_block_size (must be power of 2) */ +#define MARIA_KEY_BLOCK_LENGTH 8192 /* default key block length */ +#define MARIA_MIN_KEY_BLOCK_LENGTH 1024 /* Min key block length */ +#define MARIA_MAX_KEY_BLOCK_LENGTH 32768 +/* Minimal page cache when we only want to be able to scan a table */ +#define MARIA_MIN_PAGE_CACHE_SIZE (8192L*16L) + +/* + In the following macros '_keyno_' is 0 .. keys-1. 
+ If there can be more keys than bits in the key_map, the highest bit + is for all upper keys. They cannot be switched individually. + This means that clearing of high keys is ignored, setting one high key + sets all high keys. +*/ +#define MARIA_KEYMAP_BITS (8 * SIZEOF_LONG_LONG) +#define MARIA_KEYMAP_HIGH_MASK (1ULL << (MARIA_KEYMAP_BITS - 1)) +#define maria_get_mask_all_keys_active(_keys_) \ + (((_keys_) < MARIA_KEYMAP_BITS) ? \ + ((1ULL << (_keys_)) - 1ULL) : \ + (~ 0ULL)) +#if MARIA_MAX_KEY > MARIA_KEYMAP_BITS +#define maria_is_key_active(_keymap_,_keyno_) \ + (((_keyno_) < MARIA_KEYMAP_BITS) ? \ + MY_TEST((_keymap_) & (1ULL << (_keyno_))) : \ + MY_TEST((_keymap_) & MARIA_KEYMAP_HIGH_MASK)) +#define maria_set_key_active(_keymap_,_keyno_) \ + (_keymap_)|= (((_keyno_) < MARIA_KEYMAP_BITS) ? \ + (1ULL << (_keyno_)) : \ + MARIA_KEYMAP_HIGH_MASK) +#define maria_clear_key_active(_keymap_,_keyno_) \ + (_keymap_)&= (((_keyno_) < MARIA_KEYMAP_BITS) ? \ + (~ (1ULL << (_keyno_))) : \ + (~ (0ULL)) /*ignore*/ ) +#else +#define maria_is_key_active(_keymap_,_keyno_) \ + MY_TEST((_keymap_) & (1ULL << (_keyno_))) +#define maria_set_key_active(_keymap_,_keyno_) \ + (_keymap_)|= (1ULL << (_keyno_)) +#define maria_clear_key_active(_keymap_,_keyno_) \ + (_keymap_)&= (~ (1ULL << (_keyno_))) +#endif +#define maria_is_any_key_active(_keymap_) \ + MY_TEST((_keymap_)) +#define maria_is_all_keys_active(_keymap_,_keys_) \ + ((_keymap_) == maria_get_mask_all_keys_active(_keys_)) +#define maria_set_all_keys_active(_keymap_,_keys_) \ + (_keymap_)= maria_get_mask_all_keys_active(_keys_) +#define maria_clear_all_keys_active(_keymap_) \ + (_keymap_)= 0 +#define maria_intersect_keys_active(_to_,_from_) \ + (_to_)&= (_from_) +#define maria_is_any_intersect_keys_active(_keymap1_,_keys_,_keymap2_) \ + ((_keymap1_) & (_keymap2_) & \ + maria_get_mask_all_keys_active(_keys_)) +#define maria_copy_keys_active(_to_,_maxkeys_,_from_) \ + (_to_)= (maria_get_mask_all_keys_active(_maxkeys_) & \ + (_from_)) 
+ + /* Param to/from maria_info */ + +typedef struct st_maria_info +{ + ha_rows records; /* Records in database */ + ha_rows deleted; /* Deleted records in database */ + MARIA_RECORD_POS recpos; /* Pos for last used record */ + MARIA_RECORD_POS newrecpos; /* Pos if we write new record */ + MARIA_RECORD_POS dup_key_pos; /* Position to record with dup key */ + my_off_t data_file_length; /* Length of data file */ + my_off_t max_data_file_length, index_file_length; + my_off_t max_index_file_length, delete_length; + ulonglong auto_increment; + ulonglong key_map; /* Which keys are used */ + time_t create_time; /* When table was created */ + time_t check_time; + time_t update_time; + ulong record_offset; + double *rec_per_key; /* for sql optimizing */ + ulong reclength; /* Recordlength */ + ulong mean_reclength; /* Mean recordlength (if packed) */ + char *data_file_name, *index_file_name; + enum data_file_type data_file_type; + uint keys; /* Number of keys in use */ + uint options; /* HA_OPTION_... 
used */ + uint reflength; + int errkey, /* Which key was duplicated on err */ + sortkey; /* clustered by this key */ + File filenr; /* (uniq) filenr for datafile */ +} MARIA_INFO; + +struct st_maria_share; +struct st_maria_handler; /* For reference */ +struct st_maria_keydef; + +struct st_maria_key /* Internal info about a key */ +{ + uchar *data; /* Data for key */ + struct st_maria_keydef *keyinfo; /* Definition for key */ + uint data_length; /* Length of key data */ + uint ref_length; /* record ref + transid */ + uint32 flag; /* 0 or SEARCH_PART_KEY */ +}; + +struct st_maria_decode_tree /* Decode huff-table */ +{ + uint16 *table; + uint quick_table_bits; + uchar *intervalls; +}; + + +typedef struct s3_info S3_INFO; + +extern ulong maria_block_size, maria_checkpoint_frequency; +extern ulong maria_concurrent_insert; +extern my_bool maria_flush, maria_single_user, maria_page_checksums; +extern my_off_t maria_max_temp_length; +extern ulong maria_bulk_insert_tree_size, maria_data_pointer_size; +extern MY_TMPDIR *maria_tmpdir; +extern my_bool maria_encrypt_tables; + +/* + This is used to check if a symlink points into the mysql data home, + which is normally forbidden as it can be used to get access to + not privileged data +*/ +extern int (*maria_test_invalid_symlink)(const char *filename); + + /* Prototypes for maria-functions */ + +extern int maria_init(void); +extern void maria_end(void); +extern my_bool maria_upgrade(void); +extern int maria_close(MARIA_HA *file); +extern int maria_delete(MARIA_HA *file, const uchar *buff); +extern MARIA_HA *maria_open(const char *name, int mode, + uint wait_if_locked, S3_INFO *s3); +extern int maria_panic(enum ha_panic_function function); +extern int maria_rfirst(MARIA_HA *file, uchar *buf, int inx); +extern int maria_rkey(MARIA_HA *file, uchar *buf, int inx, + const uchar *key, key_part_map keypart_map, + enum ha_rkey_function search_flag); +extern int maria_rlast(MARIA_HA *file, uchar *buf, int inx); +extern int 
maria_rnext(MARIA_HA *file, uchar *buf, int inx); +extern int maria_rnext_same(MARIA_HA *info, uchar *buf); +extern int maria_rprev(MARIA_HA *file, uchar *buf, int inx); +extern int maria_rrnd(MARIA_HA *file, uchar *buf, + MARIA_RECORD_POS pos); +extern int maria_scan_init(MARIA_HA *file); +extern int maria_scan(MARIA_HA *file, uchar *buf); +extern void maria_scan_end(MARIA_HA *file); +extern int maria_rsame(MARIA_HA *file, uchar *record, int inx); +extern int maria_rsame_with_pos(MARIA_HA *file, uchar *record, + int inx, MARIA_RECORD_POS pos); +extern int maria_update(MARIA_HA *file, const uchar *old, + const uchar *new_record); +extern int maria_write(MARIA_HA *file, const uchar *buff); +extern MARIA_RECORD_POS maria_position(MARIA_HA *file); +extern int maria_status(MARIA_HA *info, MARIA_INFO *x, uint flag); +extern int maria_lock_database(MARIA_HA *file, int lock_type); +extern int maria_delete_table(const char *name); +extern int maria_rename(const char *from, const char *to); +extern int maria_extra(MARIA_HA *file, + enum ha_extra_function function, void *extra_arg); +extern int maria_reset(MARIA_HA *file); +extern ha_rows maria_records_in_range(MARIA_HA *info, int inx, + const key_range *min_key, + const key_range *max_key, + page_range *page); +extern int maria_is_changed(MARIA_HA *info); +extern int maria_delete_all_rows(MARIA_HA *info); +extern uint maria_get_pointer_length(ulonglong file_length, uint def); +extern int maria_commit(MARIA_HA *info); +extern int maria_begin(MARIA_HA *info); +extern void maria_disable_logging(MARIA_HA *info); +extern void maria_enable_logging(MARIA_HA *info); + +#define HA_RECOVER_NONE 0 /* No automatic recover */ +#define HA_RECOVER_DEFAULT 1 /* Automatic recover active */ +#define HA_RECOVER_BACKUP 2 /* Make a backupfile on recover */ +#define HA_RECOVER_FORCE 4 /* Recover even if we loose rows */ +#define HA_RECOVER_QUICK 8 /* Don't check rows in data file */ + +#define HA_RECOVER_ANY (HA_RECOVER_DEFAULT | 
HA_RECOVER_BACKUP | HA_RECOVER_FORCE | HA_RECOVER_QUICK) + +/* this is used to pass to mysql_mariachk_table */ + +#define MARIA_CHK_REPAIR 1 /* equivalent to mariachk -r */ +#define MARIA_CHK_VERIFY 2 /* Verify, run repair if failure */ + +typedef uint maria_bit_type; + +typedef struct st_maria_bit_buff +{ /* Used for packing of record */ + maria_bit_type current_byte; + uint bits; + uchar *pos, *end, *blob_pos, *blob_end; + uint error; +} MARIA_BIT_BUFF; + +/* functions in maria_check */ +void maria_chk_init(HA_CHECK *param); +void maria_chk_init_for_check(HA_CHECK *param, MARIA_HA *info); +int maria_chk_status(HA_CHECK *param, MARIA_HA *info); +int maria_chk_del(HA_CHECK *param, MARIA_HA *info, ulonglong test_flag); +int maria_chk_size(HA_CHECK *param, MARIA_HA *info); +int maria_chk_key(HA_CHECK *param, MARIA_HA *info); +int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info, my_bool extend); +int maria_repair(HA_CHECK *param, MARIA_HA *info, char * name, my_bool); +int maria_sort_index(HA_CHECK *param, MARIA_HA *info, char * name); +int maria_zerofill(HA_CHECK *param, MARIA_HA *info, const char *name); +int maria_repair_by_sort(HA_CHECK *param, MARIA_HA *info, + const char *name, my_bool rep_quick); +int maria_repair_parallel(HA_CHECK *param, MARIA_HA *info, + const char *name, my_bool rep_quick); +int maria_change_to_newfile(const char *filename, const char *old_ext, + const char *new_ext, time_t backup_time, + myf myflags); +void maria_lock_memory(HA_CHECK *param); +int maria_update_state_info(HA_CHECK *param, MARIA_HA *info, uint update); +void maria_update_key_parts(MARIA_KEYDEF *keyinfo, double *rec_per_key_part, + ulonglong *unique, ulonglong *notnull, + ulonglong records); +int maria_filecopy(HA_CHECK *param, File to, File from, my_off_t start, + my_off_t length, const char *type); +int maria_movepoint(MARIA_HA *info, uchar *record, my_off_t oldpos, + my_off_t newpos, uint prot_key); +int maria_test_if_almost_full(MARIA_HA *info); +int 
maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename); +int maria_disable_indexes(MARIA_HA *info); +int maria_enable_indexes(MARIA_HA *info); +int maria_indexes_are_disabled(MARIA_HA *info); +void maria_disable_indexes_for_rebuild(MARIA_HA *info, ha_rows rows, + my_bool all_keys); +my_bool maria_test_if_sort_rep(MARIA_HA *info, ha_rows rows, ulonglong key_map, + my_bool force); + +int maria_init_bulk_insert(MARIA_HA *info, size_t cache_size, ha_rows rows); +void maria_flush_bulk_insert(MARIA_HA *info, uint inx); +int maria_end_bulk_insert(MARIA_HA *info, my_bool abort); +int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves); +void maria_ignore_trids(MARIA_HA *info); +my_bool maria_too_big_key_for_sort(MARIA_KEYDEF *key, ha_rows rows); + +/* fulltext functions */ +FT_INFO *maria_ft_init_search(uint,void *, uint, uchar *, size_t, + CHARSET_INFO *, uchar *); + +/* 'Almost-internal' Maria functions */ + +void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info, + my_bool repair); + + +/* Do extra sanity checking */ +#define SANITY_CHECKS 1 +#ifdef EXTRA_DEBUG +#define EXTRA_DEBUG_KEY_CHANGES +#endif +/* + The following defines can be used when one has problems with redo logging + Setting this will log the full key page which can be compared with the + redo-changed key page. This will however make the aria log files MUCH bigger. 
+*/ +#if defined(EXTRA_ARIA_DEBUG) +#define EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES +#endif +/* For testing recovery */ +#ifdef TO_BE_REMOVED +#define IDENTICAL_PAGES_AFTER_RECOVERY 1 +#endif + +#define MAX_NONMAPPED_INSERTS 1000 +#define MARIA_MAX_TREE_LEVELS 32 +#define MARIA_MAX_RECORD_ON_STACK 16384 + +#define MARIA_MIN_SORT_MEMORY (16384-MALLOC_OVERHEAD) + +/* maria_open() flag, specific for maria_pack */ +#define HA_OPEN_IGNORE_MOVED_STATE (1U << 30) + +typedef struct st_sort_key_blocks MA_SORT_KEY_BLOCKS; +typedef struct st_sort_ftbuf MA_SORT_FT_BUF; + +extern PAGECACHE maria_pagecache_var, *maria_pagecache; +int maria_assign_to_pagecache(MARIA_HA *info, ulonglong key_map, + PAGECACHE *key_cache); +void maria_change_pagecache(PAGECACHE *old_key_cache, + PAGECACHE *new_key_cache); + +typedef struct st_maria_sort_info +{ + /* sync things */ + mysql_mutex_t mutex; + mysql_cond_t cond; + MARIA_HA *info, *new_info; + HA_CHECK *param; + char *buff; + MA_SORT_KEY_BLOCKS *key_block, *key_block_end; + MA_SORT_FT_BUF *ft_buf; + my_off_t filelength, dupp, buff_length; + pgcache_page_no_t page; + ha_rows max_records; + uint current_key, total_keys; + volatile uint got_error; + uint threads_running; + myf myf_rw; + enum data_file_type new_data_file_type, org_data_file_type; +} MARIA_SORT_INFO; + +typedef struct st_maria_sort_param +{ + pthread_t thr; + IO_CACHE read_cache, tempfile, tempfile_for_exceptions; + DYNAMIC_ARRAY buffpek; + MARIA_BIT_BUFF bit_buff; /* For parallel repair of packrec. */ + + MARIA_KEYDEF *keyinfo; + MARIA_SORT_INFO *sort_info; + HA_KEYSEG *seg; + uchar **sort_keys; + uchar *rec_buff; + void *wordlist, *wordptr; + MEM_ROOT wordroot; + uchar *record; + MY_TMPDIR *tmpdir; + + /* + The next two are used to collect statistics, see maria_update_key_parts for + description. 
+ */ + ulonglong unique[HA_MAX_KEY_SEG+1]; + ulonglong notnull[HA_MAX_KEY_SEG+1]; + ulonglong sortbuff_size; + + MARIA_RECORD_POS pos,max_pos,filepos,start_recpos, current_filepos; + uint key, key_length,real_key_length; + uint maxbuffers, keys, find_length, sort_keys_length; + my_bool fix_datafile, master; + my_bool calc_checksum; /* calculate table checksum */ + size_t rec_buff_size; + + int (*key_cmp)(struct st_maria_sort_param *, const void *, const void *); + int (*key_read)(struct st_maria_sort_param *, uchar *); + int (*key_write)(struct st_maria_sort_param *, const uchar *); + void (*lock_in_memory)(HA_CHECK *); + int (*write_keys)(struct st_maria_sort_param *, uchar **, + ulonglong , struct st_buffpek *, IO_CACHE *); + my_off_t (*read_to_buffer)(IO_CACHE *,struct st_buffpek *, uint); + int (*write_key)(struct st_maria_sort_param *, IO_CACHE *,uchar *, + uint, ulonglong); +} MARIA_SORT_PARAM; + +int maria_write_data_suffix(MARIA_SORT_INFO *sort_info, my_bool fix_datafile); + +struct st_transaction; + +/* undef map from my_nosys; We need test-if-disk full */ +#undef my_write + +#define CRC_SIZE 4 + +typedef struct st_maria_state_info +{ + struct + { /* Fileheader (24 bytes) */ + uchar file_version[4]; + uchar options[2]; + uchar header_length[2]; + uchar state_info_length[2]; + uchar base_info_length[2]; + uchar base_pos[2]; + uchar key_parts[2]; /* Key parts */ + uchar unique_key_parts[2]; /* Key parts + unique parts */ + uchar keys; /* number of keys in file */ + uchar uniques; /* number of UNIQUE definitions */ + uchar not_used; /* Language for indexes */ + uchar fulltext_keys; + uchar data_file_type; + /* Used by mariapack to store the original data_file_type */ + uchar org_data_file_type; + } header; + + MARIA_STATUS_INFO state; + /* maria_ha->state points here for crash-safe but not versioned tables */ + MARIA_STATUS_INFO common; + /* State for a versioned table that is temporary non versioned */ + MARIA_STATUS_INFO no_logging; + ha_rows split; /* 
number of split blocks */ + my_off_t dellink; /* Link to next removed block */ + pgcache_page_no_t first_bitmap_with_space; + ulonglong auto_increment; + TrID create_trid; /* Minimum trid for file */ + TrID last_change_trn; /* self-descriptive */ + ulong update_count; /* Updated for each write lock */ + ulong status; + double *rec_per_key_part; + ulong *nulls_per_key_part; + ha_checksum checksum; /* Table checksum */ + my_off_t *key_root; /* Start of key trees */ + my_off_t key_del; /* delete links for index pages */ + my_off_t records_at_analyze; /* Rows when calculating rec_per_key */ + + ulong sec_index_changed; /* Updated when new sec_index */ + ulong sec_index_used; /* which extra indexes are in use */ + ulonglong key_map; /* Which keys are in use */ + ulong version; /* timestamp of create */ + time_t create_time; /* Time when created database */ + time_t recover_time; /* Time for last recover */ + time_t check_time; /* Time for last check */ + uint sortkey; /* sorted by this key (not used) */ + uint open_count; + uint changed; /* Changed since maria_chk */ + uint org_changed; /* Changed since open */ + /** + Birthday of the table: no record in the log before this LSN should ever + be applied to the table. Updated when created, renamed, explicitly + repaired (REPAIR|OPTIMIZE TABLE, ALTER TABLE ENABLE KEYS, maria_chk). + */ + LSN create_rename_lsn; + /** @brief Log horizon when state was last updated on disk */ + TRANSLOG_ADDRESS is_of_horizon; + /** + REDO phase should ignore any record before this LSN. UNDO phase + shouldn't, this is the difference with create_rename_lsn. + skip_redo_lsn >= create_rename_lsn. 
+ The distinction is for these cases: + - after a repair at end of bulk insert (enabling indices), REDO phase + should skip the table but UNDO phase should not, so only skip_redo_lsn is + increased, not create_rename_lsn + - if one table is corrupted and so recovery fails, user may repair the + table with maria_chk and let recovery restart: that recovery should then + skip the repaired table even in the UNDO phase, so create_rename_lsn is + increased. + */ + LSN skip_redo_lsn; + /* LSN when we wrote file id to the log */ + LSN logrec_file_id; + + uint8 dupp_key; /* Lastly processed index with */ + /* violated uniqueness constraint */ + + /* the following isn't saved on disk */ + uint state_diff_length; /* Should be 0 */ + uint state_length; /* Length of state header in file */ + ulong *key_info; +} MARIA_STATE_INFO; + + +/* Number of bytes written by _ma_state_info_write_sub() */ +#define MARIA_STATE_INFO_SIZE \ + (24 + 2 + LSN_STORE_SIZE*3 + 4 + 11*8 + 4*4 + 8 + 3*4 + 5*8) +#define MARIA_FILE_OPEN_COUNT_OFFSET 0 +#define MARIA_FILE_CHANGED_OFFSET 2 +#define MARIA_FILE_CREATE_RENAME_LSN_OFFSET 4 +#define MARIA_FILE_CREATE_TRID_OFFSET (4 + LSN_STORE_SIZE*3 + 11*8) + +#define MARIA_MAX_KEY_LENGTH 2300 +#define MARIA_MAX_KEY_BUFF (MARIA_MAX_KEY_LENGTH+HA_MAX_KEY_SEG*6+8+8 + \ + MARIA_MAX_PACK_TRANSID_SIZE) +#define MARIA_MAX_POSSIBLE_KEY_BUFF (MARIA_MAX_KEY_LENGTH + 24+ 6+6) +#define MARIA_STATE_KEY_SIZE (8 + 4) +#define MARIA_STATE_KEYBLOCK_SIZE 8 +#define MARIA_STATE_KEYSEG_SIZE 12 +#define MARIA_STATE_EXTRA_SIZE (MARIA_MAX_KEY*MARIA_STATE_KEY_SIZE + MARIA_MAX_KEY*HA_MAX_KEY_SEG*MARIA_STATE_KEYSEG_SIZE) +#define MARIA_KEYDEF_SIZE (2+ 5*2) +#define MARIA_UNIQUEDEF_SIZE (2+1+1) +#define HA_KEYSEG_SIZE (6+ 2*2 + 4*2) +#define MARIA_COLUMNDEF_SIZE (2*7+1+1+4) +#define MARIA_BASE_INFO_SIZE (MY_UUID_SIZE + 5*8 + 6*4 + 11*2 + 6 + 5*2 + 1 + 16) +#define MARIA_INDEX_BLOCK_MARGIN 16 /* Safety margin for .MYI tables */ +#define MARIA_MAX_POINTER_LENGTH 7 /* Node pointer */ +/* 
Internal management bytes needed to store 2 transid/key on an index page */ +#define MARIA_MAX_PACK_TRANSID_SIZE (TRANSID_SIZE+1) +#define MARIA_TRANSID_PACK_OFFSET (256- TRANSID_SIZE - 1) +#define MARIA_MIN_TRANSID_PACK_OFFSET (MARIA_TRANSID_PACK_OFFSET-TRANSID_SIZE) +#define MARIA_INDEX_OVERHEAD_SIZE (MARIA_MAX_PACK_TRANSID_SIZE * 2 + \ + MARIA_MAX_POINTER_LENGTH) +#define MARIA_DELETE_KEY_NR 255 /* keynr for deleted blocks */ + + /* extra options */ +#define MA_EXTRA_OPTIONS_ENCRYPTED (1 << 0) +#define MA_EXTRA_OPTIONS_INSERT_ORDER (1 << 1) + +#include "ma_check.h" + +/* + Basic information of the Maria table. This is stored on disk + and not changed (unless we do DDL changes). +*/ + +typedef struct st_ma_base_info +{ + my_off_t keystart; /* Start of keys */ + my_off_t max_data_file_length; + my_off_t max_key_file_length; + my_off_t margin_key_file_length; + ha_rows records, reloc; /* Create information */ + ulong mean_row_length; /* Create information */ + ulong reclength; /* length of unpacked record */ + ulong pack_reclength; /* Length of full packed rec */ + ulong min_pack_length; + ulong max_pack_length; /* Max possible length of packed rec */ + ulong min_block_length; + ulong s3_block_size; /* Block length for S3 files */ + uint fields; /* fields in table */ + uint fixed_not_null_fields; + uint fixed_not_null_fields_length; + uint max_field_lengths; + uint pack_fields; /* packed fields in table */ + uint varlength_fields; /* char/varchar/blobs */ + /* Number of bytes in the index used to refer to a row (2-8) */ + uint rec_reflength; + /* Number of bytes in the index used to refer to another index page (2-8) */ + uint key_reflength; /* = 2-8 */ + uint keys; /* same as in state.header */ + uint auto_key; /* Which key-1 is an auto key */ + uint blobs; /* Number of blobs */ + /* Length of packed bits (when table was created first time) */ + uint pack_bytes; + /* Length of null bits (when table was created first time) */ + uint original_null_bytes; + uint 
null_bytes; /* Null bytes in record */ + uint field_offsets; /* Number of field offsets */ + uint max_key_block_length; /* Max block length */ + uint max_key_length; /* Max key length */ + /* Extra allocation when using dynamic record format */ + uint extra_alloc_bytes; + uint extra_alloc_procent; + uint is_nulls_extended; /* 1 if new null bytes */ + uint default_row_flag; /* 0 or ROW_FLAG_NULLS_EXTENDED */ + uint block_size; + /* Size of initial record buffer */ + uint default_rec_buff_size; + /* Extra number of bytes the row format require in the record buffer */ + uint extra_rec_buff_size; + /* Tuning flags that can be ignored by older Maria versions */ + uint extra_options; + /* default language, not really used but displayed by maria_chk */ + uint language; + /* Compression library used. 0 for no compression */ + uint compression_algorithm; + + /* The following are from the header */ + uint key_parts, all_key_parts; + uchar uuid[MY_UUID_SIZE]; + /** + @brief If false, we disable logging, versioning, transaction etc. 
Observe + difference with MARIA_SHARE::now_transactional + */ + my_bool born_transactional; +} MARIA_BASE_INFO; + +uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base); + +/* Structs used intern in database */ + +typedef struct st_maria_blob /* Info of record */ +{ + ulong offset; /* Offset to blob in record */ + uint pack_length; /* Type of packed length */ + ulong length; /* Calc:ed for each record */ +} MARIA_BLOB; + + +typedef struct st_maria_pack +{ + ulong header_length; + uint ref_length; + uchar version; +} MARIA_PACK; + +typedef struct st_maria_file_bitmap +{ + struct st_maria_share *share; + uchar *map; + pgcache_page_no_t page; /* Page number for current bitmap */ + pgcache_page_no_t last_bitmap_page; /* Last possible bitmap page */ + my_bool changed; /* 1 if page needs to be written */ + my_bool changed_not_flushed; /* 1 if some bitmap is not flushed */ + my_bool return_first_match; /* Shortcut find_head() */ + uint used_size; /* Size of bitmap head that is not 0 */ + uint full_head_size; /* Where to start search for head */ + uint full_tail_size; /* Where to start search for tail */ + uint flush_all_requested; /**< If _ma_bitmap_flush_all waiting */ + uint waiting_for_flush_all_requested; /* If someone is waiting for above */ + uint non_flushable; /**< 0 if bitmap and log are in sync */ + uint waiting_for_non_flushable; /* If someone is waiting for above */ + PAGECACHE_FILE file; /* datafile where bitmap is stored */ + + mysql_mutex_t bitmap_lock; + mysql_cond_t bitmap_cond; /**< When bitmap becomes flushable */ + /* Constants, allocated when initiating bitmaps */ + uint sizes[8]; /* Size per bit combination */ + uint total_size; /* Total usable size of bitmap page */ + uint max_total_size; /* Max value for total_size */ + uint last_total_size; /* Size of bitmap on last_bitmap_page */ + uint block_size; /* Block size of file */ + ulong pages_covered; /* Pages covered by bitmap + 1 */ + DYNAMIC_ARRAY pinned_pages; /**< not-yet-flushable bitmap 
pages */ +} MARIA_FILE_BITMAP; + +#define MARIA_CHECKPOINT_LOOKS_AT_ME 1 +#define MARIA_CHECKPOINT_SHOULD_FREE_ME 2 +#define MARIA_CHECKPOINT_SEEN_IN_LOOP 4 + +typedef struct st_maria_crypt_data MARIA_CRYPT_DATA; +struct ms3_st; + +typedef struct st_maria_share +{ /* Shared between opens */ + MARIA_STATE_INFO state; + MARIA_STATE_INFO checkpoint_state; /* Copy of saved state by checkpoint */ + MARIA_BASE_INFO base; + MARIA_STATE_HISTORY *state_history; + MARIA_KEYDEF ft2_keyinfo; /* Second-level ft-key definition */ + MARIA_KEYDEF *keyinfo; /* Key definitions */ + MARIA_UNIQUEDEF *uniqueinfo; /* unique definitions */ + HA_KEYSEG *keyparts; /* key part info */ + MARIA_COLUMNDEF *columndef; /* Pointer to column information */ + MARIA_PACK pack; /* Data about packed records */ + MARIA_BLOB *blobs; /* Pointer to blobs */ + uint16 *column_nr; /* Original column order */ + LEX_STRING unique_file_name; /* realpath() of index file */ + LEX_STRING data_file_name; /* Resolved path names from symlinks */ + LEX_STRING index_file_name; + LEX_STRING open_file_name; /* parameter to open filename */ + uchar *file_map; /* mem-map of file if possible */ + LIST *open_list; /* Tables open with this share */ + PAGECACHE *pagecache; /* ref to the current key cache */ + MARIA_DECODE_TREE *decode_trees; + /* + Previous auto-increment value. Used to verify if we can restore the + auto-increment counter if we have to abort an insert (duplicate key). 
+ */ + ulonglong last_auto_increment; + uint16 *decode_tables; + uint16 id; /**< 2-byte id by which log records refer to the table */ + /* Called the first time the table instance is opened */ + my_bool (*once_init)(struct st_maria_share *, File); + /* Called when the last instance of the table is closed */ + my_bool (*once_end)(struct st_maria_share *); + /* Is called for every open of the table */ + my_bool (*init)(MARIA_HA *); + /* Is called for every close of the table */ + void (*end)(MARIA_HA *); + /* Called when we want to read a record from a specific position */ + int (*read_record)(MARIA_HA *, uchar *, MARIA_RECORD_POS); + /* Initialize a scan */ + my_bool (*scan_init)(MARIA_HA *); + /* Read next record while scanning */ + int (*scan)(MARIA_HA *, uchar *, MARIA_RECORD_POS, my_bool); + /* End scan */ + void (*scan_end)(MARIA_HA *); + int (*scan_remember_pos)(MARIA_HA *, MARIA_RECORD_POS*); + int (*scan_restore_pos)(MARIA_HA *, MARIA_RECORD_POS); + /* Pre-write of row (some handlers may do the actual write here) */ + MARIA_RECORD_POS (*write_record_init)(MARIA_HA *, const uchar *); + /* Write record (or accept write_record_init) */ + my_bool (*write_record)(MARIA_HA *, const uchar *); + /* Called when write failed */ + my_bool (*write_record_abort)(MARIA_HA *); + my_bool (*update_record)(MARIA_HA *, MARIA_RECORD_POS, + const uchar *, const uchar *); + my_bool (*delete_record)(MARIA_HA *, const uchar *record); + my_bool (*compare_record)(MARIA_HA *, const uchar *); + /* calculate checksum for a row */ + ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *); + /* + Calculate checksum for a row during write. 
May be 0 if we calculate + the checksum in write_record_init() + */ + ha_checksum(*calc_write_checksum)(MARIA_HA *, const uchar *); + /* calculate checksum for a row during check table */ + ha_checksum(*calc_check_checksum)(MARIA_HA *, const uchar *); + /* Compare a row in memory with a row on disk */ + my_bool (*compare_unique)(MARIA_HA *, MARIA_UNIQUEDEF *, + const uchar *record, MARIA_RECORD_POS pos); + my_off_t (*keypos_to_recpos)(struct st_maria_share *share, my_off_t pos); + my_off_t (*recpos_to_keypos)(struct st_maria_share *share, my_off_t pos); + my_bool (*row_is_visible)(MARIA_HA *); + + /* Mappings to read/write the data file */ + size_t (*file_read)(MARIA_HA *, uchar *, size_t, my_off_t, myf); + size_t (*file_write)(MARIA_HA *, const uchar *, size_t, my_off_t, myf); + /* query cache invalidator for merged tables */ + invalidator_by_filename invalidator; + /* query cache invalidator for changing state */ + invalidator_by_filename chst_invalidator; + my_off_t key_del_current; /* delete links for index pages */ + ulong this_process; /* processid */ + ulong last_process; /* For table-change-check */ + ulong last_version; /* Version on start */ + ulong options; /* Options used */ + ulong min_pack_length; /* These are used by packed data */ + ulong max_pack_length; + ulong state_diff_length; + uint rec_reflength; /* rec_reflength in use now */ + uint keypage_header; + uint32 ftkeys; /* Number of distinct full-text keys + + 1 */ + PAGECACHE_FILE kfile; /* Shared keyfile */ + S3_INFO *s3_path; /* Connection and path in s3 */ + File data_file; /* Shared data file */ + int mode; /* mode of file on open */ + uint reopen; /* How many times opened */ + uint in_trans; /* Number of references by trn */ + uint w_locks, r_locks, tot_locks; /* Number of read/write locks */ + uint block_size; /* block_size of keyfile & data file*/ + uint max_index_block_size; /* block_size - end_of_page_info */ + /* Fixed length part of a packed row in BLOCK_RECORD format */ + uint 
base_length; + myf write_flag; + enum data_file_type data_file_type; + enum pagecache_page_type page_type; /* value depending transactional */ + /** + if Checkpoint looking at table; protected by close_lock or THR_LOCK_maria + */ + uint8 in_checkpoint; + my_bool temporary; + /* Below flag is needed to make log tables work with concurrent insert */ + my_bool is_log_table; + my_bool has_null_fields; + my_bool has_varchar_fields; /* If table has varchar fields */ + /* + Set to 1 if open_count was wrong at open. Set to avoid asserts for + wrong open count on close. + */ + my_bool open_count_not_zero_on_open; + + my_bool changed, /* If changed since lock */ + global_changed, /* If changed since open */ + not_flushed; + my_bool no_status_updates; /* Set to 1 if S3 readonly table */ + my_bool internal_table; /* Internal tmp table */ + my_bool lock_key_trees; /* If we have to lock trees on read */ + my_bool non_transactional_concurrent_insert; + my_bool delay_key_write; + my_bool have_rtree; + /** + @brief if the table is transactional right now. It may have been created + transactional (base.born_transactional==TRUE) but with transactionality + (logging) temporarily disabled (now_transactional==FALSE). The opposite + (FALSE, TRUE) is impossible. + */ + my_bool now_transactional; + my_bool have_versioning; + my_bool key_del_used; /* != 0 if key_del is locked */ + my_bool deleting; /* we are going to delete this table */ + my_bool redo_error_given; /* Used during recovery */ + my_bool silence_encryption_errors; /* Used during recovery */ + THR_LOCK lock; + void (*lock_restore_status)(void *); + /** + Protects kfile, dfile, most members of the state, state disk writes, + versioning information (like in_trans, state_history). + @todo find the exhaustive list. + */ + mysql_mutex_t intern_lock; + mysql_mutex_t key_del_lock; + mysql_cond_t key_del_cond; + /** + _Always_ held while closing table; prevents checkpoint from looking at + structures freed during closure (like bitmap). 
If you need close_lock and + intern_lock, lock them in this order. + */ + mysql_mutex_t close_lock; + my_off_t mmaped_length; + uint nonmmaped_inserts; /* counter of writing in + non-mmaped area */ + MARIA_FILE_BITMAP bitmap; + mysql_rwlock_t mmap_lock; + LSN lsn_of_file_id; /**< LSN of its last LOGREC_FILE_ID */ + + /** + Crypt data + */ + uint crypt_page_header_space; + MARIA_CRYPT_DATA *crypt_data; + + /** + Keep track of last insert page, used to implement insert order + */ + uint last_insert_page; + pgcache_page_no_t last_insert_bitmap; +} MARIA_SHARE; + + +typedef uchar MARIA_BITMAP_BUFFER; + +typedef struct st_maria_bitmap_block +{ + pgcache_page_no_t page; /* Page number */ + /* Number of continuous pages. TAIL_BIT is set if this is a tail page */ + uint page_count; + uint empty_space; /* Set for head and tail pages */ + /* + Number of BLOCKS for block-region (holds all non-blob-fields or one blob) + */ + uint sub_blocks; + /* set to <> 0 in write_record() if this block was actually used */ + uint8 used; + uint8 org_bitmap_value; +} MARIA_BITMAP_BLOCK; + + +typedef struct st_maria_bitmap_blocks +{ + MARIA_BITMAP_BLOCK *block; + uint count; + my_bool tail_page_skipped; /* If some tail pages were not used */ + my_bool page_skipped; /* If some full pages were not used */ +} MARIA_BITMAP_BLOCKS; + + +/* Data about the currently read row */ +typedef struct st_maria_row +{ + MARIA_BITMAP_BLOCKS insert_blocks; + MARIA_BITMAP_BUFFER *extents; + MARIA_RECORD_POS lastpos, nextpos; + MARIA_RECORD_POS *tail_positions; + ha_checksum checksum; + LSN orig_undo_lsn; /* Lsn at start of row insert */ + TrID trid; /* Transaction id for current row */ + uchar *empty_bits, *field_lengths; + uint *null_field_lengths; /* All null field lengths */ + ulong *blob_lengths; /* Length for each blob */ + ulong min_length, normal_length, char_length, varchar_length; + ulong blob_length, total_length; + size_t extents_buffer_length; /* Size of 'extents' buffer */ + uint head_length, 
header_length; + uint field_lengths_length; /* Length of data in field_lengths */ + uint extents_count; /* number of extents in 'extents' */ + uint full_page_count, tail_count; /* For maria_chk */ + uint space_on_head_page; +} MARIA_ROW; + +/* Data to scan row in blocked format */ +typedef struct st_maria_block_scan +{ + uchar *bitmap_buff, *bitmap_pos, *bitmap_end, *page_buff; + uchar *dir, *dir_end; + pgcache_page_no_t bitmap_page, max_page; + ulonglong bits; + uint number_of_rows, bit_pos; + MARIA_RECORD_POS row_base_page; + ulonglong row_changes; +} MARIA_BLOCK_SCAN; + + +struct st_maria_handler +{ + MARIA_SHARE *s; /* Shared between open:s */ + struct st_ma_transaction *trn; /* Pointer to active transaction */ + struct st_maria_handler *trn_next,**trn_prev; + MARIA_STATUS_INFO *state, state_save; + MARIA_STATUS_INFO *state_start; /* State at start of transaction */ + MARIA_USED_TABLES *used_tables; + struct ms3_st *s3; + void **stack_end_ptr; + MARIA_ROW cur_row; /* The active row that we just read */ + MARIA_ROW new_row; /* Storage for a row during update */ + MARIA_KEY last_key; /* Last found key */ + MARIA_BLOCK_SCAN scan, *scan_save; + MARIA_BLOB *blobs; /* Pointer to blobs */ + MARIA_BIT_BUFF bit_buff; + DYNAMIC_ARRAY bitmap_blocks; + DYNAMIC_ARRAY pinned_pages; + /* accumulate indexfile changes between writes */ + TREE *bulk_insert; + LEX_CUSTRING *log_row_parts; /* For logging */ + DYNAMIC_ARRAY *ft1_to_ft2; /* used only in ft1->ft2 conversion */ + MEM_ROOT ft_memroot; /* used by the parser */ + MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit */ + void *external_ref; /* For MariaDB TABLE */ + uchar *buff; /* page buffer */ + uchar *keyread_buff; /* Buffer for last key read */ + uchar *lastkey_buff; /* Last used search key */ + uchar *lastkey_buff2; + uchar *first_mbr_key; /* Searched spatial key */ + uchar *rec_buff; /* Temp buffer for recordpack */ + uchar *blob_buff; /* Temp buffer for blobs */ + uchar *int_keypos; /* Save 
position for next/previous */ + uchar *int_maxpos; /* -""- */ + uint keypos_offset; /* Tmp storage for offset int_keypos */ + uint maxpos_offset; /* Tmp storage for offset int_maxpos */ + uchar *update_field_data; /* Used by update in rows-in-block */ + uint int_nod_flag; /* -""- */ + uint32 int_keytree_version; /* -""- */ + int (*read_record)(MARIA_HA *, uchar*, MARIA_RECORD_POS); + invalidator_by_filename invalidator; /* query cache invalidator */ + ulonglong last_auto_increment; /* auto value at start of statement */ + ulonglong row_changes; /* Incremented for each change */ + ulonglong start_row_changes; /* Row changes since start trans */ + ulong this_unique; /* uniq filenumber or thread */ + ulong last_unique; /* last unique number */ + ulong this_loop; /* counter for this open */ + ulong last_loop; /* last used counter */ + MARIA_RECORD_POS save_lastpos; + MARIA_RECORD_POS dup_key_pos; + TrID dup_key_trid; + my_off_t pos; /* Intern variable */ + my_off_t last_keypage; /* Last key page read */ + my_off_t last_search_keypage; /* Last keypage when searching */ + + /* + QQ: the following two xxx_length fields should be removed, + as they are not compatible with parallel repair + */ + ulong packed_length, blob_length; /* Length of found, packed record */ + size_t rec_buff_size, blob_buff_size; + PAGECACHE_FILE dfile; /* The datafile */ + IO_CACHE rec_cache; /* When caching records */ + LIST open_list; + LIST share_list; + MY_BITMAP changed_fields; + ulong row_base_length; /* Length of row header */ + uint row_flag; /* Flag to store in row header */ + uint opt_flag; /* Optim. 
for space/speed */ + uint open_flags; /* Flags used in open() */ + uint update; /* If file changed since open */ + uint error_count; /* Incremented for each error given */ + int lastinx; /* Last used index */ + uint last_rkey_length; /* Last length in maria_rkey() */ + uint *last_rtree_keypos; /* Last key positions for rtrees */ + uint bulk_insert_ref_length; /* Length of row ref during bi */ + uint non_flushable_state; + enum ha_rkey_function last_key_func; /* CONTAIN, OVERLAP, etc */ + uint save_lastkey_data_length; + uint save_lastkey_ref_length; + uint pack_key_length; /* For MARIA_MRG */ + myf lock_wait; /* is 0 or MY_SHORT_WAIT */ + int errkey; /* Got last error on this key */ + int lock_type; /* How database was locked */ + int tmp_lock_type; /* When locked by readinfo */ + uint data_changed; /* Somebody has changed data */ + uint save_update; /* When using KEY_READ */ + int save_lastinx; + uint preload_buff_size; /* When preloading indexes */ + uint16 last_used_keyseg; /* For MARIAMRG */ + uint8 key_del_used; /* != 0 if key_del is used */ + my_bool was_locked; /* Was locked in panic */ + my_bool intern_lock_locked; /* locked in ma_extra() */ + my_bool append_insert_at_end; /* Set if concurrent insert */ + my_bool quick_mode; + my_bool in_check_table; /* We are running check tables */ + /* Marker if key_del_changed */ + /* If info->keyread_buff can't be used for rnext */ + my_bool page_changed; + /* If info->keyread_buff has to be re-read for rnext */ + my_bool keyread_buff_used; + my_bool once_flags; /* For MARIA_MRG */ + /* For bulk insert enable/disable transactions control */ + my_bool switched_transactional; + /* If transaction will autocommit */ + my_bool autocommit; +#ifdef _WIN32 + my_bool owned_by_merge; /* This Maria table is part of a merge union */ +#endif + THR_LOCK_DATA lock; + uchar *maria_rtree_recursion_state; /* For RTREE */ + uchar length_buff[5]; /* temp buff to store blob lengths */ + int maria_rtree_recursion_depth; + + my_bool 
create_unique_index_by_sort; + index_cond_func_t index_cond_func; /* Index condition function */ + void *index_cond_func_arg; /* parameter for the func */ +}; + +/* Table options for the Aria and S3 storage engine */ + +struct ha_table_option_struct +{ + ulonglong s3_block_size; + uint compression_algorithm; +}; + +/* Some defines used by maria-functions */ + +#define USE_WHOLE_KEY 65535 /* Use whole key in _search() */ +#define F_EXTRA_LCK -1 + +/* bits in opt_flag */ +#define MEMMAP_USED 32U +#define REMEMBER_OLD_POS 64U + +#define WRITEINFO_UPDATE_KEYFILE 1U +#define WRITEINFO_NO_UNLOCK 2U + +/* once_flags */ +#define USE_PACKED_KEYS 1U +#define RRND_PRESERVE_LASTINX 2U + +/* bits in state.changed */ + +#define STATE_CHANGED 1U +#define STATE_CRASHED 2U +#define STATE_CRASHED_ON_REPAIR 4U +#define STATE_NOT_ANALYZED 8U +#define STATE_NOT_OPTIMIZED_KEYS 16U +#define STATE_NOT_SORTED_PAGES 32U +#define STATE_NOT_OPTIMIZED_ROWS 64U +#define STATE_NOT_ZEROFILLED 128U +#define STATE_NOT_MOVABLE 256U +#define STATE_MOVED 512U /* set if base->uuid != maria_uuid */ +#define STATE_IN_REPAIR 1024U /* We are running repair on table */ +#define STATE_CRASHED_PRINTED 2048U +#define STATE_DATA_FILE_FULL 4096U + +#define STATE_CRASHED_FLAGS (STATE_CRASHED | STATE_CRASHED_ON_REPAIR | STATE_CRASHED_PRINTED) + +/* options to maria_read_cache */ + +#define READING_NEXT 1U +#define READING_HEADER 2U + +/* Number of bytes on key pages to indicate used size */ +#define KEYPAGE_USED_SIZE 2U +#define KEYPAGE_KEYID_SIZE 1U +#define KEYPAGE_FLAG_SIZE 1U +#define KEYPAGE_KEY_VERSION_SIZE 4U /* encryption */ +#define KEYPAGE_CHECKSUM_SIZE 4U +#define MAX_KEYPAGE_HEADER_SIZE (LSN_STORE_SIZE + KEYPAGE_USED_SIZE + \ + KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE + \ + TRANSID_SIZE + KEYPAGE_KEY_VERSION_SIZE) +#define KEYPAGE_FLAG_ISNOD 1U +#define KEYPAGE_FLAG_HAS_TRANSID 2U + +#define _ma_get_page_used(share,x) \ + ((uint) mi_uint2korr((x) + (share)->keypage_header - KEYPAGE_USED_SIZE)) +#define 
_ma_store_page_used(share,x,y) \ + mi_int2store((x) + (share)->keypage_header - KEYPAGE_USED_SIZE, (y)) +#define _ma_get_keypage_flag(share,x) x[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE] +#define _ma_test_if_nod(share,x) \ + ((_ma_get_keypage_flag(share,x) & KEYPAGE_FLAG_ISNOD) ? (share)->base.key_reflength : 0) + +#define _ma_store_keynr(share, x, nr) x[(share)->keypage_header - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE - KEYPAGE_USED_SIZE]= (nr) +#define _ma_get_keynr(share, x) ((uchar) x[(share)->keypage_header - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE - KEYPAGE_USED_SIZE]) +#define _ma_store_transid(buff, transid) \ + transid_store((buff) + LSN_STORE_SIZE, (transid)) +#define _ma_korr_transid(buff) \ + transid_korr((buff) + LSN_STORE_SIZE) +#define _ma_store_keypage_flag(share,x,flag) x[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]= (flag) +#define _ma_mark_page_with_transid(share, page) \ + do { (page)->flag|= KEYPAGE_FLAG_HAS_TRANSID; \ + (page)->buff[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]= (page)->flag; } while (0) + +#define KEYPAGE_KEY_VERSION(share, x) ((x) + \ + (share)->keypage_header - \ + (KEYPAGE_USED_SIZE + \ + KEYPAGE_FLAG_SIZE + \ + KEYPAGE_KEYID_SIZE + \ + KEYPAGE_KEY_VERSION_SIZE)) + +#define _ma_get_key_version(share,x) \ + ((uint) uint4korr(KEYPAGE_KEY_VERSION((share), (x)))) + +#define _ma_store_key_version(share,x,kv) \ + int4store(KEYPAGE_KEY_VERSION((share), (x)), (kv)) + +/* + TODO: write int4store_aligned as *((uint32 *) (T))= (uint32) (A) for + architectures where it is possible +*/ +#define int4store_aligned(A,B) int4store((A),(B)) + +#define maria_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \ + DBUG_PRINT("error", ("Marked table crashed")); \ + }while(0) +#define maria_mark_crashed_share(x) \ + do{(x)->state.changed|= STATE_CRASHED; \ + DBUG_PRINT("error", ("Marked table crashed")); \ + }while(0) +#define maria_mark_crashed_on_repair(x) 
do{(x)->s->state.changed|= \ + STATE_CRASHED|STATE_CRASHED_ON_REPAIR; \ + (x)->update|= HA_STATE_CHANGED; \ + DBUG_PRINT("error", ("Marked table crashed on repair")); \ + }while(0) +#define maria_mark_in_repair(x) do{(x)->s->state.changed|= \ + STATE_CRASHED | STATE_IN_REPAIR; \ + (x)->update|= HA_STATE_CHANGED; \ + DBUG_PRINT("error", ("Marked table crashed for repair")); \ + }while(0) +#define maria_is_crashed(x) ((x)->s->state.changed & STATE_CRASHED) +#define maria_is_crashed_on_repair(x) ((x)->s->state.changed & STATE_CRASHED_ON_REPAIR) +#define maria_in_repair(x) ((x)->s->state.changed & STATE_IN_REPAIR) + +#define DBUG_DUMP_KEY(name, key) DBUG_DUMP(name, (key)->data, (key)->data_length + (key)->ref_length) + +/* Functions to store length of space packed keys, VARCHAR or BLOB keys */ + +#define store_key_length(key,length) \ +{ if ((length) < 255) \ + { *(key)=(length); } \ + else \ + { *(key)=255; mi_int2store((key)+1,(length)); } \ +} + +#define get_key_full_length(length,key) \ + { if (*(const uchar*) (key) != 255) \ + length= ((uint) *(const uchar*) ((key)++))+1; \ + else \ + { length=mi_uint2korr((key)+1)+3; (key)+=3; } \ +} + +#define get_key_full_length_rdonly(length,key) \ +{ if (*(const uchar*) (key) != 255) \ + length= ((uint) *(const uchar*) ((key)))+1; \ + else \ + { length=mi_uint2korr((key)+1)+3; } \ +} + +#define _ma_max_key_length() ((maria_block_size - MAX_KEYPAGE_HEADER_SIZE)/3 - MARIA_INDEX_OVERHEAD_SIZE) +#define get_pack_length(length) ((length) >= 255 ? 
3 : 1) +#define _ma_have_versioning(info) ((info)->row_flag & ROW_FLAG_TRANSID) + +#define MARIA_MIN_BLOCK_LENGTH 20 /* Because of delete-link */ +/* Don't use to small record-blocks */ +#define MARIA_EXTEND_BLOCK_LENGTH 20 +#define MARIA_SPLIT_LENGTH ((MARIA_EXTEND_BLOCK_LENGTH+4)*2) + /* Max prefix of record-block */ +#define MARIA_MAX_DYN_BLOCK_HEADER 20 +#define MARIA_BLOCK_INFO_HEADER_LENGTH 20 +#define MARIA_DYN_DELETE_BLOCK_HEADER 20 /* length of delete-block-header */ +#define MARIA_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L) +#define MARIA_DYN_MAX_ROW_LENGTH (MARIA_DYN_MAX_BLOCK_LENGTH - MARIA_SPLIT_LENGTH) +#define MARIA_DYN_ALIGN_SIZE 4 /* Align blocks on this */ +#define MARIA_MAX_DYN_HEADER_BYTE 13 /* max header uchar for dynamic rows */ +#define MARIA_MAX_BLOCK_LENGTH ((((ulong) 1 << 24)-1) & (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1))) +#define MARIA_REC_BUFF_OFFSET ALIGN_SIZE(MARIA_DYN_DELETE_BLOCK_HEADER+sizeof(uint32)) + +#define MEMMAP_EXTRA_MARGIN 7 /* Write this as a suffix for file */ + +#define PACK_TYPE_SELECTED 1U /* Bits in field->pack_type */ +#define PACK_TYPE_SPACE_FIELDS 2U +#define PACK_TYPE_ZERO_FILL 4U + +#define MARIA_FOUND_WRONG_KEY INT_MAX32 /* Impossible value from ha_key_cmp */ + +#define MARIA_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size)) +#define MARIA_MAX_KEYPTR_SIZE 5 /* For calculating block lengths */ + +/* Marker for impossible delete link */ +#define IMPOSSIBLE_PAGE_NO 0xFFFFFFFFFFLL + +/* The UNIQUE check is done with a hashed long key */ + +#define MARIA_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT +#define maria_unique_store(A,B) mi_int4store((A),(B)) + +extern mysql_mutex_t THR_LOCK_maria; +#ifdef DONT_USE_RW_LOCKS +#define mysql_rwlock_wrlock(A) {} +#define mysql_rwlock_rdlock(A) {} +#define mysql_rwlock_unlock(A) {} +#endif + +/* Some tuning parameters */ +#define MARIA_MIN_KEYBLOCK_LENGTH 50 /* When to split delete blocks */ 
+#define MARIA_MIN_SIZE_BULK_INSERT_TREE 16384U /* this is per key */ +#define MARIA_MIN_ROWS_TO_USE_BULK_INSERT 100 +#define MARIA_MIN_ROWS_TO_DISABLE_INDEXES 100 +#define MARIA_MIN_ROWS_TO_USE_WRITE_CACHE 10 +/* Keep a small buffer for tables only using small blobs */ +#define MARIA_SMALL_BLOB_BUFFER 1024U +#define MARIA_MAX_CONTROL_FILE_LOCK_RETRY 30 /* Retry this many times */ + +/* Some extern variables */ +extern LIST *maria_open_list; +extern uchar maria_file_magic[], maria_pack_file_magic[]; +extern uchar maria_uuid[MY_UUID_SIZE]; +extern uint32 maria_read_vec[], maria_readnext_vec[]; +extern uint maria_quick_table_bits; +extern const char *maria_data_root; +extern uchar maria_zero_string[]; +extern my_bool maria_inited, maria_in_ha_maria, maria_recovery_changed_data; +extern my_bool maria_recovery_verbose, maria_checkpoint_disabled; +extern my_bool maria_assert_if_crashed_table, aria_readonly; +extern ulong maria_checkpoint_min_log_activity; +extern HASH maria_stored_state; +extern int (*maria_create_trn_hook)(MARIA_HA *); +extern my_bool (*ma_killed)(MARIA_HA *); +extern void (*ma_debug_crash_here)(const char *keyword); + +#ifdef HAVE_PSI_INTERFACE +extern PSI_mutex_key key_SHARE_BITMAP_lock, key_SORT_INFO_mutex, + key_THR_LOCK_maria, key_TRANSLOG_BUFFER_mutex, + key_LOCK_soft_sync, + key_TRANSLOG_DESCRIPTOR_dirty_buffer_mask_lock, + key_TRANSLOG_DESCRIPTOR_sent_to_disk_lock, + key_TRANSLOG_DESCRIPTOR_log_flush_lock, + key_TRANSLOG_DESCRIPTOR_file_header_lock, + key_TRANSLOG_DESCRIPTOR_unfinished_files_lock, + key_TRANSLOG_DESCRIPTOR_purger_lock, + key_SHARE_intern_lock, key_SHARE_key_del_lock, + key_SHARE_close_lock, + key_SERVICE_THREAD_CONTROL_lock, + key_PAGECACHE_cache_lock; + +extern PSI_mutex_key key_CRYPT_DATA_lock; + +extern PSI_cond_key key_SHARE_key_del_cond, key_SERVICE_THREAD_CONTROL_cond, + key_SORT_INFO_cond, key_SHARE_BITMAP_cond, + key_COND_soft_sync, key_TRANSLOG_BUFFER_waiting_filling_buffer, + 
key_TRANSLOG_BUFFER_prev_sent_to_disk_cond, + key_TRANSLOG_DESCRIPTOR_log_flush_cond, + key_TRANSLOG_DESCRIPTOR_new_goal_cond; + +extern PSI_rwlock_key key_KEYINFO_root_lock, key_SHARE_mmap_lock, + key_TRANSLOG_DESCRIPTOR_open_files_lock; + +extern PSI_thread_key key_thread_checkpoint, key_thread_find_all_keys, + key_thread_soft_sync; + +extern PSI_file_key key_file_translog, key_file_kfile, key_file_dfile, + key_file_control, key_file_tmp; + +#endif + +/* Note that PSI_stage_info globals must always be declared. */ +extern PSI_stage_info stage_waiting_for_a_resource; + +/* This is used by _ma_calc_xxx_key_length and _ma_store_key */ +typedef struct st_maria_s_param +{ + const uchar *key; + uchar *prev_key, *next_key_pos; + uchar *key_pos; /* For balance page */ + uint ref_length, key_length, n_ref_length; + uint n_length, totlength, part_of_prev_key, prev_length, pack_marker; + uint changed_length; + int move_length; /* For balance_page */ + my_bool store_not_null; +} MARIA_KEY_PARAM; + + +/* Used to store reference to pinned page */ +typedef struct st_pinned_page +{ + PAGECACHE_BLOCK_LINK *link; + enum pagecache_page_lock unlock, write_lock; + my_bool changed; /* Set to 1 by page_mark_changed() */ +} MARIA_PINNED_PAGE; + + +/* Keeps all information about a page and related to a page */ +typedef struct st_maria_page +{ + MARIA_HA *info; + const MARIA_KEYDEF *keyinfo; + uchar *buff; /* Data for page */ + my_off_t pos; /* Disk address to page */ + uint size; /* Size of data on page */ + uint org_size; /* Size of page at read or after log */ + uint node; /* 0 or share->base.key_reflength */ + uint flag; /* Page flag */ + uint link_offset; /* Index into info->pinned_pages, see page_mark_changed() */ +} MARIA_PAGE; + + +/* Prototypes for internal functions */ +extern int _ma_read_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS); +extern int _ma_read_rnd_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS, + my_bool); +extern my_bool _ma_write_dynamic_record(MARIA_HA *, const uchar *); +extern my_bool _ma_update_dynamic_record(MARIA_HA *, MARIA_RECORD_POS, + 
const uchar *, const uchar *); +extern my_bool _ma_delete_dynamic_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_cmp_dynamic_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_write_blob_record(MARIA_HA *, const uchar *); +extern my_bool _ma_update_blob_record(MARIA_HA *, MARIA_RECORD_POS, + const uchar *, const uchar *); +extern int _ma_read_static_record(MARIA_HA *info, uchar *, MARIA_RECORD_POS); +extern int _ma_read_rnd_static_record(MARIA_HA *, uchar *, MARIA_RECORD_POS, + my_bool); +extern my_bool _ma_write_static_record(MARIA_HA *, const uchar *); +extern my_bool _ma_update_static_record(MARIA_HA *, MARIA_RECORD_POS, + const uchar *, const uchar *); +extern my_bool _ma_delete_static_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_cmp_static_record(MARIA_HA *info, const uchar *record); + +extern my_bool _ma_write_no_record(MARIA_HA *info, const uchar *record); +extern my_bool _ma_update_no_record(MARIA_HA *info, MARIA_RECORD_POS pos, + const uchar *oldrec, const uchar *record); +extern my_bool _ma_delete_no_record(MARIA_HA *info, const uchar *record); +extern int _ma_read_no_record(MARIA_HA *info, uchar *record, + MARIA_RECORD_POS pos); +extern int _ma_read_rnd_no_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos, + my_bool skip_deleted_blocks); +my_off_t _ma_no_keypos_to_recpos(MARIA_SHARE *share, my_off_t pos); + +extern my_bool _ma_ck_write(MARIA_HA *info, MARIA_KEY *key); +extern my_bool _ma_enlarge_root(MARIA_HA *info, MARIA_KEY *key, + MARIA_RECORD_POS *root); +int _ma_insert(MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *anc_page, uchar *key_pos, uchar *key_buff, + MARIA_PAGE *father_page, uchar *father_key_pos, + my_bool insert_last); +extern my_bool _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEY *key, + MARIA_RECORD_POS *root, uint32 comp_flag); +extern int _ma_split_page(MARIA_HA *info, MARIA_KEY *key, + MARIA_PAGE *split_page, + uint org_split_length, + uchar *inserted_key_pos, uint 
changed_length, + int move_length, + uchar *key_buff, my_bool insert_last_key); +extern uchar *_ma_find_half_pos(MARIA_KEY *key, MARIA_PAGE *page, + uchar ** after_key); +extern int _ma_calc_static_key_length(const MARIA_KEY *key, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *key_buff, + MARIA_KEY_PARAM *s_temp); +extern int _ma_calc_var_key_length(const MARIA_KEY *key, uint nod_flag, + uchar *key_pos, uchar *org_key, + uchar *key_buff, + MARIA_KEY_PARAM *s_temp); +extern int _ma_calc_var_pack_key_length(const MARIA_KEY *key, + uint nod_flag, uchar *next_key, + uchar *org_key, uchar *prev_key, + MARIA_KEY_PARAM *s_temp); +extern int _ma_calc_bin_pack_key_length(const MARIA_KEY *key, + uint nod_flag, uchar *next_key, + uchar *org_key, uchar *prev_key, + MARIA_KEY_PARAM *s_temp); +extern void _ma_store_static_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +extern void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +#ifdef NOT_USED +extern void _ma_store_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +#endif +extern void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); + +extern my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key); +extern my_bool _ma_ck_real_delete(MARIA_HA *info, MARIA_KEY *key, + my_off_t *root); +extern int _ma_readinfo(MARIA_HA *info, int lock_flag, int check_keybuffer); +extern int _ma_writeinfo(MARIA_HA *info, uint options); +extern int _ma_test_if_changed(MARIA_HA *info); +extern int _ma_mark_file_changed(MARIA_SHARE *info); +extern int _ma_mark_file_changed_now(MARIA_SHARE *info); +extern void _ma_mark_file_crashed(MARIA_SHARE *share); +extern void _ma_set_fatal_error(MARIA_HA *share, int error); +extern void _ma_set_fatal_error_with_share(MARIA_SHARE *share, int error); +extern my_bool _ma_set_uuid(MARIA_SHARE *info, my_bool reset_uuid); +extern my_bool _ma_check_if_zero(uchar *pos, size_t 
size); +extern int _ma_decrement_open_count(MARIA_HA *info, my_bool lock_table); +extern int _ma_check_index(MARIA_HA *info, int inx); +extern int _ma_search(MARIA_HA *info, MARIA_KEY *key, uint32 nextflag, + my_off_t pos); +extern int _ma_bin_search(const MARIA_KEY *key, const MARIA_PAGE *page, + uint32 comp_flag, uchar **ret_pos, uchar *buff, + my_bool *was_last_key); +extern int _ma_seq_search(const MARIA_KEY *key, const MARIA_PAGE *page, + uint comp_flag, uchar ** ret_pos, uchar *buff, + my_bool *was_last_key); +extern int _ma_prefix_search(const MARIA_KEY *key, const MARIA_PAGE *page, + uint32 comp_flag, uchar ** ret_pos, uchar *buff, + my_bool *was_last_key); +extern my_off_t _ma_kpos(uint nod_flag, const uchar *after_key); +extern void _ma_kpointer(MARIA_HA *info, uchar *buff, my_off_t pos); +MARIA_RECORD_POS _ma_row_pos_from_key(const MARIA_KEY *key); +TrID _ma_trid_from_key(const MARIA_KEY *key); +extern MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *share, uchar *ptr); +extern void _ma_dpointer(MARIA_SHARE *share, uchar *buff, + MARIA_RECORD_POS pos); +extern uint _ma_get_static_key(MARIA_KEY *key, uint page_flag, uint nod_flag, + uchar **page); +extern uchar *_ma_skip_static_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page); +extern uint _ma_get_pack_key(MARIA_KEY *key, uint page_flag, uint nod_flag, + uchar **page); +extern uchar *_ma_skip_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page); +extern uint _ma_get_binary_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar **page_pos); +uchar *_ma_skip_binary_pack_key(MARIA_KEY *key, uint page_flag, + uint nod_flag, uchar *page); +extern uchar *_ma_get_last_key(MARIA_KEY *key, MARIA_PAGE *page, + uchar *endpos); +extern uchar *_ma_get_key(MARIA_KEY *key, MARIA_PAGE *page, uchar *keypos); +extern uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key); +extern uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, const uchar *key, + HA_KEYSEG *end); +extern int 
_ma_search_next(MARIA_HA *info, MARIA_KEY *key, + uint32 nextflag, my_off_t pos); +extern int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos); +extern int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + my_off_t pos); +extern my_off_t _ma_static_keypos_to_recpos(MARIA_SHARE *share, my_off_t pos); +extern my_off_t _ma_static_recpos_to_keypos(MARIA_SHARE *share, my_off_t pos); +extern my_off_t _ma_transparent_recpos(MARIA_SHARE *share, my_off_t pos); +extern my_off_t _ma_transaction_keypos_to_recpos(MARIA_SHARE *, my_off_t pos); +extern my_off_t _ma_transaction_recpos_to_keypos(MARIA_SHARE *, my_off_t pos); + +extern void _ma_page_setup(MARIA_PAGE *page, MARIA_HA *info, + const MARIA_KEYDEF *keyinfo, my_off_t pos, + uchar *buff); +extern my_bool _ma_fetch_keypage(MARIA_PAGE *page, MARIA_HA *info, + const MARIA_KEYDEF *keyinfo, + my_off_t pos, enum pagecache_page_lock lock, + int level, uchar *buff, + my_bool return_buffer); +extern my_bool _ma_write_keypage(MARIA_PAGE *page, + enum pagecache_page_lock lock, int level); +extern int _ma_dispose(MARIA_HA *info, my_off_t pos, my_bool page_not_read); +extern my_off_t _ma_new(MARIA_HA *info, int level, + MARIA_PINNED_PAGE **page_link); +extern my_bool _ma_compact_keypage(MARIA_PAGE *page, TrID min_read_from); +extern uint transid_store_packed(MARIA_HA *info, uchar *to, ulonglong trid); +extern ulonglong transid_get_packed(MARIA_SHARE *share, const uchar *from); +#define transid_packed_length(data) \ + ((data)[0] < MARIA_MIN_TRANSID_PACK_OFFSET ? 
1 : \ + (uint) ((uchar) (data)[0]) - (MARIA_TRANSID_PACK_OFFSET - 1)) +#define key_has_transid(key) (*(key) & 1) + +#define page_mark_changed(info, page) \ + dynamic_element(&(info)->pinned_pages, (page)->link_offset, \ + MARIA_PINNED_PAGE*)->changed= 1; +#define page_store_size(share, page) \ + _ma_store_page_used((share), (page)->buff, (page)->size); +#define page_store_info(share, page) \ + _ma_store_keypage_flag((share), (page)->buff, (page)->flag); \ + _ma_store_page_used((share), (page)->buff, (page)->size); +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY +void page_cleanup(MARIA_SHARE *share, MARIA_PAGE *page) +#else +#define page_cleanup(A,B) do { } while (0) +#endif + +extern MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY *int_key, uint keynr, + uchar *key, const uchar *record, + MARIA_RECORD_POS filepos, ulonglong trid); +extern MARIA_KEY *_ma_pack_key(MARIA_HA *info, MARIA_KEY *int_key, + uint keynr, uchar *key, + const uchar *old, key_part_map keypart_map, + HA_KEYSEG ** last_used_keyseg); +extern void _ma_copy_key(MARIA_KEY *to, const MARIA_KEY *from); +extern int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS); +extern my_bool _ma_read_cache(MARIA_HA *, IO_CACHE *info, uchar *buff, + MARIA_RECORD_POS pos, size_t length, + uint re_read_if_possibly); +extern ulonglong ma_retrieve_auto_increment(const uchar *key, uint8 key_type); +extern my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size, + size_t new_size, myf flag); +extern size_t _ma_rec_unpack(MARIA_HA *info, uchar *to, uchar *from, + size_t reclength); +extern my_bool _ma_rec_check(MARIA_HA *info, const uchar *record, + uchar *packpos, ulong packed_length, + my_bool with_checkum, ha_checksum checksum); +extern int _ma_write_part_record(MARIA_HA *info, my_off_t filepos, + ulong length, my_off_t next_filepos, + uchar ** record, ulong *reclength, + int *flag); +extern void _ma_print_key(FILE *stream, MARIA_KEY *key); +extern void _ma_print_keydata(FILE *stream, HA_KEYSEG *keyseg, + 
const uchar *key, uint length); +extern my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile); +extern my_bool _ma_once_end_pack_row(MARIA_SHARE *share); +extern int _ma_read_pack_record(MARIA_HA *info, uchar *buf, + MARIA_RECORD_POS filepos); +extern int _ma_read_rnd_pack_record(MARIA_HA *, uchar *, MARIA_RECORD_POS, + my_bool); +extern int _ma_pack_rec_unpack(MARIA_HA *info, MARIA_BIT_BUFF *bit_buff, + uchar *to, uchar *from, ulong reclength); +extern ulonglong _ma_safe_mul(ulonglong a, ulonglong b); +extern int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf, + const uchar *oldrec, const uchar *newrec, + my_off_t pos); + +/* + Parameter to _ma_get_block_info + The dynamic row header is read into this struct. For an explanation of + the fields, look at the function _ma_get_block_info(). +*/ + +typedef struct st_maria_block_info +{ + uchar header[MARIA_BLOCK_INFO_HEADER_LENGTH]; + ulong rec_len; + ulong data_len; + ulong block_len; + ulong blob_len; + MARIA_RECORD_POS filepos; + MARIA_RECORD_POS next_filepos; + MARIA_RECORD_POS prev_filepos; + uint second_read; + uint offset; +} MARIA_BLOCK_INFO; + + +/* bits in return from _ma_get_block_info */ + +#define BLOCK_FIRST 1U +#define BLOCK_LAST 2U +#define BLOCK_DELETED 4U +#define BLOCK_ERROR 8U /* Wrong data */ +#define BLOCK_SYNC_ERROR 16U /* Right data at wrong place */ +#define BLOCK_FATAL_ERROR 32U /* hardware-error */ + +#define NEED_MEM ((uint) 10*4*(IO_SIZE+32)+32) /* Needed for recursion */ +#define MAXERR 20 +#define BUFFERS_WHEN_SORTING 16 /* Alloc for sort-key-tree */ +#define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE +#define INDEX_TMP_EXT ".TMM" +#define DATA_TMP_EXT ".TMD" + +#define UPDATE_TIME 1U +#define UPDATE_STAT 2U +#define UPDATE_SORT 4U +#define UPDATE_AUTO_INC 8U +#define UPDATE_OPEN_COUNT 16U + +/* We use MY_ALIGN_DOWN here mainly to ensure that we get stable values for mysqld --help */ +#define PAGE_BUFFER_INIT MY_ALIGN_DOWN(1024L*1024L*256L-MALLOC_OVERHEAD, 8192) +#define 
READ_BUFFER_INIT MY_ALIGN_DOWN(1024L*256L-MALLOC_OVERHEAD, 1024) +#define SORT_BUFFER_INIT MY_ALIGN_DOWN(1024L*1024L*256L-MALLOC_OVERHEAD, 1024) + +#define fast_ma_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _ma_writeinfo((INFO),0) +#define fast_ma_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _ma_readinfo((INFO),F_RDLCK,1) + +extern uint _ma_get_block_info(MARIA_HA *, MARIA_BLOCK_INFO *, File, my_off_t); +extern uint _ma_rec_pack(MARIA_HA *info, uchar *to, const uchar *from); +extern uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff, + MARIA_BLOCK_INFO *info, uchar **rec_buff_p, + size_t *rec_buff_size, + File file, my_off_t filepos); +extern void _ma_store_blob_length(uchar *pos, uint pack_length, uint length); +extern void _ma_report_error(int errcode, const LEX_STRING *file_name, + myf flags); +extern void _ma_print_error(MARIA_HA *info, int error, my_bool write_to_log); +extern my_bool _ma_memmap_file(MARIA_HA *info); +extern void _ma_unmap_file(MARIA_HA *info); +extern uint _ma_save_pack_length(uint version, uchar * block_buff, + ulong length); +extern uint _ma_calc_pack_length(uint version, ulong length); +extern ulong _ma_calc_blob_length(uint length, const uchar *pos); +extern size_t _ma_mmap_pread(MARIA_HA *info, uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags); +extern size_t _ma_mmap_pwrite(MARIA_HA *info, const uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags); +extern size_t _ma_nommap_pread(MARIA_HA *info, uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags); +extern size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer, + size_t Count, my_off_t offset, myf MyFlags); + +/* my_pwrite instead of my_write used */ +#define MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET 1 +/* info should be written */ +#define MA_STATE_INFO_WRITE_FULL_INFO 2 +/* intern_lock taking is needed */ +#define MA_STATE_INFO_WRITE_LOCK 4 +uint _ma_state_info_write(MARIA_SHARE *share, uint 
pWrite)__attribute__((visibility("default"))) ; +uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite); +uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state); +uint _ma_base_info_write(File file, MARIA_BASE_INFO *base); +my_bool _ma_keyseg_write(File file, const HA_KEYSEG *keyseg); +uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg); +my_bool _ma_keydef_write(File file, MARIA_KEYDEF *keydef); +uchar *_ma_keydef_read(uchar *ptr, MARIA_KEYDEF *keydef); +my_bool _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *keydef); +uchar *_ma_uniquedef_read(uchar *ptr, MARIA_UNIQUEDEF *keydef); +my_bool _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef); +uchar *_ma_columndef_read(uchar *ptr, MARIA_COLUMNDEF *columndef); +my_bool _ma_column_nr_write(File file, uint16 *offsets, uint columns); +uchar *_ma_column_nr_read(uchar *ptr, uint16 *offsets, uint columns); +ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record); +ha_checksum _ma_checksum(MARIA_HA *info, const uchar *buf); +ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *buf); +my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, ha_checksum unique_hash, + MARIA_RECORD_POS pos); +ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *buf); +my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos); +my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, + const uchar *record, MARIA_RECORD_POS pos); +my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b, + my_bool null_are_equal); +void _ma_reset_status(MARIA_HA *maria); +int _ma_def_scan_remember_pos(MARIA_HA *info, MARIA_RECORD_POS *lastpos); +int _ma_def_scan_restore_pos(MARIA_HA *info, MARIA_RECORD_POS lastpos); + +#include "ma_commit.h" + +extern MARIA_HA *_ma_test_if_reopen(const char *filename); +my_bool _ma_check_table_is_closed(const char *name, const char 
*where); +int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share); +int _ma_open_keyfile(MARIA_SHARE *share); +void _ma_setup_functions(MARIA_SHARE *share); +my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size); +void _ma_remap_file(MARIA_HA *info, my_off_t size); + +MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, const uchar *record); +my_bool _ma_write_abort_default(MARIA_HA *info); +int maria_delete_table_files(const char *name, my_bool temporary, + myf flags)__attribute__((visibility("default"))) ; + + +/* + This cannot be in my_base.h as it clashes with HA_SPATIAL. + But it was introduced for Aria engine, and is only used there. + So it can safely stay here, only visible to Aria +*/ +#define HA_RTREE_INDEX 16384 /* For RTREE search */ + +#define MARIA_FLUSH_DATA 1 +#define MARIA_FLUSH_INDEX 2 +int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index, + enum flush_type flush_type_for_data, + enum flush_type flush_type_for_index); +/* + Functions needed by _ma_check (are overridden in MySQL/ha_maria.cc). + See ma_check_standalone.h . +*/ +int _ma_killed_ptr(HA_CHECK *param); +void _ma_report_progress(HA_CHECK *param, ulonglong progress, + ulonglong max_progress); +void _ma_check_print_error(HA_CHECK *param, const char *fmt, ...) + ATTRIBUTE_FORMAT(printf, 2, 3); +void _ma_check_print_warning(HA_CHECK *param, const char *fmt, ...) + ATTRIBUTE_FORMAT(printf, 2, 3); +void _ma_check_print_info(HA_CHECK *param, const char *fmt, ...) 
+ ATTRIBUTE_FORMAT(printf, 2, 3); +my_bool write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info); + +int _ma_flush_pending_blocks(MARIA_SORT_PARAM *param); +int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param); +int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param); +pthread_handler_t _ma_thr_find_all_keys(void *arg); + +int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param); +int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages, + size_t); +int _ma_sync_table_files(const MARIA_HA *info); +int _ma_initialize_data_file(MARIA_SHARE *share, File dfile); +int _ma_update_state_lsns(MARIA_SHARE *share, + LSN lsn, TrID create_trid, my_bool do_sync, + my_bool update_create_rename_lsn); +int _ma_update_state_lsns_sub(MARIA_SHARE *share, LSN lsn, + TrID create_trid, my_bool do_sync, + my_bool update_create_rename_lsn); +void _ma_set_data_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share); +void _ma_set_index_pagecache_callbacks(PAGECACHE_FILE *file, + MARIA_SHARE *share); +void _ma_tmp_disable_logging_for_table(MARIA_HA *info, + my_bool log_incomplete); +my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages); +my_bool write_log_record_for_bulk_insert(MARIA_HA *info); +void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn); + +#define MARIA_NO_CRC_NORMAL_PAGE 0xffffffff +#define MARIA_NO_CRC_BITMAP_PAGE 0xfffffffe +extern my_bool maria_page_crc_set_index(PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_page_crc_set_normal(PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_page_crc_check_bitmap(int, PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_page_crc_check_data(int, PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_page_crc_check_index(int, PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_page_crc_check_none(int, PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_page_crc_check(uchar *page, pgcache_page_no_t page_no, + MARIA_SHARE *share, uint32 no_crc_val, + int data_length); 
+extern my_bool maria_page_filler_set_bitmap(PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_page_filler_set_normal(PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_page_filler_set_none(PAGECACHE_IO_HOOK_ARGS *args); +extern void maria_page_write_failure(int error, PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_flush_log_for_page(PAGECACHE_IO_HOOK_ARGS *args); +extern my_bool maria_flush_log_for_page_none(PAGECACHE_IO_HOOK_ARGS *args); + +extern PAGECACHE *maria_log_pagecache; +extern void ma_set_index_cond_func(MARIA_HA *info, index_cond_func_t func, + void *func_arg); +check_result_t ma_check_index_cond(MARIA_HA *info, uint keynr, uchar *record); + +extern my_bool ma_yield_and_check_if_killed(MARIA_HA *info, int inx); +extern my_bool ma_killed_standalone(MARIA_HA *); + +extern uint _ma_file_callback_to_id(void *callback_data); +extern void free_maria_share(MARIA_SHARE *share); +/* Call _ma_unmap_file() when info->s->file_map is set; does nothing when HAVE_MMAP is not defined */ +static inline void unmap_file(MARIA_HA *info __attribute__((unused))) +{ +#ifdef HAVE_MMAP + if (info->s->file_map) + _ma_unmap_file(info); +#endif +} +/* Decrement share->in_trans. When it reaches zero the share is handed to free_maria_share(); otherwise share->intern_lock is unlocked. NOTE(review): presumably the caller holds share->intern_lock on entry and free_maria_share() takes care of the mutex in the zero case - confirm */ +static inline void decrement_share_in_trans(MARIA_SHARE *share) +{ + /* Internal tables don't have transactions */ + DBUG_ASSERT(!share->internal_table); + if (!--share->in_trans) + free_maria_share(share); + else + mysql_mutex_unlock(&share->intern_lock); +} +C_MODE_END +#endif + +#define CRASH_IF_S3_TABLE(share) DBUG_ASSERT(!share->no_status_updates) diff --git a/storage/maria/s3.cnf b/storage/maria/s3.cnf new file mode 100644 index 00000000..345bddd1 --- /dev/null +++ b/storage/maria/s3.cnf @@ -0,0 +1,22 @@ +[mariadbd] +# +# Uncomment line to enable +# +#plugin-maturity = alpha + +[mariadb] +# +# Uncomment line to enable +# +#plugin-load-add = ha_s3 + +# +# Uncomment to configure the S3 engine +# See all options at https://mariadb.com/kb/en/s3-storage-engine/ +# +#s3-host-name = s3.amazonaws.com +#s3-protocol-version = Amazon +#s3-bucket = ... +#s3-access-key = ... +#s3-secret-key = ... 
+#s3-region = eu-north-1 diff --git a/storage/maria/s3_func.c b/storage/maria/s3_func.c new file mode 100644 index 00000000..3d18ba88 --- /dev/null +++ b/storage/maria/s3_func.c @@ -0,0 +1,1625 @@ +/* Copyright (C) 2019 MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ + +/* + Interface function used by S3 storage engine and aria_copy_for_s3 +*/ + +#include "maria_def.h" +#include "s3_func.h" +#include <aria_backup.h> +#include <mysqld_error.h> +#include <sql_const.h> +#include <mysys_err.h> +#include <mysql_com.h> +#include <zlib.h> + +/* number of '.' 
to print during a copy in verbose mode */ +#define DISPLAY_WITH 79 + +static void convert_index_to_s3_format(uchar *header, ulong block_size, + int compression); +static void convert_index_to_disk_format(uchar *header); +static void convert_frm_to_s3_format(uchar *header); +static void convert_frm_to_disk_format(uchar *header); +static int s3_read_file_from_disk(const char *filename, uchar **to, + size_t *to_size, my_bool print_error); + +/* Used by ha_s3.cc and tools to define different protocol options */ + +static const char *protocol_types[]= {"Auto", "Original", "Amazon", NullS}; +TYPELIB s3_protocol_typelib= {array_elements(protocol_types)-1,"", + protocol_types, NULL}; + +/****************************************************************************** + Allocations handler for libmarias3 + To be removed when we do the init allocation in mysqld.cc +******************************************************************************/ + +static void *s3_wrap_malloc(size_t size) +{ + return my_malloc(PSI_NOT_INSTRUMENTED, size, MYF(MY_WME)); +} + +static void *s3_wrap_calloc(size_t nmemb, size_t size) +{ + return my_malloc(PSI_NOT_INSTRUMENTED, nmemb * size, + MYF(MY_WME | MY_ZEROFILL)); +} + +static void *s3_wrap_realloc(void *ptr, size_t size) +{ + return my_realloc(PSI_NOT_INSTRUMENTED, ptr, size, + MYF(MY_WME | MY_ALLOW_ZERO_PTR)); +} + +static char *s3_wrap_strdup(const char *str) +{ + return my_strdup(PSI_NOT_INSTRUMENTED, str, MYF(MY_WME)); +} + +static void s3_wrap_free(void *ptr) +{ + if (ptr) /* Avoid tracing of null */ + my_free(ptr); +} + +void s3_init_library() +{ + ms3_library_init_malloc(s3_wrap_malloc, s3_wrap_free, s3_wrap_realloc, + s3_wrap_strdup, s3_wrap_calloc); +} + +void s3_deinit_library() +{ + ms3_library_deinit(); +} + +/****************************************************************************** + Functions on S3_INFO and S3_BLOCK +******************************************************************************/ + +/* + Free memory allocated 
by s3_get_object +*/ + +void s3_free(S3_BLOCK *data) +{ + my_free(data->alloc_ptr); + data->alloc_ptr= 0; +} + + +/* + Copy a S3_INFO structure +*/ + +S3_INFO *s3_info_copy(S3_INFO *old) +{ + S3_INFO *to, tmp; + + /* Copy lengths */ + memcpy(&tmp, old, sizeof(tmp)); + /* Allocate new buffers */ + if (!my_multi_malloc(PSI_NOT_INSTRUMENTED, MY_WME, &to, sizeof(S3_INFO), + &tmp.access_key.str, old->access_key.length+1, + &tmp.secret_key.str, old->secret_key.length+1, + &tmp.region.str, old->region.length+1, + &tmp.bucket.str, old->bucket.length+1, + &tmp.database.str, old->database.length+1, + &tmp.table.str, old->table.length+1, + &tmp.base_table.str, old->base_table.length+1, + NullS)) + return 0; + /* Copy lengths and new pointers to to */ + memcpy(to, &tmp, sizeof(tmp)); + /* Copy data */ + strmov((char*) to->access_key.str, old->access_key.str); + strmov((char*) to->secret_key.str, old->secret_key.str); + strmov((char*) to->region.str, old->region.str); + strmov((char*) to->bucket.str, old->bucket.str); + /* Database may not be null terminated */ + strmake((char*) to->database.str, old->database.str, old->database.length); + strmov((char*) to->table.str, old->table.str); + strmov((char*) to->base_table.str, old->base_table.str); + return to; +} + +/** + Open a connection to s3 +*/ + +ms3_st *s3_open_connection(S3_INFO *s3) +{ + ms3_st *s3_client; + if (!(s3_client= ms3_init(s3->access_key.str, + s3->secret_key.str, + s3->region.str, + s3->host_name.str))) + { + my_printf_error(HA_ERR_NO_SUCH_TABLE, + "Can't open connection to S3, error: %d %s", MYF(0), + errno, ms3_error(errno)); + my_errno= HA_ERR_NO_SUCH_TABLE; + } + if (s3->protocol_version) + ms3_set_option(s3_client, MS3_OPT_FORCE_PROTOCOL_VERSION, + &s3->protocol_version); + if (s3->port) + ms3_set_option(s3_client, MS3_OPT_PORT_NUMBER, &s3->port); + + if (s3->use_http) + ms3_set_option(s3_client, MS3_OPT_USE_HTTP, NULL); + + return s3_client; +} + +/** + close a connection to s3 +*/ + +void 
s3_deinit(ms3_st *s3_client) +{ + DBUG_PUSH(""); /* Avoid tracing free calls */ + ms3_deinit(s3_client); + DBUG_POP(); +} + + +/****************************************************************************** + High level functions to copy tables to and from S3 +******************************************************************************/ + +/** + Create suffix for object name + @param to_end end of suffix (from previous call or 000000 at start) + + The suffix is a 6 length '0' prefixed number. If the number + gets longer than 6, then it's extended to 7 and more digits. +*/ + +static void fix_suffix(char *to_end, ulong nr) +{ + char buff[11]; + uint length= (uint) (int10_to_str(nr, buff, 10) - buff); + set_if_smaller(length, 6); + strmov(to_end - length, buff); +} + +/** + Copy file to 'aws_path' in blocks of block_size + + @return 0 ok + @return 1 error. Error message is printed to stderr + + Notes: + file is always closed before return +*/ + +static my_bool copy_from_file(ms3_st *s3_client, const char *aws_bucket, + char *aws_path, + File file, my_off_t start, my_off_t file_end, + uchar *block, size_t block_size, + my_bool compression, my_bool display) +{ + my_off_t pos; + char *path_end= strend(aws_path); + ulong bnr; + my_bool print_done= 0; + size_t length; + + for (pos= start, bnr=1 ; pos < file_end ; pos+= length, bnr++) + { + if ((length= my_pread(file, block, block_size, pos, MYF(MY_WME))) == + MY_FILE_ERROR) + goto err; + if (length == 0) + { + my_error(EE_EOFERR, MYF(0), my_filename(file), my_errno); + goto err; + } + + fix_suffix(path_end, bnr); + if (s3_put_object(s3_client, aws_bucket, aws_path, block, length, + compression)) + goto err; + + /* Write up to DISPLAY_WITH number of '.' 
during copy */ + if (display && + ((pos + block_size) * DISPLAY_WITH / file_end) > + (pos * DISPLAY_WITH/file_end)) + { + fputc('.', stdout); fflush(stdout); + print_done= 1; + } + } + if (print_done) + { + fputc('\n', stdout); fflush(stdout); + } + my_close(file, MYF(MY_WME)); + return 0; + +err: + my_close(file, MYF(MY_WME)); + if (print_done) + { + fputc('\n', stdout); fflush(stdout); + } + return 1; +} + + +/** + Copy an Aria table to S3 + @param s3_client connection to S3 + @param aws_bucket Aws bucket + @param path Path for Aria table (can be temp table) + @param database database name + @param table_name table name + @param block_size Block size in s3. If 0 then use block size + and compression as specified in the .MAI file as + specified as part of open. + @param compression Compression algorithm (0 = none, 1 = zip) + If block size is 0 then use .MAI file. + @return 0 ok + @return 1 error + + The table will be copied in S3 into the following locations: + + frm file (for discovery): + aws_bucket/database/table/frm + + First index block (contains description if the Aria file): + aws_bucket/database/table/aria + + Rest of the index file: + aws_bucket/database/table/index/block_number + + Data file: + aws_bucket/database/table/data/block_number + + block_number is 6 digits decimal number, prefixed with 0 + (Can be larger than 6 numbers, the prefix is just for nice output) + + frm and base blocks are small (just the needed data). + index and blocks are of size 's3_block_size' + + If compression is used, then original block size is s3_block_size + but the stored block will be the size of the compressed block. 
+*/ + +int aria_copy_to_s3(ms3_st *s3_client, const char *aws_bucket, + const char *path, + const char *database, const char *table_name, + ulong block_size, my_bool compression, + my_bool force, my_bool display, my_bool copy_frm) +{ + ARIA_TABLE_CAPABILITIES cap; + char aws_path[FN_REFLEN+100]; + char filename[FN_REFLEN]; + char *aws_path_end, *end; + uchar *alloc_block= 0, *block; + ms3_status_st status; + File file= -1; + my_off_t file_size; + size_t frm_length; + int error; + my_bool frm_created= 0; + DBUG_ENTER("aria_copy_to_s3"); + DBUG_PRINT("enter",("from: %s database: %s table: %s", + path, database, table_name)); + + aws_path_end= strxmov(aws_path, database, "/", table_name, NullS); + strmov(aws_path_end, "/aria"); + + if (!ms3_status(s3_client, aws_bucket, aws_path, &status)) + { + if (!force) + { + my_printf_error(EE_CANTCREATEFILE, "File %s exists in s3", MYF(0), + aws_path); + DBUG_RETURN(EE_CANTCREATEFILE); + } + if ((error= aria_delete_from_s3(s3_client, aws_bucket, database, + table_name, display))) + DBUG_RETURN(error); + } + + if (copy_frm) + { + /* + Copy frm file if it exists + We do this first to ensure that .frm always exists. This is needed to + ensure that discovery of the table will work. + */ + fn_format(filename, path, "", ".frm", MY_REPLACE_EXT); + if (!s3_read_file_from_disk(filename, &alloc_block, &frm_length,0)) + { + if (display) + printf("Copying frm file %s\n", filename); + + strmov(aws_path_end,"/frm"); + convert_frm_to_s3_format(alloc_block); + + /* Note that frm is not compressed! 
*/ + if (s3_put_object(s3_client, aws_bucket, aws_path, alloc_block, frm_length, + 0)) + goto err; + + frm_created= 1; + my_free(alloc_block); + alloc_block= 0; + } + } + + if (display) + printf("Copying aria table: %s.%s to s3\n", database, table_name); + + /* Index file name */ + fn_format(filename, path, "", ".MAI", MY_REPLACE_EXT); + if ((file= my_open(filename, + O_RDONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC, + MYF(MY_WME))) < 0) + DBUG_RETURN(1); + if ((error= aria_get_capabilities(file, &cap))) + { + fprintf(stderr, "Got error %d when reading Aria header from %s\n", + error, path); + goto err; + } + if (cap.transactional || cap.data_file_type != BLOCK_RECORD || + cap.encrypted) + { + fprintf(stderr, + "Aria table %s doesn't match criteria to be copied to S3.\n" + "It should be non-transactional and should have row_format page\n", + path); + goto err; + } + /* + If block size is not specified, use the values specified as part of + create + */ + if (block_size == 0) + { + block_size= cap.s3_block_size; + compression= cap.compression; + } + + /* Align S3_BLOCK size with table block size */ + block_size= (block_size/cap.block_size)*cap.block_size; + + /* Allocate block for data + flag for compress header */ + if (!(alloc_block= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, + block_size+ALIGN_SIZE(1), + MYF(MY_WME)))) + goto err; + /* Read/write data here, but with prefix space for compression flag */ + block= alloc_block+ ALIGN_SIZE(1); + + if (my_pread(file, block, cap.header_size, 0, MYF(MY_WME | MY_FNABP))) + goto err; + + strmov(aws_path_end, "/aria"); + + if (display) + printf("Creating aria table information %s\n", aws_path); + + convert_index_to_s3_format(block, block_size, compression); + + /* + The first page is not compressed as we need it to know if the rest is + compressed + */ + if (s3_put_object(s3_client, aws_bucket, aws_path, block, cap.header_size, + 0 /* no compression */ )) + goto err; + + file_size= my_seek(file, 0L, MY_SEEK_END, MYF(0)); + + end= 
strmov(aws_path_end,"/index"); + + if (display) + printf("Copying index information %s\n", aws_path); + + /* The 000000 will be update with block number by fix_suffix() */ + end= strmov(end, "/000000"); + + error= copy_from_file(s3_client, aws_bucket, aws_path, file, cap.header_size, + file_size, block, block_size, compression, display); + file= -1; + if (error) + goto err; + + /* Copy data file */ + fn_format(filename, path, "", ".MAD", MY_REPLACE_EXT); + if ((file= my_open(filename, + O_RDONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC, + MYF(MY_WME))) < 0) + DBUG_RETURN(1); + + file_size= my_seek(file, 0L, MY_SEEK_END, MYF(0)); + + end= strmov(aws_path_end, "/data"); + + if (display) + printf("Copying data information %s\n", aws_path); + + /* The 000000 will be update with block number by fix_suffix() */ + end= strmov(end, "/000000"); + + error= copy_from_file(s3_client, aws_bucket, aws_path, file, 0, file_size, + block, block_size, compression, display); + file= -1; + if (error) + goto err; + + my_free(alloc_block); + DBUG_RETURN(0); + +err: + if (frm_created) + { + end= strmov(aws_path_end,"/frm"); + (void) s3_delete_object(s3_client, aws_bucket, aws_path, MYF(ME_NOTE)); + } + if (file >= 0) + my_close(file, MYF(0)); + my_free(alloc_block); + DBUG_RETURN(1); +} + + +/** + Copy file to 'aws_path' in blocks of block_size + + @return 0 ok + @return 1 error. 
Error message is printed to stderr + + Notes: + file is always closed before return +*/ + +static my_bool copy_to_file(ms3_st *s3_client, const char *aws_bucket, + char *aws_path, File file, my_off_t start, + my_off_t file_end, my_bool compression, + my_bool display) +{ + my_off_t pos; + char *path_end= strend(aws_path); + size_t error; + ulong bnr; + my_bool print_done= 0; + S3_BLOCK block; + DBUG_ENTER("copy_to_file"); + DBUG_PRINT("enter", ("path: %s start: %llu end: %llu", + aws_path, (ulonglong) start, (ulonglong) file_end)); + + for (pos= start, bnr=1 ; pos < file_end ; pos+= block.length, bnr++) + { + fix_suffix(path_end, bnr); + if (s3_get_object(s3_client, aws_bucket, aws_path, &block, compression, 1)) + goto err; + + error= my_write(file, block.str, block.length, MYF(MY_WME | MY_FNABP)); + s3_free(&block); + if (error == MY_FILE_ERROR) + goto err; + + /* Write up to DISPLAY_WITH number of '.' during copy */ + if (display && + ((pos + block.length) * DISPLAY_WITH /file_end) > + (pos * DISPLAY_WITH/file_end)) + { + fputc('.', stdout); fflush(stdout); + print_done= 1; + } + } + if (print_done) + { + fputc('\n', stdout); fflush(stdout); + } + my_close(file, MYF(MY_WME)); + DBUG_RETURN(0); + +err: + my_close(file, MYF(MY_WME)); + if (print_done) + { + fputc('\n', stdout); fflush(stdout); + } + DBUG_RETURN(1); +} + + +/** + Copy a table from S3 to current directory +*/ + +int aria_copy_from_s3(ms3_st *s3_client, const char *aws_bucket, + const char *path, const char *database, + my_bool compression, my_bool force, my_bool display) + +{ + MARIA_STATE_INFO state; + MY_STAT stat_info; + char table_name[FN_REFLEN], aws_path[FN_REFLEN+100]; + char filename[FN_REFLEN]; + char *aws_path_end, *end; + File file= -1; + S3_BLOCK block; + my_off_t index_file_size, data_file_size; + uint offset; + int error; + DBUG_ENTER("aria_copy_from_s3"); + + /* Check if index file exists */ + fn_format(filename, path, "", ".MAI", MY_REPLACE_EXT); + if (!force && my_stat(filename, 
&stat_info, MYF(0))) + { + my_printf_error(EE_CANTCREATEFILE, "Table %s already exists on disk", + MYF(0), filename); + DBUG_RETURN(EE_CANTCREATEFILE); + } + + fn_format(table_name, path, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT); + block.str= 0; + + aws_path_end= strxmov(aws_path, database, "/", table_name, NullS); + strmov(aws_path_end, "/aria"); + + if (s3_get_object(s3_client, aws_bucket, aws_path, &block, 0, 0)) + { + my_printf_error(EE_FILENOTFOUND, "File %s/%s doesn't exist in s3", MYF(0), + database,filename); + goto err; + } + if (block.length < MARIA_STATE_INFO_SIZE) + { + fprintf(stderr, "Wrong block length for first block: %lu\n", + (ulong) block.length); + goto err_with_free; + } + + if (display) + printf("Copying aria table: %s.%s from s3\n", database, table_name); + + /* For offset positions, check _ma_state_info_readlength() */ + offset= sizeof(state.header) + 4+ LSN_STORE_SIZE*3 + 8*5; + index_file_size= mi_sizekorr(block.str + offset); + data_file_size= mi_sizekorr(block.str + offset+8); + + if ((file= my_create(filename, 0, + O_WRONLY | O_TRUNC | O_NOFOLLOW, MYF(MY_WME))) < 0) + goto err_with_free; + + convert_index_to_disk_format(block.str); + + if (my_write(file, block.str, block.length, MYF(MY_WME | MY_FNABP))) + goto err_with_free; + + if (display) + printf("Copying index information %s\n", aws_path); + + end= strmov(aws_path_end,"/index/000000"); + + error= copy_to_file(s3_client, aws_bucket, aws_path, file, block.length, + index_file_size, compression, display); + file= -1; + if (error) + goto err_with_free; + + /* Copy data file */ + fn_format(filename, path, "", ".MAD", MY_REPLACE_EXT); + if ((file= my_create(filename, 0, + O_WRONLY | O_TRUNC | O_NOFOLLOW, MYF(MY_WME))) < 0) + DBUG_RETURN(1); + + end= strmov(aws_path_end, "/data"); + + if (display) + printf("Copying data information %s\n", aws_path); + + /* The 000000 will be update with block number by fix_suffix() */ + strmov(end, "/000000"); + + error= copy_to_file(s3_client, 
aws_bucket, aws_path, file, 0, data_file_size, + compression, display); + file= -1; + s3_free(&block); + block.str= 0; + if (error) + goto err; + + /* Copy frm file if it exists */ + strmov(aws_path_end, "/frm"); + if (!s3_get_object(s3_client, aws_bucket, aws_path, &block, 0, 0)) + { + fn_format(filename, path, "", ".frm", MY_REPLACE_EXT); + if ((file= my_create(filename, 0, + O_WRONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC, + MYF(0))) >= 0) + { + if (display) + printf("Copying frm file %s\n", filename); + + convert_frm_to_disk_format(block.str); + + if (my_write(file, block.str, block.length, MYF(MY_WME | MY_FNABP))) + goto err_with_free; + } + s3_free(&block); + my_close(file, MYF(MY_WME)); + file= -1; + } + + DBUG_RETURN(0); + +err_with_free: + s3_free(&block); +err: + if (file >= 0) + my_close(file, MYF(0)); + DBUG_RETURN(1); +} + + +/** + Drop all files related to a table from S3 +*/ + +int aria_delete_from_s3(ms3_st *s3_client, const char *aws_bucket, + const char *database, const char *table, + my_bool display) +{ + ms3_status_st status; + char aws_path[FN_REFLEN+100]; + char *aws_path_end; + int error; + DBUG_ENTER("aria_delete_from_s3"); + + aws_path_end= strxmov(aws_path, database, "/", table, NullS); + strmov(aws_path_end, "/aria"); + + /* Check if either /aria or /frm exists */ + + if (ms3_status(s3_client, aws_bucket, aws_path, &status)) + { + strmov(aws_path_end, "/frm"); + if (ms3_status(s3_client, aws_bucket, aws_path, &status)) + { + my_printf_error(HA_ERR_NO_SUCH_TABLE, + "Table %s.%s doesn't exist in s3", MYF(0), + database, table); + my_errno= HA_ERR_NO_SUCH_TABLE; + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + } + + if (display) + printf("Delete of aria table: %s.%s\n", database, table); + + strmov(aws_path_end,"/index"); + + if (display) + printf("Delete of index information %s\n", aws_path); + + error= s3_delete_directory(s3_client, aws_bucket, aws_path); + + strmov(aws_path_end,"/data"); + if (display) + printf("Delete of data information %s\n", 
aws_path); + + error|= s3_delete_directory(s3_client, aws_bucket, aws_path); + + if (display) + printf("Delete of base information and frm\n"); + + strmov(aws_path_end,"/aria"); + if (s3_delete_object(s3_client, aws_bucket, aws_path, MYF(MY_WME))) + error= 1; + + /* + Delete .frm last as this is used by discovery to check if a s3 table + exists + */ + strmov(aws_path_end,"/frm"); + /* Ignore error if .frm file doesn't exist */ + s3_delete_object(s3_client, aws_bucket, aws_path, MYF(ME_NOTE)); + + DBUG_RETURN(error); +} + + +/** + Rename a table in s3 +*/ + +int aria_rename_s3(ms3_st *s3_client, const char *aws_bucket, + const char *from_database, const char *from_table, + const char *to_database, const char *to_table, + my_bool rename_frm) +{ + ms3_status_st status; + char to_aws_path[FN_REFLEN+100], from_aws_path[FN_REFLEN+100]; + char *to_aws_path_end, *from_aws_path_end; + int error; + DBUG_ENTER("aria_rename_s3"); + + from_aws_path_end= strxmov(from_aws_path, from_database, "/", from_table, + NullS); + to_aws_path_end= strxmov(to_aws_path, to_database, "/", to_table, NullS); + strmov(from_aws_path_end, "/aria"); + + if (ms3_status(s3_client, aws_bucket, from_aws_path, &status)) + { + my_printf_error(HA_ERR_NO_SUCH_TABLE, + "Table %s.%s doesn't exist in s3", MYF(0), from_database, + from_table); + my_errno= HA_ERR_NO_SUCH_TABLE; + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + + strmov(from_aws_path_end,"/index"); + strmov(to_aws_path_end,"/index"); + + error= s3_rename_directory(s3_client, aws_bucket, from_aws_path, to_aws_path, + MYF(MY_WME)); + + strmov(from_aws_path_end,"/data"); + strmov(to_aws_path_end,"/data"); + + error|= s3_rename_directory(s3_client, aws_bucket, from_aws_path, + to_aws_path, MYF(MY_WME)); + + if (rename_frm) { + strmov(from_aws_path_end, "/frm"); + strmov(to_aws_path_end, "/frm"); + + s3_rename_object(s3_client, aws_bucket, from_aws_path, to_aws_path, + MYF(MY_WME)); + } + + strmov(from_aws_path_end,"/aria"); + 
strmov(to_aws_path_end,"/aria"); + if (s3_rename_object(s3_client, aws_bucket, from_aws_path, to_aws_path, + MYF(MY_WME))) + error= 1; + DBUG_RETURN(error); +} + +/** + Copy all partition files related to a table from S3 (.frm and .par) + + @param s3_client s3 client connection + @param aws_bucket bucket to use + @param path The path to the partitioned table files (no extension) + @param old_path In some cases the partioned files are not yet renamed. + This points to the temporary files that will later + be renamed to the partioned table + @param database Database for the partitioned table + @param database table name for the partitioned table +*/ + +int partition_copy_to_s3(ms3_st *s3_client, const char *aws_bucket, + const char *path, const char *old_path, + const char *database, const char *table_name) +{ + char aws_path[FN_REFLEN+100]; + char filename[FN_REFLEN]; + char *aws_path_end; + uchar *alloc_block= 0; + ms3_status_st status; + size_t frm_length; + int error; + DBUG_ENTER("partition_copy_to_s3"); + DBUG_PRINT("enter",("from: %s database: %s table: %s", + path, database, table_name)); + + if (!old_path) + old_path= path; + + aws_path_end= strxmov(aws_path, database, "/", table_name, "/", NullS); + strmov(aws_path_end, "frm"); + fn_format(filename, old_path, "", ".frm", MY_REPLACE_EXT); + + /* Just to be safe, delete any conflicting object */ + if (!ms3_status(s3_client, aws_bucket, aws_path, &status)) + { + if ((error= s3_delete_object(s3_client, aws_bucket, aws_path, + MYF(ME_FATAL)))) + DBUG_RETURN(error); + } + if ((error= s3_read_file_from_disk(filename, &alloc_block, &frm_length, 0))) + { + /* + In case of ADD PARTITION PARTITON the .frm file is already renamed. + Copy the renamed file if it exists. 
+ */ + fn_format(filename, path, "", ".frm", MY_REPLACE_EXT); + if ((error= s3_read_file_from_disk(filename, &alloc_block, &frm_length, + 1))) + goto err; + } + if ((error= s3_put_object(s3_client, aws_bucket, aws_path, alloc_block, + frm_length, 0))) + goto err; + + /* + Note that because ha_partiton::rename_table() is called before + this function, the .par table already has it's final name! + */ + fn_format(filename, path, "", ".par", MY_REPLACE_EXT); + strmov(aws_path_end, "par"); + if (!ms3_status(s3_client, aws_bucket, aws_path, &status)) + { + if ((error= s3_delete_object(s3_client, aws_bucket, aws_path, + MYF(ME_FATAL)))) + goto err; + } + + my_free(alloc_block); + alloc_block= 0; + if ((error=s3_read_file_from_disk(filename, &alloc_block, &frm_length, 1))) + goto err; + if ((error= s3_put_object(s3_client, aws_bucket, aws_path, alloc_block, + frm_length, 0))) + { + /* Delete the .frm file created above */ + strmov(aws_path_end, "frm"); + (void) s3_delete_object(s3_client, aws_bucket, aws_path, + MYF(ME_FATAL)); + goto err; + } + error= 0; + +err: + my_free(alloc_block); + DBUG_RETURN(error); +} + + +/** + Drop all partition files related to a table from S3 +*/ + +int partition_delete_from_s3(ms3_st *s3_client, const char *aws_bucket, + const char *database, const char *table, + myf error_flags) +{ + char aws_path[FN_REFLEN+100]; + char *aws_path_end; + int error=0, res; + DBUG_ENTER("partition_delete_from_s3"); + + aws_path_end= strxmov(aws_path, database, "/", table, NullS); + strmov(aws_path_end, "/par"); + + if ((res= s3_delete_object(s3_client, aws_bucket, aws_path, error_flags))) + error= res; + /* + Delete .frm last as this is used by discovery to check if a s3 table + exists + */ + strmov(aws_path_end, "/frm"); + if ((res= s3_delete_object(s3_client, aws_bucket, aws_path, error_flags))) + error= res; + + DBUG_RETURN(error); +} + +/****************************************************************************** + Low level functions interfacing with 
libmarias3 +******************************************************************************/ + +/** + Create an object for index or data information + + Note that if compression is used, the data may be overwritten and + there must be COMPRESS_HEADER length of free space before the data! + +*/ + +int s3_put_object(ms3_st *s3_client, const char *aws_bucket, + const char *name, uchar *data, size_t length, + my_bool compression) +{ + uint8_t error; + const char *errmsg; + DBUG_ENTER("s3_put_object"); + DBUG_PRINT("enter", ("name: %s", name)); + + if (compression) + { + size_t comp_len; + + data[-COMPRESS_HEADER]= 0; // No compression + if (!my_compress(data, &length, &comp_len)) + data[-COMPRESS_HEADER]= 1; // Compressed package + data-= COMPRESS_HEADER; + length+= COMPRESS_HEADER; + int3store(data+1, comp_len); // Original length or 0 + } + + if (likely(!(error= ms3_put(s3_client, aws_bucket, name, data, length)))) + DBUG_RETURN(0); + + if (!(errmsg= ms3_server_error(s3_client))) + errmsg= ms3_error(error); + + my_printf_error(EE_WRITE, "Got error from put_object(%s): %d %s", MYF(0), + name, error, errmsg); + DBUG_RETURN(EE_WRITE); +} + + +/** + Read an object for index or data information + + @param print_error 0 Don't print error + @param print_error 1 Print error that object doesn't exists + @param print_error 2 Print error that table doesn't exists +*/ + +int s3_get_object(ms3_st *s3_client, const char *aws_bucket, + const char *name, S3_BLOCK *block, + my_bool compression, int print_error) +{ + uint8_t error; + int result= 0; + uchar *data; + DBUG_ENTER("s3_get_object"); + DBUG_PRINT("enter", ("name: %s compression: %d", name, compression)); + + block->str= block->alloc_ptr= 0; + if (likely(!(error= ms3_get(s3_client, aws_bucket, name, + (uint8_t**) &block->alloc_ptr, + &block->length)))) + { + block->str= block->alloc_ptr; + if (compression) + { + ulong length; + + /* If not compressed */ + if (!block->str[0]) + { + block->length-= COMPRESS_HEADER; + 
block->str+= COMPRESS_HEADER; + + /* Simple check to ensure that it's a correct block */ + if (block->length % 1024) + { + s3_free(block); + my_printf_error(HA_ERR_NOT_A_TABLE, + "Block '%s' is not compressed", MYF(0), name); + DBUG_RETURN(HA_ERR_NOT_A_TABLE); + } + DBUG_RETURN(0); + } + + if (((uchar*)block->str)[0] > 1) + { + s3_free(block); + my_printf_error(HA_ERR_NOT_A_TABLE, + "Block '%s' is not compressed", MYF(0), name); + DBUG_RETURN(HA_ERR_NOT_A_TABLE); + } + + length= uint3korr(block->str+1); + + if (!(data= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, + length, MYF(MY_WME | MY_THREAD_SPECIFIC)))) + { + s3_free(block); + DBUG_RETURN(EE_OUTOFMEMORY); + } + if (uncompress(data, &length, block->str + COMPRESS_HEADER, + block->length - COMPRESS_HEADER)) + { + my_printf_error(ER_NET_UNCOMPRESS_ERROR, + "Got error uncompressing s3 packet", MYF(0)); + s3_free(block); + my_free(data); + DBUG_RETURN(ER_NET_UNCOMPRESS_ERROR); + } + s3_free(block); + block->str= block->alloc_ptr= data; + block->length= length; + } + DBUG_RETURN(0); + } + + if (error == 9) + { + result= my_errno= (print_error == 1 ? 
EE_FILENOTFOUND : + HA_ERR_NO_SUCH_TABLE); + if (print_error) + my_printf_error(my_errno, "Expected object '%s' didn't exist", + MYF(0), name); + } + else + { + result= my_errno= EE_READ; + if (print_error) + { + const char *errmsg; + if (!(errmsg= ms3_server_error(s3_client))) + errmsg= ms3_error(error); + + my_printf_error(EE_READ, "Got error from get_object(%s): %d %s", MYF(0), + name, error, errmsg); + } + } + s3_free(block); + DBUG_RETURN(result); +} + + +int s3_delete_object(ms3_st *s3_client, const char *aws_bucket, + const char *name, myf error_flags) +{ + uint8_t error; + int result= 0; + DBUG_ENTER("s3_delete_object"); + DBUG_PRINT("enter", ("name: %s", name)); + + if (likely(!(error= ms3_delete(s3_client, aws_bucket, name)))) + DBUG_RETURN(0); + + if (error_flags) + { + error_flags&= ~MY_WME; + if (error == 9) + my_printf_error(result= EE_FILENOTFOUND, + "Expected object '%s' didn't exist", + error_flags, name); + else + { + const char *errmsg; + if (!(errmsg= ms3_server_error(s3_client))) + errmsg= ms3_error(error); + + my_printf_error(result= EE_READ, + "Got error from delete_object(%s): %d %s", + error_flags, name, error, errmsg); + } + } + DBUG_RETURN(result); +} + + +/* + Drop all files in a 'directory' in s3 +*/ + +int s3_delete_directory(ms3_st *s3_client, const char *aws_bucket, + const char *path) +{ + ms3_list_st *list, *org_list= 0; + my_bool error; + DBUG_ENTER("delete_directory"); + DBUG_PRINT("enter", ("path: %s", path)); + + if ((error= ms3_list(s3_client, aws_bucket, path, &org_list))) + { + const char *errmsg; + if (!(errmsg= ms3_server_error(s3_client))) + errmsg= ms3_error(error); + + my_printf_error(EE_FILENOTFOUND, + "Can't get list of files from %s. 
Error: %d %s", MYF(0), + path, error, errmsg); + DBUG_RETURN(EE_FILENOTFOUND); + } + + for (list= org_list ; list ; list= list->next) + if (s3_delete_object(s3_client, aws_bucket, list->key, MYF(MY_WME))) + error= 1; + if (org_list) + ms3_list_free(org_list); + DBUG_RETURN(error); +} + + +my_bool s3_rename_object(ms3_st *s3_client, const char *aws_bucket, + const char *from_name, const char *to_name, + myf error_flags) +{ + uint8_t error; + DBUG_ENTER("s3_rename_object"); + DBUG_PRINT("enter", ("from: %s to: %s", from_name, to_name)); + + if (likely(!(error= ms3_move(s3_client, + aws_bucket, from_name, + aws_bucket, to_name)))) + DBUG_RETURN(FALSE); + + if (error_flags) + { + error_flags&= ~MY_WME; + if (error == 9) + { + my_printf_error(EE_FILENOTFOUND, "Expected object '%s' didn't exist", + error_flags, from_name); + } + else + { + const char *errmsg; + if (!(errmsg= ms3_server_error(s3_client))) + errmsg= ms3_error(error); + + my_printf_error(EE_READ, "Got error from move_object(%s -> %s): %d %", + error_flags, + from_name, to_name, error, errmsg); + } + } + DBUG_RETURN(TRUE); +} + + +int s3_rename_directory(ms3_st *s3_client, const char *aws_bucket, + const char *from_name, const char *to_name, + myf error_flags) +{ + ms3_list_st *list, *org_list= 0; + my_bool error= 0; + char name[AWS_PATH_LENGTH], *end; + DBUG_ENTER("s3_delete_directory"); + + if ((error= ms3_list(s3_client, aws_bucket, from_name, &org_list))) + { + const char *errmsg; + if (!(errmsg= ms3_server_error(s3_client))) + errmsg= ms3_error(error); + + my_printf_error(EE_FILENOTFOUND, + "Can't get list of files from %s. 
Error: %d %s", + MYF(error_flags & ~MY_WME), + from_name, error, errmsg); + DBUG_RETURN(EE_FILENOTFOUND); + } + + end= strmov(name, to_name); + for (list= org_list ; list ; list= list->next) + { + const char *sep= strrchr(list->key, '/'); + if (sep) /* Safety */ + { + strmake(end, sep, (sizeof(name) - (end-name) - 1)); + if (s3_rename_object(s3_client, aws_bucket, list->key, name, + error_flags)) + error= 1; + } + } + if (org_list) + ms3_list_free(org_list); + DBUG_RETURN(error); +} + + +/****************************************************************************** + Converting index and frm files to from S3 storage engine +******************************************************************************/ + +/** + Change index information to be of type s3 + + @param header Copy of header in index file + @param block_size S3 block size + @param compression Compression algorithm to use + + The position are from _ma_base_info_write() +*/ + +static void convert_index_to_s3_format(uchar *header, ulong block_size, + int compression) +{ + MARIA_STATE_INFO state; + uchar *base_pos; + uint base_offset; + + memcpy(&state.header, header, sizeof(state.header)); + base_offset= mi_uint2korr(state.header.base_pos); + base_pos= header + base_offset; + + base_pos[107]= (uchar) compression; + mi_int3store(base_pos+119, block_size); +} + + +/** + Change index information to be a normal disk based table +*/ + +static void convert_index_to_disk_format(uchar *header) +{ + MARIA_STATE_INFO state; + uchar *base_pos; + uint base_offset; + + memcpy(&state.header, header, sizeof(state.header)); + base_offset= mi_uint2korr(state.header.base_pos); + base_pos= header + base_offset; + + base_pos[107]= 0; + mi_int3store(base_pos+119, 0); +} + +/** + Change storage engine in the .frm file from Aria to s3 + + For information about engine types, see legacy_db_type +*/ + +static void convert_frm_to_s3_format(uchar *header) +{ + DBUG_ASSERT(header[3] == 42 || header[3] == 41); /* Aria or S3 */ + 
header[3]= 41; /* S3 */ +} + +/** + Change storage engine in the .frm file from S3 to Aria + + For information about engine types, see legacy_db_type +*/ + +static void convert_frm_to_disk_format(uchar *header) +{ + DBUG_ASSERT(header[3] == 41); /* S3 */ + header[3]= 42; /* Aria */ +} + + +/****************************************************************************** + Helper functions +******************************************************************************/ + +/** + Set database and table name from path + + s3->database and s3->table_name will be pointed into path + Note that s3->database will not be null terminated! +*/ + +my_bool set_database_and_table_from_path(S3_INFO *s3, const char *path) +{ + size_t org_length= dirname_length(path); + size_t length= 0; + + if (!org_length) + return 1; + + s3->table.str= path+org_length; + s3->table.length= strlen(s3->table.str); + for (length= --org_length; length > 0 ; length --) + { + if (path[length-1] == FN_LIBCHAR || path[length-1] == '/') + break; +#ifdef FN_DEVCHAR + if (path[length-1] == FN_DEVCHAR) + break; +#endif + } + if (length && + (path[length] != FN_CURLIB || org_length - length != 1)) + { + s3->database.str= path + length; + s3->database.length= org_length - length; + return 0; + } + return 1; /* Can't find database */ +} + + +/** + Read frm from the disk +*/ + +static int s3_read_file_from_disk(const char *filename, uchar **to, + size_t *to_size, my_bool print_error) +{ + File file; + uchar *alloc_block; + size_t file_size; + int error; + + *to= 0; + if ((file= my_open(filename, + O_RDONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC, + MYF(print_error ? 
MY_WME: 0))) < 0) + return(my_errno); + + file_size= (size_t) my_seek(file, 0L, MY_SEEK_END, MYF(0)); + if (!(alloc_block= my_malloc(PSI_NOT_INSTRUMENTED, file_size, MYF(MY_WME)))) + goto err; + + if (my_pread(file, alloc_block, file_size, 0, MYF(MY_WME | MY_FNABP))) + goto err; + + *to= alloc_block; + *to_size= file_size; + my_close(file, MYF(0)); + return 0; + +err: + error= my_errno; + my_free(alloc_block); + my_close(file, MYF(0)); + return error; +} + + +/** + Get .frm or par from S3 + + @return 0 ok + @return 1 error +*/ + +my_bool s3_get_def(ms3_st *s3_client, S3_INFO *s3_info, S3_BLOCK *block, + const char *ext) +{ + char aws_path[AWS_PATH_LENGTH]; + + strxnmov(aws_path, sizeof(aws_path)-1, s3_info->database.str, "/", + s3_info->table.str, "/", ext, NullS); + + return s3_get_object(s3_client, s3_info->bucket.str, aws_path, block, + 0, 0); +} + +/** + Check if .frm exits in S3 + + @return 0 frm exists + @return 1 error +*/ + +my_bool s3_frm_exists(ms3_st *s3_client, S3_INFO *s3_info) +{ + char aws_path[AWS_PATH_LENGTH]; + ms3_status_st status; + + strxnmov(aws_path, sizeof(aws_path)-1, s3_info->database.str, "/", + s3_info->table.str, "/frm", NullS); + + return ms3_status(s3_client, s3_info->bucket.str, aws_path, &status); +} + + +/** + Get version from frm file + + @param out Store the table_version_here. 
It's of size MY_UUID_SIZE + @param frm_image Frm image + @param frm_length size of image + + @return 0 Was able to read table version + @return 1 Wrong information in frm file +*/ + +#define FRM_HEADER_SIZE 64 +#define EXTRA2_TABLEDEF_VERSION 0 + +static inline my_bool is_binary_frm_header(const uchar *head) +{ + return head[0] == 254 + && head[1] == 1 + && head[2] >= FRM_VER + && head[2] <= FRM_VER_CURRENT; +} + +static my_bool get_tabledef_version_from_frm(char *out, const uchar *frm_image, + size_t frm_length) +{ + uint segment_len; + const uchar *extra, *extra_end; + if (!is_binary_frm_header(frm_image) || frm_length <= FRM_HEADER_SIZE) + return 1; + + /* Length of the MariaDB extra2 segment in the form file. */ + segment_len= uint2korr(frm_image + 4); + if (frm_length < FRM_HEADER_SIZE + segment_len) + return 1; + + extra= frm_image + FRM_HEADER_SIZE; + if (*extra == '/') // old frm had '/' there + return 1; + + extra_end= extra + segment_len; + while (extra + 4 < extra_end) + { + uchar type= *extra++; + size_t length= *extra++; + if (!length) + { + length= uint2korr(extra); + extra+= 2; + if (length < 256) + return 1; /* Something is wrong */ + } + if (extra + length > extra_end) + return 1; + if (type == EXTRA2_TABLEDEF_VERSION) + { + if (length != MY_UUID_SIZE) + return 1; + memcpy(out, extra, length); + return 0; /* Found it */ + } + extra+= length; + } + return 1; +} + + +/** + Check if version in frm file matches what the server expects + + @return 0 table definitions matches + @return 1 table definitions doesn't match + @return 2 Can't find the frm version + @return 3 Can't read the frm version +*/ + +int s3_check_frm_version(ms3_st *s3_client, S3_INFO *s3_info) +{ + my_bool res= 0; + char aws_path[AWS_PATH_LENGTH]; + char uuid[MY_UUID_SIZE]; + S3_BLOCK block; + DBUG_ENTER("s3_check_frm_version"); + + strxnmov(aws_path, sizeof(aws_path)-1, s3_info->database.str, "/", + s3_info->base_table.str, "/frm", NullS); + + if (s3_get_object(s3_client, 
s3_info->bucket.str, aws_path, &block, 0, 0)) + { + DBUG_PRINT("exit", ("No object found")); + DBUG_RETURN(2); /* Ignore check, use old frm */ + } + + if (get_tabledef_version_from_frm(uuid, (uchar*) block.str, block.length) || + s3_info->tabledef_version.length != MY_UUID_SIZE) + { + s3_free(&block); + DBUG_PRINT("error", ("Wrong definition")); + DBUG_RETURN(3); /* Wrong definition */ + } + /* res is set to 1 if versions numbers doesn't match */ + res= bcmp(s3_info->tabledef_version.str, uuid, MY_UUID_SIZE) != 0; + s3_free(&block); + if (res) + DBUG_PRINT("error", ("Wrong table version")); + else + DBUG_PRINT("exit", ("Version strings matches")); + DBUG_RETURN(res); +} + + +/****************************************************************************** + Reading blocks from index or data from S3 +******************************************************************************/ + +/* + Read the index header (first page) from the index file + + In case of error, my_error() is called +*/ + +my_bool read_index_header(ms3_st *client, S3_INFO *s3, S3_BLOCK *block) +{ + char aws_path[AWS_PATH_LENGTH]; + DBUG_ENTER("read_index_header"); + strxnmov(aws_path, sizeof(aws_path)-1, s3->database.str, "/", s3->table.str, + "/aria", NullS); + DBUG_RETURN(s3_get_object(client, s3->bucket.str, aws_path, block, 0, 2)); +} + + +#ifdef FOR_FUTURE_IF_NEEDED_FOR_DEBUGGING_WITHOUT_S3 +/** + Read a big block from disk +*/ + +my_bool s3_block_read(struct st_pagecache *pagecache, + PAGECACHE_IO_HOOK_ARGS *args, + struct st_pagecache_file *file, + LEX_STRING *data) +{ + MARIA_SHARE *share= (MARIA_SHARE*) file->callback_data; + my_bool datafile= file != &share->kfile; + + DBUG_ASSERT(file->big_block_size > 0); + DBUG_ASSERT(((((my_off_t) args->pageno - file->head_blocks) << + pagecache->shift) % + file->big_block_size) == 0); + + if (!(data->str= (char *) my_malloc(file->big_block_size, MYF(MY_WME)))) + return TRUE; + + data->length= mysql_file_pread(file->file, + (unsigned char *)data->str, + 
file->big_block_size, + ((my_off_t) args->pageno << pagecache->shift), + MYF(MY_WME)); + if (data->length == 0 || data->length == MY_FILE_ERROR) + { + if (data->length == 0) + { + LEX_STRING *file_name= (datafile ? + &share->data_file_name : + &share->index_file_name); + my_error(EE_EOFERR, MYF(0), file_name->str, my_errno); + } + my_free(data->str); + data->length= 0; + data->str= 0; + return TRUE; + } + return FALSE; +} +#endif + + +/** + Read a block from S3 to page cache +*/ + +my_bool s3_block_read(struct st_pagecache *pagecache, + PAGECACHE_IO_HOOK_ARGS *args, + struct st_pagecache_file *file, + S3_BLOCK *block) +{ + char aws_path[AWS_PATH_LENGTH]; + MARIA_SHARE *share= (MARIA_SHARE*) file->callback_data; + my_bool datafile= file->file != share->kfile.file; + MARIA_HA *info= (MARIA_HA*) my_thread_var->keycache_file; + ms3_st *client= info->s3; + const char *path_suffix= datafile ? "/data/" : "/index/"; + char *end; + S3_INFO *s3= share->s3_path; + ulong block_number; + DBUG_ENTER("s3_block_read"); + + DBUG_ASSERT(file->big_block_size > 0); + DBUG_ASSERT(((((my_off_t) args->pageno - file->head_blocks) << + pagecache->shift) % + file->big_block_size) == 0); + + block_number= (((args->pageno - file->head_blocks) << pagecache->shift) / + file->big_block_size) + 1; + + end= strxnmov(aws_path, sizeof(aws_path)-12, s3->database.str, "/", + s3->table.str, path_suffix, "000000", NullS); + fix_suffix(end, block_number); + + DBUG_RETURN(s3_get_object(client, s3->bucket.str, aws_path, block, + share->base.compression_algorithm, 1)); +} + +/* + Start file numbers from 1000 to more easily find bugs when the file number + could be mistaken for a real file +*/ +static volatile int32 unique_file_number= 1000; + +int32 s3_unique_file_number() +{ + return my_atomic_add32_explicit(&unique_file_number, 1, + MY_MEMORY_ORDER_RELAXED); +} diff --git a/storage/maria/s3_func.h b/storage/maria/s3_func.h new file mode 100644 index 00000000..f73a95de --- /dev/null +++ 
b/storage/maria/s3_func.h @@ -0,0 +1,147 @@ +#ifndef S3_FUNC_INCLUDED +#define S3_FUNC_INCLUDED +/* Copyright (C) 2019, 2022, MariaDB Corporation Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ + +/* + Interface function used by S3 storage engine and aria_copy_for_s3 +*/ + +#ifdef WITH_S3_STORAGE_ENGINE +#include <libmarias3/marias3.h> + +C_MODE_START +#define DEFAULT_AWS_HOST_NAME "s3.amazonaws.com" + +extern struct s3_func { + uint8_t (*set_option)(ms3_st *, ms3_set_option_t, void *); + void (*free)(S3_BLOCK *); + void (*deinit)(ms3_st *); + int32 (*unique_file_number)(void); + my_bool (*read_index_header)(ms3_st *, S3_INFO *, S3_BLOCK *); + int (*check_frm_version)(ms3_st *, S3_INFO *); + S3_INFO *(*info_copy)(S3_INFO *); + my_bool (*set_database_and_table_from_path)(S3_INFO *, const char *); + ms3_st *(*open_connection)(S3_INFO *); +} s3f; + +extern TYPELIB s3_protocol_typelib; + +/* Store information about a s3 connection */ + +struct s3_info +{ + /* Connection strings */ + LEX_CSTRING access_key, secret_key, region, bucket, host_name; + int port; // 0 means 'Use default' + my_bool use_http; + + /* Will be set by caller or by ma_open() */ + LEX_CSTRING database, table; + + /* + Name of the partition table if the table is partitioned. If not, it's set + to be same as table. This is used to know which frm file to read to + check table version. 
+ */ + LEX_CSTRING base_table; + + /* Sent to open to verify version */ + LEX_CUSTRING tabledef_version; + + /* Protocol for the list bucket API call. 1 for Amazon, 2 for some others */ + uint8_t protocol_version; +}; + + +/* flag + length is stored in this header */ +#define COMPRESS_HEADER 4 + +/* Max length of an AWS PATH */ +#define AWS_PATH_LENGTH ((NAME_LEN)*3+3+10+6+11) + +void s3_init_library(void); +void s3_deinit_library(void); +int aria_copy_to_s3(ms3_st *s3_client, const char *aws_bucket, + const char *path, + const char *database, const char *table_name, + ulong block_size, my_bool compression, + my_bool force, my_bool display, my_bool copy_frm); +int aria_copy_from_s3(ms3_st *s3_client, const char *aws_bucket, + const char *path,const char *database, + my_bool compression, my_bool force, my_bool display); +int aria_delete_from_s3(ms3_st *s3_client, const char *aws_bucket, + const char *database, const char *table, + my_bool display); +int aria_rename_s3(ms3_st *s3_client, const char *aws_bucket, + const char *from_database, const char *from_table, + const char *to_database, const char *to_table, + my_bool rename_frm); +ms3_st *s3_open_connection(S3_INFO *s3); +void s3_deinit(ms3_st *s3_client); +int s3_put_object(ms3_st *s3_client, const char *aws_bucket, + const char *name, uchar *data, size_t length, + my_bool compression); +int s3_get_object(ms3_st *s3_client, const char *aws_bucket, + const char *name, S3_BLOCK *block, my_bool compression, + int print_error); +int s3_delete_object(ms3_st *s3_client, const char *aws_bucket, + const char *name, myf error_flags); +my_bool s3_rename_object(ms3_st *s3_client, const char *aws_bucket, + const char *from_name, const char *to_name, + myf error_flags); +void s3_free(S3_BLOCK *data); +my_bool s3_copy_from_file(ms3_st *s3_client, const char *aws_bucket, + char *aws_path, File file, my_off_t start, + my_off_t file_end, uchar *block, size_t block_size, + my_bool compression, my_bool display); +my_bool 
s3_copy_to_file(ms3_st *s3_client, const char *aws_bucket, + char *aws_path, File file, my_off_t start, + my_off_t file_end, my_bool compression, + my_bool display); +int s3_delete_directory(ms3_st *s3_client, const char *aws_bucket, + const char *path); +int s3_rename_directory(ms3_st *s3_client, const char *aws_bucket, + const char *from_name, const char *to_name, + myf error_flags); +int partition_delete_from_s3(ms3_st *s3_client, const char *aws_bucket, + const char *database, const char *table, + myf error_flags); +int partition_copy_to_s3(ms3_st *s3_client, const char *aws_bucket, + const char *path, const char *old_path, + const char *database, const char *table_name); + +S3_INFO *s3_info_copy(S3_INFO *old); +my_bool set_database_and_table_from_path(S3_INFO *s3, const char *path); +my_bool s3_get_def(ms3_st *s3_client, S3_INFO *S3_info, S3_BLOCK *block, + const char *ext); +my_bool s3_frm_exists(ms3_st *s3_client, S3_INFO *s3_info); +int s3_check_frm_version(ms3_st *s3_client, S3_INFO *s3_info); +my_bool read_index_header(ms3_st *client, S3_INFO *s3, S3_BLOCK *block); +int32 s3_unique_file_number(void); +my_bool s3_block_read(struct st_pagecache *pagecache, + PAGECACHE_IO_HOOK_ARGS *args, + struct st_pagecache_file *file, + S3_BLOCK *block); +C_MODE_END +#else + +C_MODE_START +/* Dummy structures and interfaces to be used when compiling without S3 */ +struct s3_info; +struct ms3_st; +C_MODE_END +#endif /* WITH_S3_STORAGE_ENGINE */ +#endif /* HA_S3_FUNC_INCLUDED */ diff --git a/storage/maria/tablockman.c b/storage/maria/tablockman.c new file mode 100644 index 00000000..180487a8 --- /dev/null +++ b/storage/maria/tablockman.c @@ -0,0 +1,672 @@ +/* QQ: TODO - allocate everything from dynarrays !!! 
(benchmark) */ +/* QQ: automatically place S instead of LS if possible */ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include <my_base.h> +#include <hash.h> +#include "tablockman.h" + +/* + Lock Manager for Table Locks + + The code below handles locks on resources - but it is optimized for a + case when a number of resources is not very large, and there are many of + locks per resource - that is a resource is likely to be a table or a + database, but hardly a row in a table. + + Locks belong to "lock owners". A Lock Owner is uniquely identified by a + 16-bit number - loid (lock owner identifier). A function loid_to_tlo must + be provided by the application that takes such a number as an argument + and returns a TABLE_LOCK_OWNER structure. + + Lock levels are completely defined by three tables. Lock compatibility + matrix specifies which locks can be held at the same time on a resource. + Lock combining matrix specifies what lock level has the same behaviour as + a pair of two locks of given levels. getlock_result matrix simplifies + intention locking and lock escalation for an application, basically it + defines which locks are intention locks and which locks are "loose" + locks. 
It is only used to provide better diagnostics for the + application, lock manager itself does not differentiate between normal, + intention, and loose locks. + + The assumptions are: few distinct resources, many locks are held at the + same time on one resource. Thus: a lock structure _per resource_ can be + rather large; a lock structure _per lock_ does not need to be very small + either; we need to optimize for _speed_. Operations we need are: place a + lock, check if a particular transaction already has a lock on this + resource, check if a conflicting lock exists, if yes - find who owns it. + + Solution: every resource has a structure with + 1. Hash of latest (see the lock upgrade section below) granted locks with + loid as a key. Thus, checking if a given transaction has a lock on + this resource is an O(1) operation. + 2. Doubly-linked lists of all granted locks - one list for every lock + type. Thus, checking if a conflicting lock exists is a check whether + an appropriate list head pointer is not null, also O(1). + 3. Every lock has a loid of the owner, thus checking who owns a + conflicting lock is also O(1). + 4. Deque of waiting locks. It's a deque (double-ended queue) not a fifo, + because for lock upgrades requests are added to the queue head, not + tail. This is the single place where it gets O(N) in the number + of locks - when a transaction wakes up from waiting on a condition, + it may need to scan the queue backward to the beginning to find + a conflicting lock. It is guaranteed though that "all transactions + before it" received the same - or earlier - signal. In other words a + transaction needs to scan all transactions before it that received the + signal but didn't have a chance to resume the execution yet, so + practically OS scheduler won't let the scan be O(N). + + Waiting: if there is a conflicting lock or if wait queue is not empty, a + requested lock cannot be granted at once. It is added to the end of the + wait queue.
If a queue was empty and there is a conflicting lock - the + "blocker" transaction is the owner of this lock. If a queue is not empty, + an owner of the previous lock in the queue is the "blocker". But if the + previous lock is compatible with the request, then the "blocker" is the + transaction that the owner of the lock at the end of the queue is waiting + for (in other words, our lock is added to the end of the wait queue, and + our blocker is the same as of the lock right before us). + + Lock upgrades: when a thread that has a lock on a given resource, + requests a new lock on the same resource and the old lock is not enough + to satisfy new lock requirements (which is defined by + lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock + (defined by lock_combining_matrix as above) is placed. Depending on + other granted locks it is immediately granted or it has to wait. Here the + lock is added to the start of the waiting queue, not to the end. The old + lock is removed from the hash, but not from the doubly-linked lists. + (indeed, a transaction checks "do I have a lock on this resource ?" by + looking in a hash, and it should find a latest lock, so old locks must be + removed; but a transaction checks "are there conflicting locks ?" by + checking doubly-linked lists, it doesn't matter if it will find an old + lock - if it would be removed, a new lock would also be a conflict). + So, a hash contains only "latest" locks - there can be only one latest + lock per resource per transaction. But doubly-linked lists contain all + locks, even "obsolete" ones, because it doesn't hurt. Note that old + locks cannot be freed early, in particular they stay in the + 'active_locks' list of a lock owner, because they may be "re-enabled" + on a savepoint rollback. + + To better support table-row relations where one needs to lock the table + with an intention lock before locking the row, extended diagnostics are + provided.
When an intention lock (presumably on a table) is granted, + lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row, + perhaps the thread already has a normal lock on this table), + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual), + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check + whether it's possible to lock the row, but no need to lock it - perhaps + the thread has a loose lock on this table). This is defined by + getlock_result[] table. + + Instant duration locks are not supported. Though they're trivial to add, + they are normally only used on rows, not on tables. So, presumably, + they are not needed here. + + Mutexes: there're table mutexes (LOCKED_TABLE::mutex), lock owner mutexes + (TABLE_LOCK_OWNER::mutex), and a pool mutex (TABLOCKMAN::pool_mutex). + table mutex protects operations on the table lock structures, and lock + owner pointers waiting_for and waiting_for_loid. + lock owner mutex is only used to wait on lock owner condition + (TABLE_LOCK_OWNER::cond), there's no need to protect owner's lock + structures, and only lock owner itself may access them. + The pool mutex protects a pool of unused locks. Note the locking order: + first the table mutex, then the owner mutex or a pool mutex. + Table mutex lock cannot be attempted when owner or pool mutex are locked. + No mutex lock can be attempted if owner or pool mutex are locked. +*/ + +/* + Lock compatibility matrix. + + It's asymmetric. Read it as "Somebody has the lock <value in the row + label>, can I set the lock <value in the column label> ?" + + ') Though you can take LS lock while somebody has S lock, it makes no + sense - it's simpler to take S lock too. + + 1 - compatible + 0 - incompatible + -1 - "impossible", so that we can assert the impossibility. 
+*/ +static const int lock_compatibility_matrix[10][10]= +{ /* N S X IS IX SIX LS LX SLX LSIX */ + { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, /* N */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* S */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* X */ + { -1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, /* IS */ + { -1, 0, 0, 1, 1, 0, 1, 1, 0, 1 }, /* IX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 }, /* SIX */ + { -1, 1, 0, 1, 0, 0, 1, 0, 0, 0 }, /* LS */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* LX */ + { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* SLX */ + { -1, 0, 0, 1, 0, 0, 1, 0, 0, 0 } /* LSIX */ +}; + +/* + Lock combining matrix. + + It's symmetric. Read it as "what lock level L is identical to the + set of two locks A and B" + + One should never get N from it, we assert the impossibility +*/ +static const enum lockman_lock_type lock_combining_matrix[10][10]= +{/* N S X IS IX SIX LS LX SLX LSIX */ + { N, N, N, N, N, N, N, N, N, N}, /* N */ + { N, S, X, S, SIX, SIX, S, SLX, SLX, SIX}, /* S */ + { N, X, X, X, X, X, X, X, X, X}, /* X */ + { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX}, /* IS */ + { N, SIX, X, IX, IX, SIX, LSIX, LX, SLX, LSIX}, /* IX */ + { N, SIX, X, SIX, SIX, SIX, SIX, SLX, SLX, SIX}, /* SIX */ + { N, S, X, LS, LSIX, SIX, LS, LX, SLX, LSIX}, /* LS */ + { N, SLX, X, LX, LX, SLX, LX, LX, SLX, LX}, /* LX */ + { N, SLX, X, SLX, SLX, SLX, SLX, SLX, SLX, SLX}, /* SLX */ + { N, SIX, X, LSIX, LSIX, SIX, LSIX, LX, SLX, LSIX} /* LSIX */ +}; + +/* + the return codes for lockman_getlock + + It's asymmetric. Read it as "I have the lock <value in the row label>, + what value should be returned for <value in the column label> ?" + + 0 means impossible combination (assert!) + + Defines below help to preserve the table structure. 
+ I/L/A values are self explanatory + x means the combination is possible (assert should not crash) + but it cannot happen in row locks, only in table locks (S,X), + or lock escalations (LS,LX) +*/ +#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE +#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +#define A GOT_THE_LOCK +#define x GOT_THE_LOCK +static const enum lockman_getlock_result getlock_result[10][10]= +{/* N S X IS IX SIX LS LX SLX LSIX */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* N */ + { 0, x, 0, A, 0, 0, x, 0, 0, 0}, /* S */ + { 0, x, x, A, A, 0, x, x, 0, 0}, /* X */ + { 0, 0, 0, I, 0, 0, 0, 0, 0, 0}, /* IS */ + { 0, 0, 0, I, I, 0, 0, 0, 0, 0}, /* IX */ + { 0, x, 0, A, I, 0, x, 0, 0, 0}, /* SIX */ + { 0, 0, 0, L, 0, 0, x, 0, 0, 0}, /* LS */ + { 0, 0, 0, L, L, 0, x, x, 0, 0}, /* LX */ + { 0, x, 0, A, L, 0, x, x, 0, 0}, /* SLX */ + { 0, 0, 0, L, I, 0, x, 0, 0, 0} /* LSIX */ +}; +#undef I +#undef L +#undef A +#undef x + +/* + this structure is optimized for a case when there're many locks + on the same resource - e.g. a table +*/ + +struct st_table_lock { + /* QQ: do we need upgraded_from ? 
*/ + struct st_table_lock *next_in_lo, *upgraded_from, *next, *prev; + struct st_locked_table *table; + uint16 loid; + uchar lock_type; +}; + +static inline +TABLE_LOCK *find_by_loid(LOCKED_TABLE *table, uint16 loid) +{ + return (TABLE_LOCK *)my_hash_search(& table->latest_locks, + (uchar *)& loid, sizeof(loid)); +} + +static inline +void remove_from_wait_queue(TABLE_LOCK *lock, LOCKED_TABLE *table) +{ + DBUG_ASSERT(table == lock->table); + if (lock->prev) + { + DBUG_ASSERT(table->wait_queue_out != lock); + lock->prev->next= lock->next; + } + else + { + DBUG_ASSERT(table->wait_queue_out == lock); + table->wait_queue_out= lock->next; + } + if (lock->next) + { + DBUG_ASSERT(table->wait_queue_in != lock); + lock->next->prev= lock->prev; + } + else + { + DBUG_ASSERT(table->wait_queue_in == lock); + table->wait_queue_in= lock->prev; + } +} + +/* + DESCRIPTION + tries to lock a resource 'table' with a lock level 'lock'. + + RETURN + see enum lockman_getlock_result +*/ +enum lockman_getlock_result +tablockman_getlock(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo, + LOCKED_TABLE *table, enum lockman_lock_type lock) +{ + TABLE_LOCK *old, *new, *blocker, *blocker2; + TABLE_LOCK_OWNER *wait_for; + struct timespec timeout; + enum lockman_lock_type new_lock; + enum lockman_getlock_result res; + int i; + + DBUG_ASSERT(lo->waiting_lock == 0); + DBUG_ASSERT(lo->waiting_for == 0); + DBUG_ASSERT(lo->waiting_for_loid == 0); + + mysql_mutex_lock(& table->mutex); + /* do we already have a lock on this resource ? */ + old= find_by_loid(table, lo->loid); + + /* calculate the level of the upgraded lock, if yes */ + new_lock= old ? lock_combining_matrix[old->lock_type][lock] : lock; + + /* and check if old lock is enough to satisfy the new request */ + if (old && new_lock == old->lock_type) + { + /* yes */ + res= getlock_result[old->lock_type][lock]; + goto ret; + } + + /* no, placing a new lock. 
first - take a free lock structure from the pool */ + mysql_mutex_lock(& lm->pool_mutex); + new= lm->pool; + if (new) + { + lm->pool= new->next; + mysql_mutex_unlock(& lm->pool_mutex); + } + else + { + mysql_mutex_unlock(& lm->pool_mutex); + new= (TABLE_LOCK *)my_malloc(sizeof(*new), MYF(MY_WME)); + if (unlikely(!new)) + { + res= NO_MEMORY_FOR_LOCK; + goto ret; + } + } + + new->loid= lo->loid; + new->lock_type= new_lock; + new->table= table; + + /* and try to place it */ + for (new->prev= table->wait_queue_in;;) + { + wait_for= 0; + if (!old) + { + /* not upgrading - a lock must be added to the _end_ of the wait queue */ + for (blocker= new->prev; blocker && !wait_for; blocker= blocker->prev) + { + TABLE_LOCK_OWNER *tmp= lm->loid_to_tlo(blocker->loid); + + /* find a blocking lock */ + DBUG_ASSERT(table->wait_queue_out); + DBUG_ASSERT(table->wait_queue_in); + if (!lock_compatibility_matrix[blocker->lock_type][lock]) + { + /* found! */ + wait_for= tmp; + break; + } + + /* + hmm, the lock before doesn't block us, let's look one step further. + the condition below means: + + if we never waited on a condition yet + OR + the lock before ours (blocker) waits on a lock (blocker2) that is + present in the hash AND and conflicts with 'blocker' + + the condition after OR may fail if 'blocker2' was removed from + the hash, its signal woke us up, but 'blocker' itself didn't see + the signal yet. + */ + if (!lo->waiting_lock || + ((blocker2= find_by_loid(table, tmp->waiting_for_loid)) && + !lock_compatibility_matrix[blocker2->lock_type] + [blocker->lock_type])) + { + /* but it's waiting for a real lock. we'll wait for the same lock */ + wait_for= tmp->waiting_for; + /* + We don't really need tmp->waiting_for, as tmp->waiting_for_loid + is enough. waiting_for is just a local cache to avoid calling + loid_to_tlo(). 
+ But it's essensial that tmp->waiting_for pointer can ONLY + be dereferenced if find_by_loid() above returns a non-null + pointer, because a TABLE_LOCK_OWNER object that it points to + may've been freed when we come here after a signal. + In particular tmp->waiting_for_loid cannot be replaced + with tmp->waiting_for->loid. + */ + DBUG_ASSERT(wait_for == lm->loid_to_tlo(tmp->waiting_for_loid)); + break; + } + + /* + otherwise - a lock it's waiting for doesn't exist. + We've no choice but to scan the wait queue backwards, looking + for a conflicting lock or a lock waiting for a real lock. + QQ is there a way to avoid this scanning ? + */ + } + } + + if (wait_for == 0) + { + /* checking for compatibility with existing locks */ + for (blocker= 0, i= 0; i < LOCK_TYPES; i++) + { + if (table->active_locks[i] && !lock_compatibility_matrix[i+1][lock]) + { + blocker= table->active_locks[i]; + /* if the first lock in the list is our own - skip it */ + if (blocker->loid == lo->loid) + blocker= blocker->next; + if (blocker) /* found a conflicting lock, need to wait */ + break; + } + } + if (!blocker) /* free to go */ + break; + wait_for= lm->loid_to_tlo(blocker->loid); + } + + /* ok, we're here - the wait is inevitable */ + lo->waiting_for= wait_for; + lo->waiting_for_loid= wait_for->loid; + if (!lo->waiting_lock) /* first iteration of the for() loop */ + { + /* lock upgrade or new lock request ? 
*/ + if (old) + { + /* upgrade - add the lock to the _start_ of the wait queue */ + new->prev= 0; + if ((new->next= table->wait_queue_out)) + new->next->prev= new; + table->wait_queue_out= new; + if (!table->wait_queue_in) + table->wait_queue_in= table->wait_queue_out; + } + else + { + /* new lock - add the lock to the _end_ of the wait queue */ + new->next= 0; + if ((new->prev= table->wait_queue_in)) + new->prev->next= new; + table->wait_queue_in= new; + if (!table->wait_queue_out) + table->wait_queue_out= table->wait_queue_in; + } + lo->waiting_lock= new; + + set_timespec_nsec(timeout,lm->lock_timeout * 1000000); + + } + + /* + prepare to wait. + we must lock blocker's mutex to wait on blocker's cond. + and we must release table's mutex. + note that blocker's mutex is locked _before_ table's mutex is released + */ + mysql_mutex_lock(wait_for->mutex); + mysql_mutex_unlock(& table->mutex); + + /* now really wait */ + i= mysql_cond_timedwait(wait_for->cond, wait_for->mutex, & timeout); + + mysql_mutex_unlock(wait_for->mutex); + + if (i == ETIMEDOUT || i == ETIME) + { + /* we rely on the caller to rollback and release all locks */ + res= LOCK_TIMEOUT; + goto ret2; + } + + mysql_mutex_lock(& table->mutex); + + /* ... and repeat from the beginning */ + } + /* yeah! 
we can place the lock now */ + + /* remove the lock from the wait queue, if it was there */ + if (lo->waiting_lock) + { + remove_from_wait_queue(new, table); + lo->waiting_lock= 0; + lo->waiting_for= 0; + lo->waiting_for_loid= 0; + } + + /* add it to the list of all locks of this lock owner */ + new->next_in_lo= lo->active_locks; + lo->active_locks= new; + + /* and to the list of active locks of this lock type */ + new->prev= 0; + if ((new->next= table->active_locks[new_lock-1])) + new->next->prev= new; + table->active_locks[new_lock-1]= new; + + /* update the latest_locks hash */ + if (old) + my_hash_delete(& table->latest_locks, (uchar *)old); + my_hash_insert(& table->latest_locks, (uchar *)new); + + new->upgraded_from= old; + + res= getlock_result[lock][lock]; + +ret: + mysql_mutex_unlock(& table->mutex); +ret2: + DBUG_ASSERT(res); + return res; +} + +/* + DESCRIPTION + release all locks belonging to a transaction. + signal waiters to continue +*/ +void tablockman_release_locks(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo) +{ + TABLE_LOCK *lock, *local_pool= 0, *local_pool_end; + + /* + instead of adding released locks to a pool one by one, we'll link + them in a list and add to a pool in one short action (under a mutex) + */ + local_pool_end= lo->waiting_lock ? lo->waiting_lock : lo->active_locks; + if (!local_pool_end) + return; + + /* release a waiting lock, if any */ + if ((lock= lo->waiting_lock)) + { + DBUG_ASSERT(lock->loid == lo->loid); + mysql_mutex_lock(& lock->table->mutex); + remove_from_wait_queue(lock, lock->table); + + /* + a special case: if this lock was not the last in the wait queue + and it's compatible with the next lock, than the next lock + is waiting for our blocker though really it waits for us, indirectly. + Signal our blocker to release this next lock (after we removed our + lock from the wait queue, of course). + */ + /* + An example to clarify the above: + trn1> S-lock the table. Granted. + trn2> IX-lock the table. Added to the wait queue. 
trn2 waits on trn1 + trn3> IS-lock the table. The queue is not empty, so IS-lock is added + to the queue. It's compatible with the waiting IX-lock, so trn3 + waits for trn2->waiting_for, that is trn1. + if trn1 releases the lock it signals trn1->cond and both waiting + transactions are awaken. But if trn2 times out, trn3 must be notified + too (as IS and S locks are compatible). So trn2 must signal trn1->cond. + */ + if (lock->next && + lock_compatibility_matrix[lock->next->lock_type][lock->lock_type]) + { + mysql_mutex_lock(lo->waiting_for->mutex); + mysql_cond_broadcast(lo->waiting_for->cond); + mysql_mutex_unlock(lo->waiting_for->mutex); + } + lo->waiting_for= 0; + lo->waiting_for_loid= 0; + mysql_mutex_unlock(& lock->table->mutex); + + lock->next= local_pool; + local_pool= lock; + } + + /* now release granted locks */ + lock= lo->active_locks; + while (lock) + { + TABLE_LOCK *cur= lock; + mysql_mutex_t *mutex= & lock->table->mutex; + DBUG_ASSERT(cur->loid == lo->loid); + + DBUG_ASSERT(lock != lock->next_in_lo); + lock= lock->next_in_lo; + + /* TODO ? group locks by table to reduce the number of mutex locks */ + mysql_mutex_lock(mutex); + my_hash_delete(& cur->table->latest_locks, (uchar *)cur); + + if (cur->prev) + cur->prev->next= cur->next; + if (cur->next) + cur->next->prev= cur->prev; + if (cur->table->active_locks[cur->lock_type-1] == cur) + cur->table->active_locks[cur->lock_type-1]= cur->next; + + cur->next= local_pool; + local_pool= cur; + + mysql_mutex_unlock(mutex); + } + + lo->waiting_lock= lo->active_locks= 0; + + /* + okay, all locks released. 
now signal that we're leaving, + in case somebody's waiting for it + */ + mysql_mutex_lock(lo->mutex); + mysql_cond_broadcast(lo->cond); + mysql_mutex_unlock(lo->mutex); + + /* and push all freed locks to the lockman's pool */ + mysql_mutex_lock(& lm->pool_mutex); + local_pool_end->next= lm->pool; + lm->pool= local_pool; + mysql_mutex_unlock(& lm->pool_mutex); +} + +void tablockman_init(TABLOCKMAN *lm, loid_to_tlo_func *func, uint timeout) +{ + lm->pool= 0; + lm->loid_to_tlo= func; + lm->lock_timeout= timeout; + mysql_mutex_init(& lm->pool_mutex, MY_MUTEX_INIT_FAST); + my_interval_timer(); /* ensure that my_interval_timer() is initialized */ +} + +void tablockman_destroy(TABLOCKMAN *lm) +{ + while (lm->pool) + { + TABLE_LOCK *tmp= lm->pool; + lm->pool= tmp->next; + my_free(tmp); + } + mysql_mutex_destroy(& lm->pool_mutex); +} + +/* + initialize a LOCKED_TABLE structure + + SYNOPSYS + lt a LOCKED_TABLE to initialize + initial_hash_size initial size for 'latest_locks' hash +*/ +void tablockman_init_locked_table(LOCKED_TABLE *lt, int initial_hash_size) +{ + bzero(lt, sizeof(*lt)); + mysql_mutex_init(& lt->mutex, MY_MUTEX_INIT_FAST); + my_hash_init(& lt->latest_locks, & my_charset_bin, initial_hash_size, + offsetof(TABLE_LOCK, loid), + sizeof(((TABLE_LOCK*)0)->loid), 0, 0, 0); +} + +void tablockman_destroy_locked_table(LOCKED_TABLE *lt) +{ + int i; + + DBUG_ASSERT(lt->wait_queue_out == 0); + DBUG_ASSERT(lt->wait_queue_in == 0); + DBUG_ASSERT(lt->latest_locks.records == 0); + for (i= 0; i<LOCK_TYPES; i++) + DBUG_ASSERT(lt->active_locks[i] == 0); + + my_hash_free(& lt->latest_locks); + mysql_mutex_destroy(& lt->mutex); +} + +#ifdef EXTRA_DEBUG +static const char *lock2str[LOCK_TYPES+1]= {"N", "S", "X", "IS", "IX", "SIX", + "LS", "LX", "SLX", "LSIX"}; + +void tablockman_print_tlo(TABLE_LOCK_OWNER *lo) +{ + TABLE_LOCK *lock; + + printf("lo%d>", lo->loid); + if ((lock= lo->waiting_lock)) + printf(" (%s.0x%lx)", lock2str[lock->lock_type], (ulong)lock->table); + for (lock= 
lo->active_locks; + lock && lock != lock->next_in_lo; + lock= lock->next_in_lo) + printf(" %s.0x%lx", lock2str[lock->lock_type], (ulong)lock->table); + if (lock && lock == lock->next_in_lo) + printf("!"); + printf("\n"); +} +#endif + diff --git a/storage/maria/tablockman.h b/storage/maria/tablockman.h new file mode 100644 index 00000000..fd756fae --- /dev/null +++ b/storage/maria/tablockman.h @@ -0,0 +1,87 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef _tablockman_h +#define _tablockman_h + +/* + Lock levels: + ^^^^^^^^^^^ + + N - "no lock", not a lock, used sometimes internally to simplify the code + S - Shared + X - eXclusive + IS - Intention Shared + IX - Intention eXclusive + SIX - Shared + Intention eXclusive + LS - Loose Shared + LX - Loose eXclusive + SLX - Shared + Loose eXclusive + LSIX - Loose Shared + Intention eXclusive +*/ +#ifndef _lockman_h +/* QQ: TODO remove N-locks */ +enum lockman_lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST }; +enum lockman_getlock_result { + NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT, + GOT_THE_LOCK, + GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE, + GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE +}; +#endif + +#define LOCK_TYPES (LOCK_TYPE_LAST-1) + +typedef struct st_table_lock TABLE_LOCK; + +typedef struct st_table_lock_owner { + TABLE_LOCK *active_locks; /* 
list of active locks */ + TABLE_LOCK *waiting_lock; /* waiting lock (one lock only) */ + struct st_table_lock_owner *waiting_for; /* transaction we're waiting for */ + pthread_cond_t *cond; /* transactions waiting for us, wait on 'cond' */ + mysql_mutex_t *mutex; /* mutex is required to use 'cond' */ + uint16 loid, waiting_for_loid; /* Lock Owner IDentifier */ +} TABLE_LOCK_OWNER; + +typedef struct st_locked_table { + mysql_mutex_t mutex; /* mutex for everything below */ + HASH latest_locks; /* latest locks in a hash */ + TABLE_LOCK *active_locks[LOCK_TYPES]; /* dl-list of locks per type */ + TABLE_LOCK *wait_queue_in, *wait_queue_out; /* wait deque (double-end queue)*/ +} LOCKED_TABLE; + +typedef TABLE_LOCK_OWNER *loid_to_tlo_func(uint16); + +typedef struct { + mysql_mutex_t pool_mutex; + TABLE_LOCK *pool; /* lifo pool of free locks */ + uint lock_timeout; /* lock timeout in milliseconds */ + loid_to_tlo_func *loid_to_tlo; /* for mapping loid to TABLE_LOCK_OWNER */ +} TABLOCKMAN; + +void tablockman_init(TABLOCKMAN *, loid_to_tlo_func *, uint); +void tablockman_destroy(TABLOCKMAN *); +enum lockman_getlock_result tablockman_getlock(TABLOCKMAN *, TABLE_LOCK_OWNER *, + LOCKED_TABLE *, enum lockman_lock_type); +void tablockman_release_locks(TABLOCKMAN *, TABLE_LOCK_OWNER *); +void tablockman_init_locked_table(LOCKED_TABLE *, int); +void tablockman_destroy_locked_table(LOCKED_TABLE *); + +#ifdef EXTRA_DEBUG +void tablockman_print_tlo(TABLE_LOCK_OWNER *); +#endif + +#endif + diff --git a/storage/maria/test_aria_s3_copy.sh b/storage/maria/test_aria_s3_copy.sh new file mode 100755 index 00000000..ad39df69 --- /dev/null +++ b/storage/maria/test_aria_s3_copy.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# +# Note that this test expact that there are tables test1 and test2 in +# the current directory where test2 has also a .frm file +# + +TMPDIR=tmpdir +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/ + +my_cmp() +{ + if ! 
cmp $1 $TMPDIR/$1 + then + echo "aborting" + exit 1; + fi +} + +run_test() +{ + OPT=$1; + echo "******* Running test with options '$OPT' **********" + rm -rf $TMPDIR + mkdir $TMPDIR + cp test?.* $TMPDIR + if ! ./aria_s3_copy --op=to --force $OPT test1 test2 + then + echo Got error $? + exit 1; + fi + rm test?.* + if ! ./aria_s3_copy --op=from $OPT test1 test2 + then + echo Got error $? + exit 1; + fi + if ! ./aria_s3_copy --op=delete $OPT test1 test2 + then + echo Got error $? + exit 1; + fi + my_cmp test1.MAI + my_cmp test1.MAD + my_cmp test2.MAI + my_cmp test2.MAD + my_cmp test2.frm + rm test?.* + cp $TMPDIR/* . + rm -r $TMPDIR +} + +run_test "" +run_test "--s3_block_size=64K --compress" +run_test "--s3_block_size=4M" +echo "ok" diff --git a/storage/maria/test_ma_backup.c b/storage/maria/test_ma_backup.c new file mode 100644 index 00000000..c57ec6ec --- /dev/null +++ b/storage/maria/test_ma_backup.c @@ -0,0 +1,444 @@ +/* Copyright (C) 2018, 2021, MariaDB corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */ + +/* Code for doing backups of Aria tables */ + +/****************************************************************************** + Testing ma_backup interface + Table creation code is taken from ma_test1 +******************************************************************************/ + +#define ROWS_IN_TEST 100000 + +#include "maria_def.h" +#include "ma_blockrec.h" /* PAGE_SUFFIX_SIZE */ +#include "ma_checkpoint.h" +#include <aria_backup.h> + +static int silent; +static int create_test_table(const char *table_name, int stage); +static int copy_table(const char *table_name, int stage); +static void create_record(uchar *record,uint rownr); + +int main(int argc __attribute__((unused)), char *argv[]) +{ + int error= 1; + int i; + char buff[FN_REFLEN]; +#ifdef SAFE_MUTEX + safe_mutex_deadlock_detector= 1; +#endif + MY_INIT(argv[0]); + maria_data_root= "."; + + /* Maria requires that we always have a page cache */ + if (maria_init() || + (init_pagecache(maria_pagecache, maria_block_size * 2000, 0, 0, + maria_block_size, 0, MY_WME) == 0) || + ma_control_file_open(TRUE, TRUE, TRUE) || + (init_pagecache(maria_log_pagecache, + TRANSLOG_PAGECACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, MY_WME) == 0) || + translog_init(maria_data_root, TRANSLOG_FILE_SIZE, + 0, 0, maria_log_pagecache, + TRANSLOG_DEFAULT_FLAGS, 0) || + (trnman_init(0) || ma_checkpoint_init(0))) + { + fprintf(stderr, "Error in initialization\n"); + exit(1); + } + init_thr_lock(); + + fn_format(buff, "test_copy", maria_data_root, "", MYF(0)); + + for (i= 0; i < 5 ; i++) + { + printf("Stage: %d\n", i); + fflush(stdout); + if (create_test_table(buff, i)) + goto err; + if (copy_table(buff, i)) + goto err; + } + error= 0; + printf("test ok\n"); +err: + if (error) + fprintf(stderr, "Test %i failed\n", i); + 
maria_end(); + my_uuid_end(); + my_end(MY_CHECK_ERROR); + exit(error); +} + + +/** + Example of how to read an Aria table +*/ + +static int copy_table(const char *table_name, int stage) +{ + char old_name[FN_REFLEN]; + uchar *copy_buffer= 0; + ARIA_TABLE_CAPABILITIES cap; + ulonglong block; + File org_file= -1; + int error= 1; + + strxmov(old_name, table_name, ".MAI", NullS); + + if ((org_file= my_open(old_name, + O_RDONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC, + MYF(MY_WME))) < 0) + goto err; + if ((error= aria_get_capabilities(org_file, &cap))) + { + fprintf(stderr, "aria_get_capabilities failed: %d\n", error); + goto err; + } + + printf("- Capabilities read. oneline_backup_safe: %d\n", + cap.online_backup_safe); + printf("- Copying index file\n"); + + copy_buffer= my_malloc(PSI_NOT_INSTRUMENTED, cap.block_size, MYF(0)); + for (block= 0 ; ; block++) + { + if ((error= aria_read_index(org_file, &cap, block, copy_buffer) == + HA_ERR_END_OF_FILE)) + break; + if (error) + { + fprintf(stderr, "aria_read_index failed: %d\n", error); + goto err; + } + } + my_close(org_file, MYF(MY_WME)); + + + printf("- Copying data file\n"); + strxmov(old_name, table_name, ".MAD", NullS); + if ((org_file= my_open(old_name, O_RDONLY | O_SHARE | O_NOFOLLOW | O_CLOEXEC, + MYF(MY_WME))) < 0) + goto err; + + for (block= 0 ; ; block++) + { + size_t length; + if ((error= aria_read_data(org_file, &cap, block, copy_buffer, + &length) == HA_ERR_END_OF_FILE)) + break; + if (error) + { + fprintf(stderr, "aria_read_index failed: %d\n", error); + goto err; + } + } + error= 0; + +err: + my_free(copy_buffer); + if (org_file >= 0) + my_close(org_file, MYF(MY_WME)); + if (error) + fprintf(stderr, "Failed in copy_table stage: %d\n", stage); + return error; +} + + +/* Code extracted from ma_test1.c */ +#define MAX_REC_LENGTH 1024 + +static MARIA_COLUMNDEF recinfo[4]; +static MARIA_KEYDEF keyinfo[10]; +static HA_KEYSEG keyseg[10]; +static HA_KEYSEG uniqueseg[10]; + + +/** + Create a test table and fill it with 
some data +*/ + +static int create_test_table(const char *table_name, int type_of_table) +{ + MARIA_HA *file; + int i,error,uniques=0; + int key_field=FIELD_SKIP_PRESPACE,extra_field=FIELD_SKIP_ENDSPACE; + int key_type=HA_KEYTYPE_NUM; + int create_flag=0; + uint pack_seg=0, pack_keys= 0; + uint key_length; + uchar record[MAX_REC_LENGTH]; + MARIA_UNIQUEDEF uniquedef; + MARIA_CREATE_INFO create_info; + enum data_file_type record_type= DYNAMIC_RECORD; + my_bool null_fields= 0, unique_key= 0; + my_bool opt_unique= 0; + my_bool transactional= 0; + + key_length= 12; + switch (type_of_table) { + case 0: + break; + case 1: + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + break; + case 2: /* transactional */ + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + record_type= BLOCK_RECORD; + transactional= 1; + break; + case 3: /* transactional */ + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + record_type= BLOCK_RECORD; + transactional= 1; + key_field=FIELD_VARCHAR; /* varchar keys */ + extra_field= FIELD_VARCHAR; + key_type= HA_KEYTYPE_VARTEXT1; + pack_seg|= HA_VAR_LENGTH_PART; + null_fields= 1; + break; + case 4: /* transactional */ + create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM; + record_type= BLOCK_RECORD; + transactional= 1; + key_field=FIELD_BLOB; /* blob key */ + extra_field= FIELD_BLOB; + pack_seg|= HA_BLOB_PART; + key_type= HA_KEYTYPE_VARTEXT1; + break; + } + + + bzero((char*) recinfo,sizeof(recinfo)); + bzero((char*) &create_info,sizeof(create_info)); + + /* First define 2 columns */ + create_info.null_bytes= 1; + recinfo[0].type= key_field; + recinfo[0].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr : + key_length); + if (key_field == FIELD_VARCHAR) + recinfo[0].length+= HA_VARCHAR_PACKLENGTH(key_length); + recinfo[1].type=extra_field; + recinfo[1].length= (extra_field == FIELD_BLOB ? 
4 + portable_sizeof_char_ptr : 24); + if (extra_field == FIELD_VARCHAR) + recinfo[1].length+= HA_VARCHAR_PACKLENGTH(recinfo[1].length); + recinfo[1].null_bit= null_fields ? 2 : 0; + + if (opt_unique) + { + recinfo[2].type=FIELD_CHECK; + recinfo[2].length=MARIA_UNIQUE_HASH_LENGTH; + } + + if (key_type == HA_KEYTYPE_VARTEXT1 && + key_length > 255) + key_type= HA_KEYTYPE_VARTEXT2; + + /* Define a key over the first column */ + keyinfo[0].seg=keyseg; + keyinfo[0].keysegs=1; + keyinfo[0].block_length= 0; /* Default block length */ + keyinfo[0].key_alg=HA_KEY_ALG_BTREE; + keyinfo[0].seg[0].type= key_type; + keyinfo[0].seg[0].flag= pack_seg; + keyinfo[0].seg[0].start=1; + keyinfo[0].seg[0].length=key_length; + keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0; + keyinfo[0].seg[0].null_pos=0; + keyinfo[0].seg[0].language= default_charset_info->number; + if (pack_seg & HA_BLOB_PART) + { + keyinfo[0].seg[0].bit_start=4; /* Length of blob length */ + } + keyinfo[0].flag = (uint8) (pack_keys | unique_key); + + if (opt_unique) + { + uint start; + uniques=1; + bzero((char*) &uniquedef,sizeof(uniquedef)); + bzero((char*) uniqueseg,sizeof(uniqueseg)); + uniquedef.seg=uniqueseg; + uniquedef.keysegs=2; + + /* Make a unique over all columns (except first NULL fields) */ + for (i=0, start=1 ; i < 2 ; i++) + { + uniqueseg[i].start=start; + start+=recinfo[i].length; + uniqueseg[i].length=recinfo[i].length; + uniqueseg[i].language= default_charset_info->number; + } + uniqueseg[0].type= key_type; + uniqueseg[0].null_bit= null_fields ? 2 : 0; + uniqueseg[1].type= HA_KEYTYPE_TEXT; + if (extra_field == FIELD_BLOB) + { + uniqueseg[1].length=0; /* The whole blob */ + uniqueseg[1].bit_start=4; /* long blob */ + uniqueseg[1].flag|= HA_BLOB_PART; + } + else if (extra_field == FIELD_VARCHAR) + { + uniqueseg[1].flag|= HA_VAR_LENGTH_PART; + uniqueseg[1].type= (HA_VARCHAR_PACKLENGTH(recinfo[1].length-1) == 1 ? 
+ HA_KEYTYPE_VARTEXT1 : HA_KEYTYPE_VARTEXT2); + } + } + else + uniques=0; + + if (!silent) + printf("- Creating Aria file\n"); + create_info.max_rows= 0; + create_info.transactional= transactional; + if (maria_create(table_name, record_type, 1, keyinfo,2+opt_unique,recinfo, + uniques, &uniquedef, &create_info, + create_flag)) + goto err; + if (!(file=maria_open(table_name,2,HA_OPEN_ABORT_IF_LOCKED, 0))) + goto err; + if (!silent) + printf("- Writing key:s\n"); + + if (maria_begin(file)) + goto err; + my_errno=0; + for (i= 0 ; i < ROWS_IN_TEST ; i++) + { + create_record(record,i); + if ((error=maria_write(file,record))) + goto err; + } + + if (maria_commit(file) | maria_close(file)) + goto err; + printf("- Data copied\n"); + return 0; + +err: + printf("got error: %3d when using maria-database\n",my_errno); + return 1; /* skip warning */ +} + + +static void create_key_part(uchar *key,uint rownr) +{ + if (keyinfo[0].seg[0].type == HA_KEYTYPE_NUM) + { + sprintf((char*) key,"%*d",keyinfo[0].seg[0].length,rownr); + } + else if (keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT1 || + keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT2) + { /* Alpha record */ + /* Create a key that may be easily packed */ + bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B'); + sprintf((char*) key+keyinfo[0].seg[0].length-2,"%-2d",rownr % 100); + if ((rownr & 7) == 0) + { + /* Change the key to force a unpack of the next key */ + bfill(key+3,keyinfo[0].seg[0].length-5,rownr < 10 ? 'a' : 'b'); + } + } + else + { /* Alpha record */ + if (keyinfo[0].seg[0].flag & HA_SPACE_PACK) + sprintf((char*) key,"%-*d",keyinfo[0].seg[0].length,rownr); + else + { + /* Create a key that may be easily packed */ + bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B'); + sprintf((char*) key+keyinfo[0].seg[0].length-2,"%-2d",rownr % 100); + if ((rownr & 7) == 0) + { + /* Change the key to force a unpack of the next key */ + key[1]= (rownr < 10 ? 
'a' : 'b'); + } + } + } +} + + +static uchar blob_key[MAX_REC_LENGTH]; +static uchar blob_record[MAX_REC_LENGTH+20*20]; + + +static void create_record(uchar *record,uint rownr) +{ + uchar *pos; + bzero((char*) record,MAX_REC_LENGTH); + record[0]=1; /* delete marker */ + if (rownr == 0 && keyinfo[0].seg[0].null_bit) + record[0]|=keyinfo[0].seg[0].null_bit; /* Null key */ + + pos=record+1; + if (recinfo[0].type == FIELD_BLOB) + { + size_t tmp; + uchar *ptr; + create_key_part(blob_key,rownr); + tmp=strlen((char*) blob_key); + int4store(pos,tmp); + ptr=blob_key; + memcpy(pos+4,&ptr,sizeof(char*)); + pos+=recinfo[0].length; + } + else if (recinfo[0].type == FIELD_VARCHAR) + { + size_t tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1); + create_key_part(pos+pack_length,rownr); + tmp= strlen((char*) pos+pack_length); + if (pack_length == 1) + *(uchar*) pos= (uchar) tmp; + else + int2store(pos,tmp); + pos+= recinfo[0].length; + } + else + { + create_key_part(pos,rownr); + pos+=recinfo[0].length; + } + if (recinfo[1].type == FIELD_BLOB) + { + size_t tmp; + uchar *ptr;; + sprintf((char*) blob_record,"... row: %d", rownr); + strappend((char*) blob_record, rownr % MAX_REC_LENGTH,'x'); + tmp=strlen((char*) blob_record); + int4store(pos,tmp); + ptr=blob_record; + memcpy(pos+4,&ptr,sizeof(char*)); + } + else if (recinfo[1].type == FIELD_VARCHAR) + { + size_t tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1); + sprintf((char*) pos+pack_length, "... row: %d", rownr); + tmp= strlen((char*) pos+pack_length); + if (pack_length == 1) + *pos= (uchar) tmp; + else + int2store(pos,tmp); + } + else + { + sprintf((char*) pos,"... 
row: %d", rownr); + strappend((char*) pos,recinfo[1].length,' '); + } +} + +#include "ma_check_standalone.h" diff --git a/storage/maria/test_pack b/storage/maria/test_pack new file mode 100755 index 00000000..689645b1 --- /dev/null +++ b/storage/maria/test_pack @@ -0,0 +1,10 @@ +silent="-s" +suffix="" + +ma_test1$suffix -s ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -us test1 ; maria_chk$suffix -es test1 +ma_test1$suffix -s -S ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ;maria_chk$suffix -us test1 ; maria_chk$suffix -es test1 +ma_test1$suffix -s -b ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 +ma_test1$suffix -s -w ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -ros test1 ; maria_chk$suffix -es test1 + +ma_test2$suffix -s -t4 ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2 +ma_test2$suffix -s -t4 -b ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2 diff --git a/storage/maria/trnman.c b/storage/maria/trnman.c new file mode 100644 index 00000000..7cac6a2d --- /dev/null +++ b/storage/maria/trnman.c @@ -0,0 +1,995 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + + +#include <my_global.h> +#include <my_sys.h> +#include <m_string.h> +#include "trnman.h" +#include "ma_checkpoint.h" +#include "ma_control_file.h" + +/* + status variables: + how many trns in the active list currently, + in the committed list currently, allocated since startup. +*/ +uint trnman_active_transactions, trnman_committed_transactions, + trnman_allocated_transactions; + +#ifdef WORKAROUND_GCC_4_3_2_BUG +volatile +#endif +/* list of active transactions in the trid order */ +static TRN active_list_min, active_list_max; +/* list of committed transactions in the trid order */ +static TRN committed_list_min, committed_list_max; + +/* a counter, used to generate transaction ids */ +static TrID global_trid_generator; + +/* + The minimum existing transaction id for trnman_get_min_trid() + The default value is used when transaction manager not initialize; + Probably called from maria_chk +*/ +static TrID trid_min_read_from= MAX_TRID; + +/* the mutex for everything above */ +static mysql_mutex_t LOCK_trn_list; + +/* LIFO pool of unused TRN structured for reuse */ +static TRN *pool; + +/* a hash for committed transactions that maps trid to a TRN structure */ +static LF_HASH trid_to_trn; + +/* an array that maps short_id of an active transaction to a TRN structure */ +static TRN **short_trid_to_active_trn; + +/* locks for short_trid_to_active_trn and pool */ +static my_bool default_trnman_end_trans_hook(TRN *, my_bool, my_bool); +static void trnman_free_trn(TRN *); + +my_bool (*trnman_end_trans_hook)(TRN *, 
my_bool, my_bool)= + default_trnman_end_trans_hook; + +/* + Simple interface functions + QQ: if they stay so simple, should we make them inline? +*/ + +uint trnman_increment_locked_tables(TRN *trn) +{ + return trn->locked_tables++; +} + +uint trnman_has_locked_tables(TRN *trn) +{ + return trn->locked_tables; +} + +uint trnman_decrement_locked_tables(TRN *trn) +{ + return --trn->locked_tables; +} + +void trnman_reset_locked_tables(TRN *trn, uint locked_tables) +{ + trn->locked_tables= locked_tables; +} + +#ifdef EXTRA_DEBUG +uint16 trnman_get_flags(TRN *trn) +{ + return trn->flags; +} + +void trnman_set_flags(TRN *trn, uint16 flags) +{ + trn->flags= flags; +} +#endif + +/** Wake up threads waiting for this transaction */ +static void wt_thd_release_self(TRN *trn) +{ + if (trn->wt) + { + WT_RESOURCE_ID rc; + rc.type= &ma_rc_dup_unique; + rc.value= (intptr)trn; + wt_thd_release(trn->wt, & rc); + trn->wt= 0; + } +} + +static my_bool +default_trnman_end_trans_hook(TRN *trn __attribute__ ((unused)), + my_bool commit __attribute__ ((unused)), + my_bool active_transactions + __attribute__ ((unused))) +{ + return 0; +} + + +static uchar *trn_get_hash_key(const uchar *trn, size_t *len, + my_bool unused __attribute__ ((unused))) +{ + *len= sizeof(TrID); + return (uchar *) & ((*((TRN **)trn))->trid); +} + + +/** + @brief Initializes transaction manager. + + @param initial_trid Generated TrIDs will start from initial_trid+1. + + @return Operation status + @retval 0 OK + @retval !=0 Error +*/ + +int trnman_init(TrID initial_trid) +{ + DBUG_ENTER("trnman_init"); + DBUG_PRINT("enter", ("initial_trid: %lu", (ulong) initial_trid)); + + short_trid_to_active_trn= (TRN **)my_malloc(PSI_INSTRUMENT_ME, SHORT_TRID_MAX*sizeof(TRN*), + MYF(MY_WME|MY_ZEROFILL)); + if (unlikely(!short_trid_to_active_trn)) + DBUG_RETURN(1); + short_trid_to_active_trn--; /* min short_id is 1 */ + + /* + Initialize lists. 
+ active_list_max.min_read_from must be larger than any trid, + so that when an active list is empty we would could free + all committed list. + And committed_list_max itself can not be freed so + committed_list_max.commit_trid must not be smaller that + active_list_max.min_read_from + */ + + active_list_max.trid= active_list_min.trid= 0; + active_list_max.min_read_from= MAX_TRID; + active_list_max.next= active_list_min.prev= 0; + active_list_max.prev= &active_list_min; + active_list_min.next= &active_list_max; + + committed_list_max.commit_trid= MAX_TRID; + committed_list_max.next= committed_list_min.prev= 0; + committed_list_max.prev= &committed_list_min; + committed_list_min.next= &committed_list_max; + + trnman_active_transactions= 0; + trnman_committed_transactions= 0; + trnman_allocated_transactions= 0; + /* This is needed for recovery and repair */ + dummy_transaction_object.min_read_from= ~(TrID) 0; + dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + + pool= 0; + global_trid_generator= initial_trid; + trid_min_read_from= initial_trid; + lf_hash_init(&trid_to_trn, sizeof(TRN*), LF_HASH_UNIQUE, + 0, 0, trn_get_hash_key, 0); + DBUG_PRINT("info", ("mysql_mutex_init LOCK_trn_list")); + mysql_mutex_init(key_LOCK_trn_list, &LOCK_trn_list, MY_MUTEX_INIT_FAST); + + DBUG_RETURN(0); +} + + +/* + NOTE + this could only be called in the "idle" state - no transaction can be + running. See asserts below. 
+*/ +void trnman_destroy() +{ + DBUG_ENTER("trnman_destroy"); + + if (short_trid_to_active_trn == NULL) /* trnman already destroyed */ + DBUG_VOID_RETURN; + DBUG_ASSERT(trid_to_trn.count == 0); + DBUG_ASSERT(trnman_active_transactions == 0); + DBUG_ASSERT(trnman_committed_transactions == 0); + DBUG_ASSERT(active_list_max.prev == &active_list_min); + DBUG_ASSERT(active_list_min.next == &active_list_max); + DBUG_ASSERT(committed_list_max.prev == &committed_list_min); + DBUG_ASSERT(committed_list_min.next == &committed_list_max); + while (pool) + { + TRN *trn= pool; + pool= pool->next; + DBUG_ASSERT(trn->wt == NULL); + mysql_mutex_destroy(&trn->state_lock); + my_free(trn); + } + lf_hash_destroy(&trid_to_trn); + DBUG_PRINT("info", ("mysql_mutex_destroy LOCK_trn_list")); + mysql_mutex_destroy(&LOCK_trn_list); + my_free(short_trid_to_active_trn+1); + short_trid_to_active_trn= NULL; + + DBUG_VOID_RETURN; +} + + +/* + NOTE + TrID is limited to 6 bytes. Initial value of the generator + is set by the recovery code - being read from the last checkpoint + (or 1 on a first run). 
+*/ +static TrID new_trid() +{ + DBUG_ENTER("new_trid"); + DBUG_ASSERT(global_trid_generator < MAX_INTERNAL_TRID); + DBUG_PRINT("info", ("mysql_mutex_assert_owner LOCK_trn_list")); + mysql_mutex_assert_owner(&LOCK_trn_list); + DBUG_RETURN(++global_trid_generator); +} + +static uint get_short_trid(TRN *trn) +{ + int i= (int) ((global_trid_generator + (intptr)trn) * 312089 % + SHORT_TRID_MAX) + 1; + uint res=0; + + for ( ; !res ; i= 1) + { + for ( ; i <= SHORT_TRID_MAX; i++) /* the range is [1..SHORT_TRID_MAX] */ + { + void *tmp= NULL; + if (short_trid_to_active_trn[i] == NULL && + my_atomic_casptr((void **)&short_trid_to_active_trn[i], &tmp, trn)) + { + res= i; + break; + } + } + } + return res; +} + +/** + Allocates and initializes a new TRN object + + @note the 'wt' parameter can only be 0 in a single-threaded code (or, + generally, where threads cannot block each other), otherwise the + first call to the deadlock detector will sigsegv. +*/ + +TRN *trnman_new_trn(WT_THD *wt) +{ + int res; + TRN *trn; + union { TRN *trn; void *v; } tmp; + DBUG_ENTER("trnman_new_trn"); + + /* + we have a mutex, to do simple things under it - allocate a TRN, + increment trnman_active_transactions, set trn->min_read_from. + + Note that all the above is fast. generating short_id may be slow, + as it involves scanning a large array - so it's done outside of the + mutex. + */ + + DBUG_PRINT("info", ("mysql_mutex_lock LOCK_trn_list")); + mysql_mutex_lock(&LOCK_trn_list); + + /* Allocating a new TRN structure */ + tmp.trn= pool; + /* + Popping an unused TRN from the pool + (ABA isn't possible, we're behind a mutex + */ + while (tmp.trn && !my_atomic_casptr((void **)(char*) &pool, &tmp.v, + (void *)tmp.trn->next)) + /* no-op */; + + /* Nothing in the pool ? Allocate a new one */ + if (!(trn= tmp.trn)) + { + /* + trn should be completely initialized at create time to allow + one to keep a known state on it. 
+ (Like redo_lns, which is assumed to be 0 at start of row handling + and reset to zero before end of row handling) + */ + trn= (TRN *)my_malloc(PSI_INSTRUMENT_ME, sizeof(TRN), MYF(MY_WME | MY_ZEROFILL)); + if (unlikely(!trn)) + { + DBUG_PRINT("info", ("mysql_mutex_unlock LOCK_trn_list")); + mysql_mutex_unlock(&LOCK_trn_list); + return 0; + } + trnman_allocated_transactions++; + mysql_mutex_init(key_TRN_state_lock, &trn->state_lock, MY_MUTEX_INIT_FAST); + } + trn->wt= wt; + trn->pins= lf_hash_get_pins(&trid_to_trn); + if (!trn->pins) + { + trnman_free_trn(trn); + mysql_mutex_unlock(&LOCK_trn_list); + return 0; + } + + trnman_active_transactions++; + + trn->min_read_from= active_list_min.next->trid; + + trn->trid= new_trid(); + + trn->next= &active_list_max; + trn->prev= active_list_max.prev; + active_list_max.prev= trn->prev->next= trn; + trid_min_read_from= active_list_min.next->min_read_from; + DBUG_PRINT("info", ("mysql_mutex_unlock LOCK_trn_list")); + mysql_mutex_unlock(&LOCK_trn_list); + + if (unlikely(!trn->min_read_from)) + { + /* + We are the only transaction. 
Set min_read_from so that we can read + our own rows + */ + trn->min_read_from= trn->trid + 1; + } + + /* no other transaction can read changes done by this one */ + trn->commit_trid= MAX_TRID; + trn->rec_lsn= trn->undo_lsn= trn->first_undo_lsn= 0; + trn->used_tables= 0; + trn->used_instances= 0; + + trn->locked_tables= 0; + trn->flags= 0; + + /* + only after the following function TRN is considered initialized, + so it must be done the last + */ + mysql_mutex_lock(&trn->state_lock); + trn->short_id= get_short_trid(trn); + mysql_mutex_unlock(&trn->state_lock); + + res= lf_hash_insert(&trid_to_trn, trn->pins, &trn); + DBUG_ASSERT(res <= 0); + if (res) + { + trnman_end_trn(trn, 0); + return 0; + } + + DBUG_PRINT("exit", ("trn: %p trid: 0x%lu min_read_from: 0x%lu", + trn, (ulong) trn->trid, (ulong) trn->min_read_from)); + + DBUG_RETURN(trn); +} + + +/* + Initialize a temporary TRN object for logging a new transaction id (trid) + to it. Used by create table to associate a create trid to the table. + + Out: trn->trid is updated with next available trid +*/ + +void trnman_init_tmp_trn_for_logging_trid(TRN *trn) +{ + *trn= dummy_transaction_object; + /* Avoid logging short_id */ + trn->short_id= 1; + /* Trid gets logged in translog_write_record */ + trn->first_undo_lsn= 0; + /* Get next free trid */ + trn->trid= trnman_get_min_safe_trid(); +} + + +/* + remove a trn from the active list. + if necessary - move to committed list and set commit_trid + + NOTE + Locks are released at the end. In particular, after placing the + transaction in commit list, and after setting commit_trid. It's + important, as commit_trid affects visibility. Locks don't affect + anything they simply delay execution of other threads - they could be + released arbitrarily late. In other words, when locks are released it + serves as a start banner for other threads, they start to run. So + everything they may need must be ready at that point. 
+ + RETURN + 0 ok + 1 error +*/ +my_bool trnman_end_trn(TRN *trn, my_bool commit) +{ + int res= 1; + uint16 cached_short_id= trn->short_id; /* we have to cache it, see below */ + TRN *free_me= 0; + LF_PINS *pins= trn->pins; + DBUG_ENTER("trnman_end_trn"); + DBUG_PRINT("enter", ("trn: %p commit: %d", trn, commit)); + + /* if a rollback, all UNDO records should have been executed */ + DBUG_ASSERT(commit || trn->undo_lsn == 0); + DBUG_ASSERT(trn != &dummy_transaction_object); + DBUG_ASSERT(trn->locked_tables == 0 && trn->used_instances == 0); + DBUG_PRINT("info", ("mysql_mutex_lock LOCK_trn_list")); + + mysql_mutex_lock(&LOCK_trn_list); + + /* remove from active list */ + trn->next->prev= trn->prev; + trn->prev->next= trn->next; + + /* + if trn was the oldest active transaction, now that it goes away there + may be committed transactions in the list which no active transaction + needs to bother about - clean up the committed list + */ + if (trn->prev == &active_list_min) + { + uint free_me_count; + TRN *t; + for (t= committed_list_min.next, free_me_count= 0; + t->commit_trid < active_list_min.next->min_read_from; + t= t->next, free_me_count++) /* no-op */; + + DBUG_ASSERT((t != committed_list_min.next && free_me_count > 0) || + (t == committed_list_min.next && free_me_count == 0)); + /* found transactions committed before the oldest active one */ + if (t != committed_list_min.next) + { + free_me= committed_list_min.next; + committed_list_min.next= t; + t->prev->next= 0; + t->prev= &committed_list_min; + trnman_committed_transactions-= free_me_count; + } + } + + mysql_mutex_lock(&trn->state_lock); + if (commit) + trn->commit_trid= global_trid_generator; + wt_thd_release_self(trn); + mysql_mutex_unlock(&trn->state_lock); + + /* + if transaction is committed and it was not the only active transaction - + add it to the committed list + */ + if (commit && active_list_min.next != &active_list_max) + { + trn->next= &committed_list_max; + trn->prev= committed_list_max.prev; + 
trnman_committed_transactions++; + committed_list_max.prev= trn->prev->next= trn; + } + else + { + trn->next= free_me; + free_me= trn; + } + trid_min_read_from= active_list_min.next->min_read_from; + + if ((*trnman_end_trans_hook)(trn, commit, + active_list_min.next != &active_list_max)) + res= -1; + trnman_active_transactions--; + + DBUG_PRINT("info", ("mysql_mutex_unlock LOCK_trn_list")); + mysql_mutex_unlock(&LOCK_trn_list); + + /* + the rest is done outside of a critical section + + note that we don't own trn anymore, it may be in a shared list now. + Thus, we cannot dereference it, and must use cached_short_id below. + */ + my_atomic_storeptr((void **)&short_trid_to_active_trn[cached_short_id], 0); + + /* + we, under the mutex, removed going-in-free_me transactions from the + active and committed lists, thus nobody else may see them when it scans + those lists, and thus nobody may want to free them. Now we don't + need a mutex to access free_me list + */ + /* QQ: send them to the purge thread */ + while (free_me) + { + TRN *t= free_me; + free_me= free_me->next; + + /* ignore OOM. it's harmless, and we can do nothing here anyway */ + (void)lf_hash_delete(&trid_to_trn, pins, &t->trid, sizeof(TrID)); + + trnman_free_trn(t); + } + + lf_hash_put_pins(pins); + + DBUG_RETURN(res < 0); +} + +/* + free a trn (add to the pool, that is) + note - we can never really free() a TRN if there's at least one other + running transaction - see, e.g., how lock waits are implemented in + lockman.c + The same is true for other lock-free data structures too. We may need some + kind of FLUSH command to reset them all - ensuring that no transactions are + running. It may even be called automatically on checkpoints if no + transactions are running. +*/ +static void trnman_free_trn(TRN *trn) +{ + /* + union is to solve strict aliasing issue. + without it gcc 3.4.3 doesn't notice that updating *(void **)&tmp + modifies the value of tmp. 
+ */ + union { TRN *trn; void *v; } tmp; + + DBUG_ASSERT(trn != &dummy_transaction_object); + + mysql_mutex_lock(&trn->state_lock); + trn->short_id= 0; + mysql_mutex_unlock(&trn->state_lock); + + tmp.trn= pool; + + do + { + /* + without this volatile cast gcc-3.4.4 moves the assignment + down after the loop at -O2 + */ + *(TRN * volatile *)&(trn->next)= tmp.trn; + } while (!my_atomic_casptr((void **)(char*)&pool, &tmp.v, trn)); +} + +/* + NOTE + here we access the hash in a lock-free manner. + It's safe, a 'found' TRN can never be freed/reused before we access it. + In fact, it cannot be freed before 'trn' ends, because a 'found' TRN + can only be removed from the hash when: + found->commit_trid < ALL (trn->min_read_from) + that is, at least + found->commit_trid < trn->min_read_from + but + found->trid >= trn->min_read_from + and + found->commit_trid > found->trid + + RETURN + 1 can + 0 cannot + -1 error (OOM) +*/ +int trnman_can_read_from(TRN *trn, TrID trid) +{ + TRN **found; + my_bool can; + + if (trid < trn->min_read_from) + return 1; /* Row is visible by all transactions in the system */ + + if (trid >= trn->trid) + { + /* + We have now two cases + trid > trn->trid, in which case the row is from a new transaction + and not visible, in which case we should return 0. + trid == trn->trid in which case the row is from the current transaction + and we should return 1 + */ + return trid == trn->trid; + } + + found= lf_hash_search(&trid_to_trn, trn->pins, &trid, sizeof(trid)); + if (found == NULL) + return 0; /* not in the hash of transactions = cannot read */ + if (found == MY_ERRPTR) + return -1; + + can= (*found)->commit_trid < trn->trid; + lf_hash_search_unpin(trn->pins); + return can; +} + +/** + Finds a TRN by its TrID + + @param trn current trn. Needed for pinning pointers (see lf_pin) + @param trid trid to search for + + @return found trn or 0 + + @note that trn is returned with its state locked! 
+*/ +TRN *trnman_trid_to_trn(TRN *trn, TrID trid) +{ + TRN **found; + + if (trid < trn->min_read_from) + return 0; /* it's committed eons ago */ + + found= lf_hash_search(&trid_to_trn, trn->pins, &trid, sizeof(trid)); + if (found == NULL || found == MY_ERRPTR) + return 0; /* no luck */ + + /* we've found something */ + mysql_mutex_lock(&(*found)->state_lock); + + if ((*found)->short_id == 0) + { + mysql_mutex_unlock(&(*found)->state_lock); + lf_hash_search_unpin(trn->pins); + return 0; /* but it was a ghost */ + } + lf_hash_search_unpin(trn->pins); + + /* Gotcha! */ + return *found; +} + +/* TODO: the stubs below are waiting for savepoints to be implemented */ + +void trnman_new_statement(TRN *trn __attribute__ ((unused))) +{ +} + +void trnman_rollback_statement(TRN *trn __attribute__ ((unused))) +{ +} + + +/** + @brief Allocates buffers and stores in them some info about transactions + + Does the allocation because the caller cannot know the size itself. + Memory freeing is to be done by the caller (if the "str" member of the + LEX_STRING is not NULL). + The caller has the intention of doing checkpoints. 
+ + @param[out] str_act pointer to where the allocated buffer, + and its size, will be put; buffer will be filled + with info about active transactions + @param[out] str_com pointer to where the allocated buffer, + and its size, will be put; buffer will be filled + with info about committed transactions + @param[out] min_first_undo_lsn pointer to where the minimum + first_undo_lsn of all transactions will be put + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com, + LSN *min_rec_lsn, LSN *min_first_undo_lsn) +{ + my_bool error; + TRN *trn; + char *ptr; + uint stored_transactions= 0; + LSN minimum_rec_lsn= LSN_MAX, minimum_first_undo_lsn= LSN_MAX; + DBUG_ENTER("trnman_collect_transactions"); + + DBUG_ASSERT((NULL == str_act->str) && (NULL == str_com->str)); + + /* validate the use of read_non_atomic() in general: */ + compile_time_assert((sizeof(LSN) == 8) && (sizeof(LSN_WITH_FLAGS) == 8)); + mysql_mutex_lock(&LOCK_trn_list); + str_act->length= 2 + /* number of active transactions */ + LSN_STORE_SIZE + /* minimum of their rec_lsn */ + TRANSID_SIZE + /* current TrID generator value */ + (2 + /* short id */ + 6 + /* long id */ + LSN_STORE_SIZE + /* undo_lsn */ +#ifdef MARIA_VERSIONING /* not enabled yet */ + LSN_STORE_SIZE + /* undo_purge_lsn */ +#endif + LSN_STORE_SIZE /* first_undo_lsn */ + ) * trnman_active_transactions; + str_com->length= 4 + /* number of committed transactions */ + (6 + /* long id */ +#ifdef MARIA_VERSIONING /* not enabled yet */ + LSN_STORE_SIZE + /* undo_purge_lsn */ +#endif + LSN_STORE_SIZE /* first_undo_lsn */ + ) * trnman_committed_transactions; + if ((NULL == (str_act->str= my_malloc(PSI_INSTRUMENT_ME, str_act->length, MYF(MY_WME)))) || + (NULL == (str_com->str= my_malloc(PSI_INSTRUMENT_ME, str_com->length, MYF(MY_WME))))) + goto err; + /* First, the active transactions */ + ptr= str_act->str + 2 + LSN_STORE_SIZE; + transid_store(ptr, 
global_trid_generator); + ptr+= TRANSID_SIZE; + for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next) + { + uint sid; + LSN rec_lsn, undo_lsn, first_undo_lsn; + mysql_mutex_lock(&trn->state_lock); + sid= trn->short_id; + mysql_mutex_unlock(&trn->state_lock); + if (sid == 0) + { + /* + Not even inited, has done nothing. Or it is the + dummy_transaction_object, which does only non-transactional + immediate-sync operations (CREATE/DROP/RENAME/REPAIR TABLE), and so + can be forgotten for Checkpoint. + */ + continue; + } + /* needed for low-water mark calculation */ + if (((rec_lsn= lsn_read_non_atomic(trn->rec_lsn)) > 0) && + (cmp_translog_addr(rec_lsn, minimum_rec_lsn) < 0)) + minimum_rec_lsn= rec_lsn; + /* + If trn has not logged LOGREC_LONG_TRANSACTION_ID, this trn will be + discovered when seeing that log record which is for sure located after + checkpoint_start_log_horizon. + */ + if ((LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn) & + TRANSACTION_LOGGED_LONG_ID) == 0) + continue; + /* + On the other hand, if undo_lsn is LSN_IMPOSSIBLE, trn may later log + records; so we must include trn in the checkpoint now, because we cannot + count on LOGREC_LONG_TRANSACTION_ID (as we are already past it). 
+ */ + undo_lsn= trn->undo_lsn; + stored_transactions++; + int2store(ptr, sid); + ptr+= 2; + int6store(ptr, trn->trid); + ptr+= 6; + lsn_store(ptr, undo_lsn); /* needed for rollback */ + ptr+= LSN_STORE_SIZE; + /* needed for low-water mark calculation */ + if (((first_undo_lsn= lsn_read_non_atomic(trn->first_undo_lsn)) > 0) && + (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0)) + minimum_first_undo_lsn= first_undo_lsn; + lsn_store(ptr, first_undo_lsn); + ptr+= LSN_STORE_SIZE; +#ifdef MARIA_VERSIONING /* not enabled yet */ + /* to know where purging should start (last delete of this trn) */ + lsn_store(ptr, trn->undo_purge_lsn); + ptr+= LSN_STORE_SIZE; +#endif + /** + @todo RECOVERY: add a comment explaining why we can dirtily read some + vars, inspired by the text of "assumption 8" in WL#3072 + */ + } + str_act->length= ptr - str_act->str; /* as we maybe over-estimated */ + ptr= str_act->str; + DBUG_PRINT("info",("collected %u active transactions", + (uint)stored_transactions)); + int2store(ptr, stored_transactions); + ptr+= 2; + /* this LSN influences how REDOs for any page can be ignored by Recovery */ + lsn_store(ptr, minimum_rec_lsn); + /* one day there will also be a list of prepared transactions */ + /* do the same for committed ones */ + ptr= str_com->str; + int4store(ptr, trnman_committed_transactions); + ptr+= 4; + DBUG_PRINT("info",("collected %u committed transactions", + (uint)trnman_committed_transactions)); + for (trn= committed_list_min.next; trn != &committed_list_max; + trn= trn->next) + { + LSN first_undo_lsn; + int6store(ptr, trn->trid); + ptr+= 6; +#ifdef MARIA_VERSIONING /* not enabled yet */ + lsn_store(ptr, trn->undo_purge_lsn); + ptr+= LSN_STORE_SIZE; +#endif + first_undo_lsn= LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn); + if (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0) + minimum_first_undo_lsn= first_undo_lsn; + lsn_store(ptr, first_undo_lsn); + ptr+= LSN_STORE_SIZE; + } + /* + TODO: if we see there exists 
no transaction (active and committed) we can + tell the lock-free structures to do some freeing (my_free()). + */ + error= 0; + *min_rec_lsn= minimum_rec_lsn; + *min_first_undo_lsn= minimum_first_undo_lsn; + goto end; +err: + error= 1; +end: + mysql_mutex_unlock(&LOCK_trn_list); + DBUG_RETURN(error); +} + + +TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid) +{ + TrID old_trid_generator= global_trid_generator; + TRN *trn; + DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded); + global_trid_generator= longid-1; /* force a correct trid in the new trn */ + if (unlikely((trn= trnman_new_trn(NULL)) == NULL)) + return NULL; + /* deallocate excessive allocations of trnman_new_trn() */ + global_trid_generator= old_trid_generator; + set_if_bigger(global_trid_generator, longid); + short_trid_to_active_trn[trn->short_id]= 0; + DBUG_ASSERT(short_trid_to_active_trn[shortid] == NULL); + short_trid_to_active_trn[shortid]= trn; + trn->short_id= shortid; + return trn; +} + + +TRN *trnman_get_any_trn() +{ + TRN *trn= active_list_min.next; + return (trn != &active_list_max) ? trn : NULL; +} + + +/** + Returns the minimum existing transaction id. May return a too small + number in race conditions, but this is ok as the value is used to + remove not visible transid from index/rows. +*/ + +TrID trnman_get_min_trid() +{ + return trid_min_read_from; +} + + +/** + Returns the minimum possible transaction id + + @notes + If there are no transactions running, returns number for next running + transaction. + If one has an active transaction, the returned number will be less or + equal to this. If one is not running in a transaction one will get the + number for the next started transaction. This is used in create table + to get a safe minimum trid to use. 
+*/ + +TrID trnman_get_min_safe_trid() +{ + TrID trid; + mysql_mutex_lock(&LOCK_trn_list); + trid= MY_MIN(active_list_min.next->min_read_from, + global_trid_generator); + mysql_mutex_unlock(&LOCK_trn_list); + return trid; +} + + +/** + Returns maximum transaction id given to a transaction so far. +*/ + +TrID trnman_get_max_trid() +{ + TrID id; + /* Check if trnman has been initialized */ + if (short_trid_to_active_trn == NULL) + return 0; + mysql_mutex_lock(&LOCK_trn_list); + id= global_trid_generator; + mysql_mutex_unlock(&LOCK_trn_list); + return id; +} + +/** + @brief Check if there exists an active transaction between two commit_id's + + @todo + Improve speed of this. + - Store transactions in tree or skip list + - Have function to copying all active transaction id's to b-tree + and use b-tree for checking states. This could be a big win + for checkpoint that will call this function for a lot of objects. + + @return + 0 No transaction exists + 1 There is at least one active transaction in the given range +*/ + +my_bool trnman_exists_active_transactions(TrID min_id, TrID max_id, + my_bool trnman_is_locked) +{ + TRN *trn; + my_bool ret= 0; + + if (!trnman_is_locked) + mysql_mutex_lock(&LOCK_trn_list); + mysql_mutex_assert_owner(&LOCK_trn_list); + for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next) + { + /* + We use <= for max_id as max_id is a commit_trid and trn->trid + is transaction id. When calculating commit_trid we use the + current value of global_trid_generator. global_trid_generator is + incremented for each new transaction. + + For example, assuming we have + min_id = 5 + max_id = 10 + + A trid of value 5 can't see the history event between 5 & 10 + as it was started before min_id 5 was committed. + A trid of value 10 can't see the next history event (max_id = 10) + as it started before this was committed. In this case it must use + this event. 
+ */ + if (trn->trid > min_id && trn->trid <= max_id) + { + ret= 1; + break; + } + } + if (!trnman_is_locked) + mysql_mutex_unlock(&LOCK_trn_list); + return ret; +} + + +/** + lock transaction list +*/ + +void trnman_lock() +{ + mysql_mutex_lock(&LOCK_trn_list); +} + + +/** + unlock transaction list +*/ + +void trnman_unlock() +{ + mysql_mutex_unlock(&LOCK_trn_list); +} + + +/** + Is trman initialized +*/ + +my_bool trman_is_inited() +{ + return (short_trid_to_active_trn != NULL); +} diff --git a/storage/maria/trnman.h b/storage/maria/trnman.h new file mode 100644 index 00000000..588bcdf6 --- /dev/null +++ b/storage/maria/trnman.h @@ -0,0 +1,73 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#ifndef _trnman_h +#define _trnman_h + +C_MODE_START + +#include <lf.h> +#include "trnman_public.h" +#include "ma_loghandler_lsn.h" + +/** + trid - 6 uchar transaction identifier. Assigned when a transaction + is created. Transaction can always be identified by its trid, + even after transaction has ended. + + short_id - 2-byte transaction identifier, identifies a running + transaction, is reassigned when transaction ends. + + when short_id is 0, TRN is not initialized, for all practical purposes + it could be considered unused. 
+ + when commit_trid is MAX_TRID the transaction is running, otherwise it's + committed. + + state_lock mutex protects the state of a TRN, that is whether a TRN + is committed/running/unused. Meaning that modifications of short_id and + commit_trid happen under this mutex. +*/ + +struct st_ma_transaction +{ + LF_PINS *pins; + WT_THD *wt; + mysql_mutex_t state_lock; + void *used_tables; /**< Table shares used by transaction */ + void *used_instances; /* table files used by transaction */ + TRN *next, *prev; + TrID trid, min_read_from, commit_trid; + LSN rec_lsn, undo_lsn; + LSN_WITH_FLAGS first_undo_lsn; + uint locked_tables; + uint16 short_id; + uint16 flags; /**< Various flags */ +}; + +#define TRANSACTION_LOGGED_LONG_ID 0x8000000000000000ULL +#define MAX_TRID (~(TrID)0) +#define MAX_INTERNAL_TRID 0xffffffffffffLL + +extern WT_RESOURCE_TYPE ma_rc_dup_unique; + +#ifdef HAVE_PSI_INTERFACE +extern PSI_mutex_key key_LOCK_trn_list, key_TRN_state_lock; +#endif + +C_MODE_END + +#endif + diff --git a/storage/maria/trnman_public.h b/storage/maria/trnman_public.h new file mode 100644 index 00000000..97b8cc2f --- /dev/null +++ b/storage/maria/trnman_public.h @@ -0,0 +1,86 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + + +/* + External definitions for trnman.h + We need to split this into two files as gcc 4.1.2 gives error if it tries + to include my_atomic.h in C++ code. +*/ + +#ifndef _trnman_public_h +#define _trnman_public_h + +#include "ma_loghandler_lsn.h" +#include <waiting_threads.h> + +C_MODE_START +typedef uint64 TrID; /* our TrID is 6 bytes */ +typedef struct st_ma_transaction TRN; + +#define SHORT_TRID_MAX 65535 + +extern uint trnman_active_transactions, trnman_allocated_transactions; +extern TRN dummy_transaction_object; +extern my_bool (*trnman_end_trans_hook)(TRN *trn, my_bool commit, + my_bool active_transactions); + +int trnman_init(TrID); +void trnman_destroy(void); +TRN *trnman_new_trn(WT_THD *wt); +my_bool trnman_end_trn(TRN *trn, my_bool commit); +#define trnman_commit_trn(T) trnman_end_trn(T, TRUE) +#define trnman_abort_trn(T) trnman_end_trn(T, FALSE) +#define trnman_rollback_trn(T) trnman_end_trn(T, FALSE) +int trnman_can_read_from(TRN *trn, TrID trid); +TRN *trnman_trid_to_trn(TRN *trn, TrID trid); +void trnman_new_statement(TRN *trn); +void trnman_rollback_statement(TRN *trn); +my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com, + LSN *min_rec_lsn, + LSN *min_first_undo_lsn); + +uint trnman_increment_locked_tables(TRN *trn); +uint trnman_decrement_locked_tables(TRN *trn); +uint trnman_has_locked_tables(TRN *trn); +void trnman_reset_locked_tables(TRN *trn, uint locked_tables); +TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid); +TRN *trnman_get_any_trn(void); +void trnman_init_tmp_trn_for_logging_trid(TRN *trn); +TrID trnman_get_min_trid(void); +TrID trnman_get_max_trid(void); +TrID trnman_get_min_safe_trid(); +my_bool trnman_exists_active_transactions(TrID min_id, TrID max_id, + my_bool 
trnman_is_locked); +#define TRANSID_SIZE 6 +#define transid_store(dst, id) int6store(dst,id) +#define transid_korr(P) uint6korr(P) +void trnman_lock(); +void trnman_unlock(); +my_bool trman_is_inited(); +#ifdef EXTRA_DEBUG +uint16 trnman_get_flags(TRN *); +void trnman_set_flags(TRN *, uint16 flags); +#else +#define trnman_get_flags(A) 0 +#define trnman_set_flags(A, B) do { } while (0) +#endif + +/* Flag bits */ +#define TRN_STATE_INFO_LOGGED 1 /* Query is logged */ +#define TRN_STATE_TABLES_CAN_CHANGE 2 /* Things can change during trans. */ + +C_MODE_END +#endif diff --git a/storage/maria/unittest/CMakeLists.txt b/storage/maria/unittest/CMakeLists.txt new file mode 100644 index 00000000..a2da1507 --- /dev/null +++ b/storage/maria/unittest/CMakeLists.txt @@ -0,0 +1,136 @@ +# Copyright (C) 2007 MySQL AB +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/zlib + ${CMAKE_SOURCE_DIR}/unittest/mytap) +LINK_LIBRARIES(aria myisam mytap mysys dbug strings ${ZLIB_LIBRARY}) + +MY_ADD_TESTS(ma_control_file trnman) + +ADD_EXECUTABLE(ma_test_loghandler-t + ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +MY_ADD_TEST(ma_test_loghandler) + +ADD_EXECUTABLE(ma_test_loghandler_multigroup-t + ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c) +MY_ADD_TEST(ma_test_loghandler_multigroup) + +ADD_EXECUTABLE(ma_test_loghandler_multithread-t + ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +MY_ADD_TEST(ma_test_loghandler_multithread) + +ADD_EXECUTABLE(ma_test_loghandler_pagecache-t + ma_test_loghandler_pagecache-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +MY_ADD_TEST(ma_test_loghandler_pagecache) + +ADD_EXECUTABLE(ma_test_loghandler_long-t + ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +SET_TARGET_PROPERTIES(ma_test_loghandler_long-t PROPERTIES COMPILE_FLAGS "-DLONG_LOG_TEST") +MY_ADD_TEST(ma_test_loghandler_long) + +ADD_EXECUTABLE(ma_test_loghandler_noflush-t + ma_test_loghandler_noflush-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +MY_ADD_TEST(ma_test_loghandler_noflush) + +ADD_EXECUTABLE(ma_test_loghandler_first_lsn-t + ma_test_loghandler_first_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +MY_ADD_TEST(ma_test_loghandler_first_lsn) + +ADD_EXECUTABLE(ma_test_loghandler_max_lsn-t + ma_test_loghandler_max_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +MY_ADD_TEST(ma_test_loghandler_max_lsn) + +ADD_EXECUTABLE(ma_test_loghandler_purge-t + ma_test_loghandler_purge-t.c 
ma_maria_log_cleanup.c ma_loghandler_examples.c) +MY_ADD_TEST(ma_test_loghandler_purge) + +ADD_EXECUTABLE(ma_test_loghandler_readonly-t + ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c) +MY_ADD_TEST(ma_test_loghandler_readonly) + +SET_TARGET_PROPERTIES(ma_test_loghandler_readonly-t PROPERTIES COMPILE_FLAGS "-DREADONLY_TEST") +ADD_EXECUTABLE(ma_test_loghandler_nologs-t + ma_test_loghandler_nologs-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c) +MY_ADD_TEST(ma_test_loghandler_nologs) + +SET(ma_pagecache_single_src ma_pagecache_single.c test_file.c test_file.h) +SET(ma_pagecache_consist_src ma_pagecache_consist.c test_file.c test_file.h) +SET(ma_pagecache_common_cppflags "-DEXTRA_DEBUG -DPAGECACHE_DEBUG -DMAIN") + +ADD_EXECUTABLE(ma_pagecache_single_1k-t ${ma_pagecache_single_src}) +SET_TARGET_PROPERTIES(ma_pagecache_single_1k-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024") +MY_ADD_TEST(ma_pagecache_single_1k) + +ADD_EXECUTABLE(ma_pagecache_single_8k-t ${ma_pagecache_single_src}) +SET_TARGET_PROPERTIES(ma_pagecache_single_8k-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=8192 -DBIG") +MY_ADD_TEST(ma_pagecache_single_8k) + +ADD_EXECUTABLE(ma_pagecache_single_64k-t ${ma_pagecache_single_src}) +SET_TARGET_PROPERTIES(ma_pagecache_single_64k-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DBIG") +MY_ADD_TEST(ma_pagecache_single_64k) + +ADD_EXECUTABLE(ma_pagecache_consist_1k-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_1k-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024") +MY_ADD_TEST(ma_pagecache_consist_1k) + +ADD_EXECUTABLE(ma_pagecache_consist_64k-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_64k-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536") 
+MY_ADD_TEST(ma_pagecache_consist_64k) + +ADD_EXECUTABLE(ma_pagecache_consist_1kHC-t + ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_1kHC-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_HIGH_CONCURENCY") +MY_ADD_TEST(ma_pagecache_consist_1kHC) + +ADD_EXECUTABLE(ma_pagecache_consist_64kHC-t + ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_64kHC-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_HIGH_CONCURENCY") +MY_ADD_TEST(ma_pagecache_consist_64kHC) + +ADD_EXECUTABLE(ma_pagecache_consist_1kRD-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_1kRD-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_READERS") +MY_ADD_TEST(ma_pagecache_consist_1kRD) + +ADD_EXECUTABLE(ma_pagecache_consist_64kRD-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_64kRD-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_READERS") +MY_ADD_TEST(ma_pagecache_consist_64kRD) + +ADD_EXECUTABLE(ma_pagecache_consist_1kWR-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_1kWR-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_WRITERS") +MY_ADD_TEST(ma_pagecache_consist_1kWR) + +ADD_EXECUTABLE(ma_pagecache_consist_64kWR-t ${ma_pagecache_consist_src}) +SET_TARGET_PROPERTIES(ma_pagecache_consist_64kWR-t + PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_WRITERS") +MY_ADD_TEST(ma_pagecache_consist_64kWR) + +ADD_EXECUTABLE(ma_pagecache_rwconsist_1k-t ma_pagecache_rwconsist.c) +SET_TARGET_PROPERTIES(ma_pagecache_rwconsist_1k-t PROPERTIES COMPILE_FLAGS "-DTEST_PAGE_SIZE=1024") +MY_ADD_TEST(ma_pagecache_rwconsist_1k) + +ADD_EXECUTABLE(ma_pagecache_rwconsist2_1k-t ma_pagecache_rwconsist2.c) +SET_TARGET_PROPERTIES(ma_pagecache_rwconsist2_1k-t 
PROPERTIES COMPILE_FLAGS "-DTEST_PAGE_SIZE=1024") +MY_ADD_TEST(ma_pagecache_rwconsist2_1k) + diff --git a/storage/maria/unittest/lockman-t.c b/storage/maria/unittest/lockman-t.c new file mode 100644 index 00000000..222618ac --- /dev/null +++ b/storage/maria/unittest/lockman-t.c @@ -0,0 +1,303 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + lockman for row and table locks +*/ + +/* #define EXTRA_VERBOSE */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <lf.h> +#include "../lockman.h" + +#define Nlos 100 +LOCK_OWNER loarray[Nlos]; +pthread_mutex_t mutexes[Nlos]; +pthread_cond_t conds[Nlos]; +LOCKMAN lockman; + +#ifndef EXTRA_VERBOSE +#define print_lockhash(X) /* no-op */ +#define DIAG(X) /* no-op */ +#else +#define DIAG(X) diag X +#endif + +LOCK_OWNER *loid2lo(uint16 loid) +{ + return loarray+loid-1; +} + +#define unlock_all(O) diag("lo" #O "> release all locks"); \ + lockman_release_locks(&lockman, loid2lo(O));print_lockhash(&lockman) +#define test_lock(O, R, L, S, RES) \ + ok(lockman_getlock(&lockman, loid2lo(O), R, L) == RES, \ + "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \ + print_lockhash(&lockman) +#define lock_ok_a(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK) +#define lock_ok_i(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE) +#define 
lock_ok_l(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE) +#define lock_conflict(O, R, L) \ + test_lock(O, R, L, "cannot ", DIDNT_GET_THE_LOCK); + +void test_lockman_simple() +{ + /* simple */ + lock_ok_a(1, 1, S); + lock_ok_i(2, 2, IS); + lock_ok_i(1, 2, IX); + /* lock escalation */ + lock_ok_a(1, 1, X); + lock_ok_i(2, 2, IX); + /* failures */ + lock_conflict(2, 1, X); + unlock_all(2); + lock_ok_a(1, 2, S); + lock_ok_a(1, 2, IS); + lock_ok_a(1, 2, LS); + lock_ok_i(1, 3, IX); + lock_ok_a(2, 3, LS); + lock_ok_i(1, 3, IX); + lock_ok_l(2, 3, IS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_conflict(2, 1, S); + lock_ok_a(1, 1, LS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_ok_a(2, 1, LS); + lock_ok_a(1, 1, LS); + lock_ok_i(1, 1, IX); + lock_ok_i(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_i(1, 4, IS); + lock_ok_i(2, 4, IS); + lock_ok_i(3, 4, IS); + lock_ok_a(3, 4, LS); + lock_ok_i(4, 4, IS); + lock_conflict(4, 4, IX); + lock_conflict(2, 4, IX); + lock_ok_a(1, 4, LS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + unlock_all(4); + + lock_ok_i(1, 1, IX); + lock_ok_i(2, 1, IX); + lock_conflict(1, 1, S); + lock_conflict(2, 1, X); + unlock_all(1); + unlock_all(2); +} + +int rt_num_threads; +int litmus; +int thread_number= 0, timeouts= 0; +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + thread_number= timeouts= 0; + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0)); + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Running %s with %d threads, %d iterations... 
", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, 0, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus); + my_free((void*)threads, MYF(0)); +} + +pthread_mutex_t rt_mutex; +int Nrows= 100; +int Ntables= 10; +int table_lock_ratio= 10; +enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX}; +char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"}; +char *res2str[4]= { + "DIDN'T GET THE LOCK", + "GOT THE LOCK", + "GOT THE LOCK NEED TO LOCK A SUBRESOURCE", + "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"}; +pthread_handler_t test_lockman(void *arg) +{ + int m= (*(int *)arg); + uint x, loid, row, table, res, locklevel, timeout= 0; + LOCK_OWNER *lo; + + pthread_mutex_lock(&rt_mutex); + loid= ++thread_number; + pthread_mutex_unlock(&rt_mutex); + lo= loid2lo(loid); + + for (x= ((int)(intptr)(&m)); m > 0; m--) + { + x= (x*3628273133 + 1500450271) % 9576890767; /* three prime numbers */ + row= x % Nrows + Ntables; + table= row % Ntables; + locklevel= (x/Nrows) & 3; + if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0) + { /* table lock */ + res= lockman_getlock(&lockman, lo, table, lock_array[locklevel]); + DIAG(("loid %2d, table %d, lock %s, res %s", loid, table, + lock2str[locklevel], res2str[res])); + if (res == DIDNT_GET_THE_LOCK) + { + lockman_release_locks(&lockman, lo); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + } + else + { /* row lock */ + locklevel&= 1; + res= lockman_getlock(&lockman, lo, table, lock_array[locklevel + 4]); + DIAG(("loid %2d, row %d, lock %s, res %s", loid, row, + lock2str[locklevel+4], res2str[res])); + switch (res) + { + case DIDNT_GET_THE_LOCK: + lockman_release_locks(&lockman, lo); + DIAG(("loid %2d, release all locks", loid)); + 
timeout++; + continue; + case GOT_THE_LOCK: + continue; + case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE: + /* not implemented, so take a regular lock */ + case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE: + res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]); + DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row, + lock2str[locklevel], res2str[res])); + if (res == DIDNT_GET_THE_LOCK) + { + lockman_release_locks(&lockman, lo); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + continue; + default: + DBUG_ASSERT(0); + } + } + } + + lockman_release_locks(&lockman, lo); + + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + timeouts+= timeout; + if (!rt_num_threads) + diag("number of timeouts: %d", timeouts); + pthread_mutex_unlock(&rt_mutex); + + return 0; +} + +int main() +{ + int i; + + my_init(); + pthread_mutex_init(&rt_mutex, 0); + + plan(35); + + lockman_init(&lockman, &loid2lo, 50); + + for (i= 0; i < Nlos; i++) + { + loarray[i].pins= lf_alloc_get_pins(&lockman.alloc); + loarray[i].all_locks= 0; + loarray[i].waiting_for= 0; + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init (&conds[i], 0); + loarray[i].mutex= &mutexes[i]; + loarray[i].cond= &conds[i]; + loarray[i].loid= i+1; + } + + test_lockman_simple(); + +#define CYCLES 10000 +#define THREADS Nlos /* don't change this line */ + + /* mixed load, stress-test with random locks */ + Nrows= 100; + Ntables= 10; + table_lock_ratio= 10; + run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES); + + /* "real-life" simulation - many rows, no table locks */ + Nrows= 1000000; + Ntables= 10; + table_lock_ratio= 0; + run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10); + + for (i= 0; i < Nlos; i++) + { + lockman_release_locks(&lockman, &loarray[i]); + pthread_mutex_destroy(loarray[i].mutex); + pthread_cond_destroy(loarray[i].cond); + lf_pinbox_put_pins(loarray[i].pins); + } + + { + 
ulonglong now= my_getsystime(); + lockman_destroy(&lockman); + now= my_getsystime()-now; + diag("lockman_destroy: %g secs", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/maria/unittest/lockman1-t.c b/storage/maria/unittest/lockman1-t.c new file mode 100644 index 00000000..986ac73e --- /dev/null +++ b/storage/maria/unittest/lockman1-t.c @@ -0,0 +1,329 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + lockman for row locks, tablockman for table locks +*/ + +/* #define EXTRA_VERBOSE */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <lf.h> +#include "../lockman.h" +#include "../tablockman.h" + +#define Nlos 100 +#define Ntbls 10 +LOCK_OWNER loarray[Nlos]; +TABLE_LOCK_OWNER loarray1[Nlos]; +pthread_mutex_t mutexes[Nlos]; +pthread_cond_t conds[Nlos]; +LOCKED_TABLE ltarray[Ntbls]; +LOCKMAN lockman; +TABLOCKMAN tablockman; + +#ifndef EXTRA_VERBOSE +#define print_lo1(X) /* no-op */ +#define DIAG(X) /* no-op */ +#else +#define DIAG(X) diag X +#endif + +LOCK_OWNER *loid2lo(uint16 loid) +{ + return loarray+loid-1; +} +TABLE_LOCK_OWNER *loid2lo1(uint16 loid) +{ + return loarray1+loid-1; +} + +#define unlock_all(O) diag("lo" #O "> release all locks"); \ + tablockman_release_locks(&tablockman, loid2lo1(O)); +#define test_lock(O, R, 
L, S, RES) \ + ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES, \ + "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \ + print_lo1(loid2lo1(O)); +#define lock_ok_a(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK) +#define lock_ok_i(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE) +#define lock_ok_l(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE) +#define lock_conflict(O, R, L) \ + test_lock(O, R, L, "cannot ", LOCK_TIMEOUT); + +void test_tablockman_simple() +{ + /* simple */ + lock_ok_a(1, 1, S); + lock_ok_i(2, 2, IS); + lock_ok_i(1, 2, IX); + /* lock escalation */ + lock_ok_a(1, 1, X); + lock_ok_i(2, 2, IX); + /* failures */ + lock_conflict(2, 1, X); + unlock_all(2); + lock_ok_a(1, 2, S); + lock_ok_a(1, 2, IS); + lock_ok_a(1, 2, LS); + lock_ok_i(1, 3, IX); + lock_ok_a(2, 3, LS); + lock_ok_i(1, 3, IX); + lock_ok_l(2, 3, IS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_conflict(2, 1, S); + lock_ok_a(1, 1, LS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_ok_a(2, 1, LS); + lock_ok_a(1, 1, LS); + lock_ok_i(1, 1, IX); + lock_ok_i(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_i(1, 4, IS); + lock_ok_i(2, 4, IS); + lock_ok_i(3, 4, IS); + lock_ok_a(3, 4, LS); + lock_ok_i(4, 4, IS); + lock_conflict(4, 4, IX); + lock_conflict(2, 4, IX); + lock_ok_a(1, 4, LS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + unlock_all(4); + + lock_ok_i(1, 1, IX); + lock_ok_i(2, 1, IX); + lock_conflict(1, 1, S); + lock_conflict(2, 1, X); + unlock_all(1); + unlock_all(2); +} + +int rt_num_threads; +int litmus; +int thread_number= 0, timeouts= 0; +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + thread_number= timeouts= 0; + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0)); + if (!threads) + { + diag("Out of
memory"); + abort(); + } + + diag("Running %s with %d threads, %d iterations... ", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, 0, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus); + my_free((void*)threads, MYF(0)); +} + +pthread_mutex_t rt_mutex; +int Nrows= 100; +int Ntables= 10; +int table_lock_ratio= 10; +enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX}; +char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"}; +char *res2str[]= { + "DIDN'T GET THE LOCK", + "OUT OF MEMORY", + "DEADLOCK", + "LOCK TIMEOUT", + "GOT THE LOCK", + "GOT THE LOCK NEED TO LOCK A SUBRESOURCE", + "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"}; +pthread_handler_t test_lockman(void *arg) +{ + int m= (*(int *)arg); + uint x, loid, row, table, res, locklevel, timeout= 0; + LOCK_OWNER *lo; TABLE_LOCK_OWNER *lo1; DBUG_ASSERT(Ntables <= Ntbls); + + pthread_mutex_lock(&rt_mutex); + loid= ++thread_number; + pthread_mutex_unlock(&rt_mutex); + lo= loid2lo(loid); lo1= loid2lo1(loid); + + for (x= ((int)(intptr)(&m)); m > 0; m--) + { + x= (x*3628273133 + 1500450271) % 9576890767; /* three prime numbers */ + row= x % Nrows + Ntables; + table= row % Ntables; + locklevel= (x/Nrows) & 3; + if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0) + { /* table lock */ + res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel]); + DIAG(("loid %2d, table %d, lock %s, res %s", loid, table, + lock2str[locklevel], res2str[res])); + if (res < GOT_THE_LOCK) + { + lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + } + else + { /* row lock */ + locklevel&= 1; + res= tablockman_getlock(&tablockman, lo1, 
ltarray+table, lock_array[locklevel + 4]); + DIAG(("loid %2d, row %d, lock %s, res %s", loid, row, + lock2str[locklevel+4], res2str[res])); + switch (res) + { + case GOT_THE_LOCK: + continue; + case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE: + /* not implemented, so take a regular lock */ + case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE: + res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]); + DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row, + lock2str[locklevel], res2str[res])); + if (res == DIDNT_GET_THE_LOCK) + { + lockman_release_locks(&lockman, lo); + tablockman_release_locks(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + continue; + default: + lockman_release_locks(&lockman, lo); tablockman_release_locks(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + } + } + + lockman_release_locks(&lockman, lo); + tablockman_release_locks(&tablockman, lo1); + + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + timeouts+= timeout; + if (!rt_num_threads) + diag("number of timeouts: %d", timeouts); + pthread_mutex_unlock(&rt_mutex); + + return 0; +} + +int main() +{ + int i; + + my_init(); + pthread_mutex_init(&rt_mutex, 0); + + plan(35); + + lockman_init(&lockman, &loid2lo, 50); + tablockman_init(&tablockman, &loid2lo1, 50); + + for (i= 0; i < Nlos; i++) + { + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init (&conds[i], 0); + + loarray[i].pins= lf_alloc_get_pins(&lockman.alloc); + loarray[i].all_locks= 0; + loarray[i].waiting_for= 0; + loarray[i].mutex= &mutexes[i]; + loarray[i].cond= &conds[i]; + loarray[i].loid= i+1; + + loarray1[i].active_locks= 0; + loarray1[i].waiting_lock= 0; + loarray1[i].waiting_for= 0; + loarray1[i].mutex= &mutexes[i]; + loarray1[i].cond= &conds[i]; + loarray1[i].loid= i+1; + } + + for (i= 0; i < Ntbls; i++) + { + tablockman_init_locked_table(ltarray+i, Nlos); + } + + 
test_tablockman_simple(); + +#define CYCLES 10000 +#define THREADS Nlos /* don't change this line */ + + /* mixed load, stress-test with random locks */ + Nrows= 100; + Ntables= 10; + table_lock_ratio= 10; + run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES); + + /* "real-life" simulation - many rows, no table locks */ + Nrows= 1000000; + Ntables= 10; + table_lock_ratio= 0; + run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10); + + for (i= 0; i < Nlos; i++) + { + lockman_release_locks(&lockman, &loarray[i]); + pthread_mutex_destroy(loarray[i].mutex); + pthread_cond_destroy(loarray[i].cond); + lf_pinbox_put_pins(loarray[i].pins); + } + + { + ulonglong now= my_getsystime(); + lockman_destroy(&lockman); + now= my_getsystime()-now; + diag("lockman_destroy: %g secs", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/maria/unittest/lockman2-t.c b/storage/maria/unittest/lockman2-t.c new file mode 100644 index 00000000..7f9ed3cc --- /dev/null +++ b/storage/maria/unittest/lockman2-t.c @@ -0,0 +1,356 @@ +/* Copyright (C) 2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + tablockman for row and table locks +*/ + +/* #define EXTRA_VERBOSE */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <lf.h> +#include "../tablockman.h" + +#define Nlos 100 +#define Ntbls 110 +TABLE_LOCK_OWNER loarray1[Nlos]; +pthread_mutex_t mutexes[Nlos]; +pthread_cond_t conds[Nlos]; +LOCKED_TABLE ltarray[Ntbls]; +TABLOCKMAN tablockman; + +#ifndef EXTRA_VERBOSE +#define print_lo1(X) /* no-op */ +#define DIAG(X) /* no-op */ +#else +#define DIAG(X) diag X +#endif + +TABLE_LOCK_OWNER *loid2lo1(uint16 loid) +{ + return loarray1+loid-1; +} + +#define unlock_all(O) diag("lo" #O "> release all locks"); \ + tablockman_release_locks(&tablockman, loid2lo1(O)); +#define test_lock(O, R, L, S, RES) \ + ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES, \ + "lo" #O "> " S "lock resource " #R " with " #L "-lock"); \ + print_lo1(loid2lo1(O)); +#define lock_ok_a(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK) +#define lock_ok_i(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE) +#define lock_ok_l(O, R, L) \ + test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE) +#define lock_conflict(O, R, L) \ + test_lock(O, R, L, "cannot ", LOCK_TIMEOUT); + +void test_tablockman_simple() +{ + /* simple */ + lock_ok_a(1, 1, S); + lock_ok_i(2, 2, IS); + lock_ok_i(1, 2, IX); + /* lock escalation */ + lock_ok_a(1, 1, X); + lock_ok_i(2, 2, IX); + /* failures */ + lock_conflict(2, 1, X); + unlock_all(2); + lock_ok_a(1, 2, S); + lock_ok_a(1, 2, IS); + lock_ok_a(1, 2, LS); + lock_ok_i(1, 3, IX); + lock_ok_a(2, 3, LS); + lock_ok_i(1, 3, IX); + lock_ok_l(2, 3, IS); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_conflict(2, 1, S); + lock_ok_a(1, 1, LS); + unlock_all(1); +
unlock_all(2); + + lock_ok_i(1, 1, IX); + lock_ok_a(2, 1, LS); + lock_ok_a(1, 1, LS); + lock_ok_i(1, 1, IX); + lock_ok_i(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_i(1, 4, IS); + lock_ok_i(2, 4, IS); + lock_ok_i(3, 4, IS); + lock_ok_a(3, 4, LS); + lock_ok_i(4, 4, IS); + lock_conflict(4, 4, IX); + lock_conflict(2, 4, IX); + lock_ok_a(1, 4, LS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + unlock_all(4); + + lock_ok_i(1, 1, IX); + lock_ok_i(2, 1, IX); + lock_conflict(1, 1, S); + lock_conflict(2, 1, X); + unlock_all(1); + unlock_all(2); + + lock_ok_i(1, 1, IS); + lock_conflict(2, 1, X); + lock_conflict(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); + + lock_ok_a(1, 1, S); + lock_conflict(2, 1, IX); + lock_conflict(3, 1, IS); + unlock_all(1); + unlock_all(2); + unlock_all(3); +} + +int rt_num_threads; +int litmus; +int thread_number= 0, timeouts= 0; +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= my_getsystime(); + int i; + + thread_number= timeouts= 0; + litmus= 0; + + threads= (pthread_t *)my_malloc(sizeof(void *)*n, MYF(0)); + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Running %s with %d threads, %d iterations... 
", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, 0, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= my_getsystime()-now; + ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus); + my_free((void*)threads, MYF(0)); +} + +static void reinit_tlo(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo) +{ +#ifdef NOT_USED_YET + TABLE_LOCK_OWNER backup= *lo; +#endif + + tablockman_release_locks(lm, lo); +#ifdef NOT_USED_YET + pthread_mutex_destroy(lo->mutex); + pthread_cond_destroy(lo->cond); + bzero(lo, sizeof(*lo)); + + lo->mutex= backup.mutex; + lo->cond= backup.cond; + lo->loid= backup.loid; + pthread_mutex_init(lo->mutex, MY_MUTEX_INIT_FAST); + pthread_cond_init(lo->cond, 0); +#endif +} + +pthread_mutex_t rt_mutex; +int Nrows= 100; +int Ntables= 10; +int table_lock_ratio= 10; +enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX}; +const char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"}; +const char *res2str[]= { + 0, + "OUT OF MEMORY", + "DEADLOCK", + "LOCK TIMEOUT", + "GOT THE LOCK", + "GOT THE LOCK NEED TO LOCK A SUBRESOURCE", + "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"}; + +pthread_handler_t test_lockman(void *arg) +{ + int m= (*(int *)arg); + uint x, loid, row, table, res, locklevel, timeout= 0; + TABLE_LOCK_OWNER *lo1; + DBUG_ASSERT(Ntables <= Ntbls); + DBUG_ASSERT(Nrows + Ntables <= Ntbls); + + pthread_mutex_lock(&rt_mutex); + loid= ++thread_number; + pthread_mutex_unlock(&rt_mutex); + lo1= loid2lo1(loid); + + for (x= ((int)(intptr)(&m)); m > 0; m--) + { + /* three prime numbers */ + x= (uint) ((x*LL(3628273133) + LL(1500450271)) % LL(9576890767)); + row= x % Nrows + Ntables; + table= row % Ntables; + locklevel= (x/Nrows) & 3; + if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0) + { + /* table lock */ + res= tablockman_getlock(&tablockman, lo1, ltarray+table, + lock_array[locklevel]); + 
DIAG(("loid %2d, table %d, lock %s, res %s", loid, table, + lock2str[locklevel], res2str[res])); + if (res < GOT_THE_LOCK) + { + reinit_tlo(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + } + else + { /* row lock */ + locklevel&= 1; + res= tablockman_getlock(&tablockman, lo1, ltarray+table, lock_array[locklevel + 4]); + DIAG(("loid %2d, row %d, lock %s, res %s", loid, row, + lock2str[locklevel+4], res2str[res])); + switch (res) + { + case GOT_THE_LOCK: + continue; + case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE: + /* not implemented, so take a regular lock */ + case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE: + res= tablockman_getlock(&tablockman, lo1, ltarray+row, lock_array[locklevel]); + DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row, + lock2str[locklevel], res2str[res])); + if (res < GOT_THE_LOCK) + { + reinit_tlo(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + DBUG_ASSERT(res == GOT_THE_LOCK); + continue; + default: + reinit_tlo(&tablockman, lo1); + DIAG(("loid %2d, release all locks", loid)); + timeout++; + continue; + } + } + } + + reinit_tlo(&tablockman, lo1); + + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + timeouts+= timeout; + if (!rt_num_threads) + diag("number of timeouts: %d", timeouts); + pthread_mutex_unlock(&rt_mutex); + + return 0; +} + +int main(int argc __attribute__((unused)), char **argv) +{ + int i; + MY_INIT(argv[0]); + + my_init(); + pthread_mutex_init(&rt_mutex, 0); + + plan(40); + + tablockman_init(&tablockman, &loid2lo1, 50); + + for (i= 0; i < Nlos; i++) + { + pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST); + pthread_cond_init (&conds[i], 0); + + loarray1[i].active_locks= 0; + loarray1[i].waiting_lock= 0; + loarray1[i].waiting_for= 0; + loarray1[i].mutex= &mutexes[i]; + loarray1[i].cond= &conds[i]; + loarray1[i].loid= i+1; + } + + for (i= 0; i < Ntbls; i++) + { + 
tablockman_init_locked_table(ltarray+i, Nlos); + } + + test_tablockman_simple(); + +#define CYCLES 10000 +#define THREADS Nlos /* don't change this line */ + + /* mixed load, stress-test with random locks */ + Nrows= 100; + Ntables= 10; + table_lock_ratio= 10; + run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES); +#if 0 + /* "real-life" simulation - many rows, no table locks */ + Nrows= 1000000; + Ntables= 10; + table_lock_ratio= 0; + run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10); +#endif + for (i= 0; i < Nlos; i++) + { + tablockman_release_locks(&tablockman, &loarray1[i]); + pthread_mutex_destroy(loarray1[i].mutex); + pthread_cond_destroy(loarray1[i].cond); + } + + { + ulonglong now= my_getsystime(); + for (i= 0; i < Ntbls; i++) + { + tablockman_destroy_locked_table(ltarray+i); + } + tablockman_destroy(&tablockman); + now= my_getsystime()-now; + diag("lockman_destroy: %g secs", ((double)now)/1e7); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return exit_status(); +} + diff --git a/storage/maria/unittest/ma_control_file-t.c b/storage/maria/unittest/ma_control_file-t.c new file mode 100644 index 00000000..859d5514 --- /dev/null +++ b/storage/maria/unittest/ma_control_file-t.c @@ -0,0 +1,623 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* Unit test of the control file module of the Aria engine WL#3234 */ + +/* + Note that it is not possible to test the durability of the write (can't + pull the plug programmatically :) +*/ + +#include <my_global.h> +#include <my_sys.h> +#include <tap.h> +#ifdef _WIN32 +#include <direct.h> /* rmdir */ +#endif +#ifndef WITH_ARIA_STORAGE_ENGINE +/* + If Aria is not compiled in, normally we don't come to building this test. +*/ +#error "Aria engine is not compiled in, test cannot be built" +#endif + +#include "maria.h" +#include "../../../storage/maria/maria_def.h" +#include <my_getopt.h> + +#define EXTRACT_DEFINITIONS +#include "../ma_control_file.c" +#undef EXTRACT_DEFINITIONS + +char file_name[FN_REFLEN]; + +/* The values we'll set and expect the control file module to return */ +LSN expect_checkpoint_lsn; +uint32 expect_logno; +TrID expect_max_trid; +uint8 expect_recovery_failures; + +static int delete_file(myf my_flags); +/* + Those are test-specific wrappers around the module's API functions: after + calling the module's API functions they perform checks on the result. 
+*/ +static int close_file(void); /* wraps ma_control_file_end */ +/* wraps ma_control_file_open_or_create */ +static int open_file(void); +/* wraps ma_control_file_write_and_force */ +static int write_file(LSN checkpoint_lsn, uint32 logno, TrID trid, + uint8 rec_failures); + +/* Tests */ +static int test_one_log_and_recovery_failures(void); +static int test_five_logs_and_max_trid(void); +static int test_3_checkpoints_and_2_logs(void); +static int test_binary_content(void); +static int test_start_stop(void); +static int test_2_open_and_2_close(void); +static int test_bad_magic_string(void); +static int test_bad_checksum(void); +static int test_bad_hchecksum(void); +static int test_future_size(void); +static int test_bad_blocksize(void); +static int test_bad_size(void); + +/* Utility */ +static int verify_module_values_match_expected(void); +static int verify_module_values_are_impossible(void); +static void usage(void); +static void get_options(int argc, char *argv[]); + +/* + If "expr" is FALSE, this macro will make the function print a diagnostic + message and immediately return 1. + This is inspired from assert() but does not crash the binary (sometimes we + may want to see how other tests go even if one fails). + RET_ERR means "return error". 
+*/ + +#define RET_ERR_UNLESS(expr) \ + {if (!(expr)) {diag("line %d: failure: '%s'", __LINE__, #expr); assert(0);return 1;}} + + +/* Used to ignore error messages from ma_control_file_open() */ + +static void my_ignore_message(uint error __attribute__((unused)), + const char *str __attribute__((unused)), + myf MyFlags __attribute__((unused))) +{ + DBUG_ENTER("my_message_no_curses"); + DBUG_PRINT("enter",("message: %s",str)); + DBUG_VOID_RETURN; +} + +void (*default_error_handler_hook)(uint my_err, const char *str, + myf MyFlags) = 0; + + +/* like ma_control_file_open(), but without error messages */ + +static CONTROL_FILE_ERROR local_ma_control_file_open(void) +{ + CONTROL_FILE_ERROR error; + error_handler_hook= my_ignore_message; + error= ma_control_file_open(TRUE, TRUE, TRUE); + error_handler_hook= default_error_handler_hook; + return error; +} + +static char *create_tmpdir(const char *progname) +{ + static char test_dirname[FN_REFLEN]; + char tmp_name[FN_REFLEN]; + size_t length; + + /* Create a temporary directory of name TMP-'executable', but without the -t extension */ + fn_format(tmp_name, progname, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT); + length= strlen(tmp_name); + if (length > 2 && tmp_name[length-2] == '-' && tmp_name[length-1] == 't') + tmp_name[length-2]= 0; + strxmov(test_dirname, "TMP-", tmp_name, NullS); + + /* + Don't give an error if we can't create dir, as it may already exist from a previously aborted + run + */ + (void) my_mkdir(test_dirname, 0777, MYF(0)); + return test_dirname; +} + + +int main(int argc,char *argv[]) +{ + MY_INIT(argv[0]); + my_init(); + + default_error_handler_hook= error_handler_hook; + + plan(12); + + maria_data_root= create_tmpdir(argv[0]); + + diag("Unit tests for control file"); + + get_options(argc,argv); + + diag("Deleting control file at startup, if there is an old one"); + RET_ERR_UNLESS(0 == delete_file(0)); /* if fails, can't continue */ + + diag("Tests of normal conditions"); + ok(0 == 
test_one_log_and_recovery_failures(), + "test of creating one log and recording recovery failures"); + ok(0 == test_five_logs_and_max_trid(), + "test of creating five logs and many transactions"); + ok(0 == test_3_checkpoints_and_2_logs(), + "test of creating three checkpoints and two logs"); + ok(0 == test_binary_content(), "test of the binary content of the file"); + ok(0 == test_start_stop(), "test of multiple starts and stops"); + diag("Tests of abnormal conditions"); + ok(0 == test_2_open_and_2_close(), + "test of two open and two close (strange call sequence)"); + ok(0 == test_bad_magic_string(), "test of bad magic string"); + ok(0 == test_bad_checksum(), "test of bad checksum"); + ok(0 == test_bad_hchecksum(), "test of bad hchecksum"); + ok(0 == test_future_size(), "test of ability to handlr future versions"); + ok(0 == test_bad_blocksize(), "test of bad blocksize"); + ok(0 == test_bad_size(), "test of too small/big file"); + + delete_file(0); + rmdir(maria_data_root); + + my_uuid_end(); + my_end(0); + return exit_status(); +} + + +static int delete_file(myf my_flags) +{ + RET_ERR_UNLESS(fn_format(file_name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) != NullS); + /* + Maybe file does not exist, ignore error. + The error will however be printed on stderr. + */ + my_delete(file_name, my_flags); + expect_checkpoint_lsn= LSN_IMPOSSIBLE; + expect_logno= FILENO_IMPOSSIBLE; + expect_max_trid= expect_recovery_failures= 0; + + return 0; +} + +/* + Verifies that global values last_checkpoint_lsn, last_logno, + max_trid_in_control_file (belonging to the module) match what we expect. 
+*/ +static int verify_module_values_match_expected(void) +{ + RET_ERR_UNLESS(last_logno == expect_logno); + RET_ERR_UNLESS(last_checkpoint_lsn == expect_checkpoint_lsn); + RET_ERR_UNLESS(max_trid_in_control_file == expect_max_trid); + RET_ERR_UNLESS(recovery_failures == expect_recovery_failures); + return 0; +} + + +/* + Verifies that global values last_checkpoint_lsn and last_logno (belonging + to the module) are impossible (this is used when the file has been closed). +*/ +static int verify_module_values_are_impossible(void) +{ + RET_ERR_UNLESS(last_logno == FILENO_IMPOSSIBLE); + RET_ERR_UNLESS(last_checkpoint_lsn == LSN_IMPOSSIBLE); + RET_ERR_UNLESS(max_trid_in_control_file == 0); + return 0; +} + + +static int close_file(void) +{ + /* Simulate shutdown */ + ma_control_file_end(); + /* Verify amnesia */ + RET_ERR_UNLESS(verify_module_values_are_impossible() == 0); + return 0; +} + +static int open_file(void) +{ + RET_ERR_UNLESS(local_ma_control_file_open() == CONTROL_FILE_OK); + /* Check that the module reports expected information */ + RET_ERR_UNLESS(verify_module_values_match_expected() == 0); + return 0; +} + +static int write_file(LSN checkpoint_lsn, uint32 logno, TrID trid, + uint8 rec_failures) +{ + RET_ERR_UNLESS(ma_control_file_write_and_force(checkpoint_lsn, logno, trid, + rec_failures) + == 0); + /* Check that the module reports expected information */ + RET_ERR_UNLESS(verify_module_values_match_expected() == 0); + return 0; +} + +static int test_one_log_and_recovery_failures(void) +{ + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + expect_logno= 123; + RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + expect_recovery_failures= 158; + RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + expect_recovery_failures) == 0); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_five_logs_and_max_trid(void) +{ + uint i; + + 
RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + expect_logno= 100; + expect_max_trid= 14111978111ULL; + for (i= 0; i<5; i++) + { + expect_logno*= 3; + RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno, + expect_max_trid, + recovery_failures) == 0); + } + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_3_checkpoints_and_2_logs(void) +{ + /* + Simulate one checkpoint, one log creation, two checkpoints, one + log creation. + */ + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + expect_checkpoint_lsn= MAKE_LSN(5, 10000); + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + + expect_logno= 17; + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + + expect_checkpoint_lsn= MAKE_LSN(17, 20000); + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + + expect_checkpoint_lsn= MAKE_LSN(17, 45000); + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + + expect_logno= 19; + RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno, + max_trid_in_control_file, + recovery_failures) == 0); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_binary_content(void) +{ + uint i; + int fd; + + /* + TEST4: actually check by ourselves the content of the file. + Note that constants (offsets) are hard-coded here, precisely to prevent + someone from changing them in the control file module and breaking + backward-compatibility. + TODO: when we reach the format-freeze state, we may even just do a + comparison with a raw binary string, to not depend on any uint4korr + future change/breakage. 
+ */ + + uchar buffer[45]; + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_read(fd, buffer, 45, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + i= uint3korr(buffer + 34 ); + RET_ERR_UNLESS(i == LSN_FILE_NO(last_checkpoint_lsn)); + i= uint4korr(buffer + 37); + RET_ERR_UNLESS(i == LSN_OFFSET(last_checkpoint_lsn)); + i= uint4korr(buffer + 41); + RET_ERR_UNLESS(i == last_logno); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_start_stop(void) +{ + /* TEST5: Simulate start/nothing/stop/start/nothing/stop/start */ + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_2_open_and_2_close(void) +{ + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + + +static int test_bad_magic_string(void) +{ + uchar buffer[4]; + int fd; + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + /* Corrupt magic string */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pread(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_pwrite(fd, (const uchar *)"papa", 4, 0, + MYF(MY_FNABP | MY_WME)) == 0); + + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_BAD_MAGIC_STRING); + /* Restore magic string */ + RET_ERR_UNLESS(my_pwrite(fd, buffer, 4, 0, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + 
RET_ERR_UNLESS(close_file() == 0); + return 0; +} + +static int test_bad_checksum(void) +{ + uchar buffer[4]; + int fd; + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + /* Corrupt checksum */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pread(fd, buffer, 1, 30, MYF(MY_FNABP | MY_WME)) == 0); + buffer[0]+= 3; /* mangle checksum */ + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 30, MYF(MY_FNABP | MY_WME)) == 0); + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_BAD_CHECKSUM); + /* Restore checksum */ + buffer[0]-= 3; + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 30, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + + return 0; +} + + +static int test_bad_blocksize(void) +{ + maria_block_size<<= 1; + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_WRONG_BLOCKSIZE); + /* Restore blocksize */ + maria_block_size>>= 1; + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + return 0; +} + + +static int test_future_size(void) +{ + /* + Here we check ability to add fields only so we can use + defined constants + */ + uint32 sum; + int fd; + uchar buffer[CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE + 2]; + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_read(fd, buffer, + CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE, + MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + /* "add" new field of 1 byte (value 1) to header and variable part */ + memmove(buffer + CF_CREATE_TIME_TOTAL_SIZE + 1, + buffer + CF_CREATE_TIME_TOTAL_SIZE, + CF_CHANGEABLE_TOTAL_SIZE); + buffer[CF_CREATE_TIME_TOTAL_SIZE - CF_CHECKSUM_SIZE]= '\1'; + buffer[CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE + 
1]= '\1'; + /* fix lengths */ + int2store(buffer + CF_CREATE_TIME_SIZE_OFFSET, CF_CREATE_TIME_TOTAL_SIZE + 1); + int2store(buffer + CF_CHANGEABLE_SIZE_OFFSET, CF_CHANGEABLE_TOTAL_SIZE + 1); + /* recalculete checksums */ + sum= (uint32) my_checksum(0, buffer, CF_CREATE_TIME_TOTAL_SIZE - + CF_CHECKSUM_SIZE + 1); + int4store(buffer + CF_CREATE_TIME_TOTAL_SIZE - CF_CHECKSUM_SIZE + 1, sum); + sum= (uint32) my_checksum(0, buffer + CF_CREATE_TIME_TOTAL_SIZE + 1 + + CF_CHECKSUM_SIZE, + CF_CHANGEABLE_TOTAL_SIZE - CF_CHECKSUM_SIZE + 1); + int4store(buffer + CF_CREATE_TIME_TOTAL_SIZE + 1, sum); + /* write new file and check it */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pwrite(fd, buffer, + CF_CREATE_TIME_TOTAL_SIZE + + CF_CHANGEABLE_TOTAL_SIZE + 2, + 0, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + return(0); +} + +static int test_bad_hchecksum(void) +{ + uchar buffer[4]; + int fd; + + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + /* Corrupt checksum */ + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_pread(fd, buffer, 1, 26, MYF(MY_FNABP | MY_WME)) == 0); + buffer[0]+= 3; /* mangle checksum */ + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 26, MYF(MY_FNABP | MY_WME)) == 0); + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_BAD_HEAD_CHECKSUM); + /* Restore checksum */ + buffer[0]-= 3; + RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 26, MYF(MY_FNABP | MY_WME)) == 0); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + + return 0; +} + + +static int test_bad_size(void) +{ + uchar buffer[]= + "123456789012345678901234567890123456789012345678901234567890123456"; + int fd, i; + + /* A too short file */ + 
RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0); + RET_ERR_UNLESS((fd= my_open(file_name, + O_BINARY | O_RDWR | O_CREAT, + MYF(MY_WME))) >= 0); + RET_ERR_UNLESS(my_write(fd, buffer, 10, MYF(MY_FNABP | MY_WME)) == 0); + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_TOO_SMALL); + for (i= 0; i < 8; i++) + { + RET_ERR_UNLESS(my_write(fd, buffer, 66, MYF(MY_FNABP | MY_WME)) == 0); + } + /* Check that control file module sees the problem */ + RET_ERR_UNLESS(local_ma_control_file_open() == + CONTROL_FILE_TOO_BIG); + RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0); + + /* Leave a correct control file */ + RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0); + RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK); + RET_ERR_UNLESS(close_file() == 0); + + return 0; +} + + +static struct my_option my_long_options[] = +{ +#ifndef DBUG_OFF + {"debug", '#', "Debug log.", + 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"help", '?', "Display help and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"version", 'V', "Print version number and exit", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +static void version(void) +{ + printf("ma_control_file_test: unit test for the control file " + "module of the Aria storage engine. 
Ver 1.0 \n"); +} + +static my_bool +get_one_option(const struct my_option *opt, + const char *argument __attribute__((unused)), + const char *filename __attribute__((unused))) +{ + switch(opt->id) { + case 'V': + version(); + exit(0); + case '#': + DBUG_PUSH (argument); + break; + case '?': + version(); + usage(); + exit(0); + } + return 0; +} + + +/* Read options */ + +static void get_options(int argc, char *argv[]) +{ + int ho_error; + + if ((ho_error=handle_options(&argc, &argv, my_long_options, + get_one_option))) + exit(ho_error); + + return; +} /* get options */ + + +static void usage(void) +{ + printf("Usage: %s [options]\n\n", my_progname); + my_print_help(my_long_options); + my_print_variables(my_long_options); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_loghandler_examples.c b/storage/maria/unittest/ma_loghandler_examples.c new file mode 100644 index 00000000..422e6961 --- /dev/null +++ b/storage/maria/unittest/ma_loghandler_examples.c @@ -0,0 +1,68 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" + +static LOG_DESC INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE= +{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0, + "fixed0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, NULL, NULL, 0, +"variable0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 7, 7, NULL, NULL, NULL, 1, +"fixed1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 12, NULL, NULL, NULL, 1, +"variable1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE= +{LOGRECTYPE_PSEUDOFIXEDLENGTH, 23, 23, NULL, NULL, NULL, 2, +"fixed2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, 19, NULL, NULL, NULL, 2, +"variable2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + + +void translog_example_table_init() +{ + int i; + log_record_type_descriptor[LOGREC_FIXED_RECORD_0LSN_EXAMPLE]= + INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE]= + INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_FIXED_RECORD_1LSN_EXAMPLE]= + INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE]= + INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_FIXED_RECORD_2LSN_EXAMPLE]= + INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE; + log_record_type_descriptor[LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE]= + INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE; + 
for (i= LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE + 1; + i < LOGREC_NUMBER_OF_TYPES; + i++) + log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED; +#ifndef DBUG_OFF + check_translog_description_table(LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE); +#endif +} + + + diff --git a/storage/maria/unittest/ma_maria_log_cleanup.c b/storage/maria/unittest/ma_maria_log_cleanup.c new file mode 100644 index 00000000..a4d0609f --- /dev/null +++ b/storage/maria/unittest/ma_maria_log_cleanup.c @@ -0,0 +1,88 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#ifdef _WIN32 +#include <direct.h> /* rmdir */ +#endif + +my_bool maria_log_remove(const char *testdir) +{ + MY_DIR *dirp; + size_t i; + MY_STAT stat_buff; + char file_name[FN_REFLEN]; + + /* Removes control file */ + if (fn_format(file_name, CONTROL_FILE_BASE_NAME, + maria_data_root, "", MYF(MY_WME)) == NullS) + return 1; + if (my_stat(file_name, &stat_buff, MYF(0)) && + my_delete(file_name, MYF(MY_WME)) != 0) + return 1; + + /* Finds and removes transaction log files */ + if (!(dirp = my_dir(maria_data_root, MYF(MY_DONT_SORT)))) + return 1; + + for (i= 0; i < dirp->number_of_files; i++) + { + char *file= dirp->dir_entry[i].name; + if (strncmp(file, "aria_log.", 9) == 0 && + file[9] >= '0' && file[9] <= '9' && + file[10] >= '0' && file[10] <= '9' && + file[11] >= '0' 
&& file[11] <= '9' && + file[12] >= '0' && file[12] <= '9' && + file[13] >= '0' && file[13] <= '9' && + file[14] >= '0' && file[14] <= '9' && + file[15] >= '0' && file[15] <= '9' && + file[16] >= '0' && file[16] <= '9' && + file[17] == '\0') + { + if (fn_format(file_name, file, + maria_data_root, "", MYF(MY_WME)) == NullS || + my_delete(file_name, MYF(MY_WME)) != 0) + { + my_dirend(dirp); + return 1; + } + } + } + my_dirend(dirp); + if (testdir) + rmdir(testdir); + return 0; +} + +char *create_tmpdir(const char *progname) +{ + static char test_dirname[FN_REFLEN]; + char tmp_name[FN_REFLEN]; + size_t length; + + /* Create a temporary directory of name TMP-'executable', but without the -t extension */ + fn_format(tmp_name, progname, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT); + length= strlen(tmp_name); + if (length > 2 && tmp_name[length-2] == '-' && tmp_name[length-1] == 't') + tmp_name[length-2]= 0; + strxmov(test_dirname, "TMP-", tmp_name, NullS); + + /* + Don't give an error if we can't create dir, as it may already exist from a previously aborted + run + */ + (void) my_mkdir(test_dirname, 0777, MYF(0)); + return test_dirname; +} diff --git a/storage/maria/unittest/ma_pagecache_consist.c b/storage/maria/unittest/ma_pagecache_consist.c new file mode 100644 index 00000000..ff4a2bcb --- /dev/null +++ b/storage/maria/unittest/ma_pagecache_consist.c @@ -0,0 +1,504 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in + my_atomic-t.c (see BUG#22320). +*/ + +#include <tap.h> +#include <my_sys.h> +#include <m_string.h> +#include "test_file.h" +#include <tap.h> + +#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8) + +#ifndef DBUG_OFF +static const char* default_dbug_option; +#endif + +static const char *base_file1_name= "page_cache_test_file_1"; +static char file1_name[FN_REFLEN]; +static PAGECACHE_FILE file1; +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count; +static PAGECACHE pagecache; + +#ifdef TEST_HIGH_CONCURENCY +static uint number_of_readers= 10; +static uint number_of_writers= 20; +static uint number_of_tests= 30000; +static uint record_length_limit= TEST_PAGE_SIZE/200; +static uint number_of_pages= 20; +static uint flush_divider= 1000; +#else /*TEST_HIGH_CONCURENCY*/ +#ifdef TEST_READERS +static uint number_of_readers= 10; +static uint number_of_writers= 1; +static uint number_of_tests= 30000; +static uint record_length_limit= TEST_PAGE_SIZE/200; +static uint number_of_pages= 20; +static uint flush_divider= 1000; +#undef SKIP_BIG_TESTS +#define SKIP_BIG_TESTS(X) /* no-op */ +#else /*TEST_READERS*/ +#ifdef TEST_WRITERS +static uint number_of_readers= 0; +static uint number_of_writers= 10; +static uint number_of_tests= 30000; +static uint record_length_limit= TEST_PAGE_SIZE/200; +static uint number_of_pages= 20; +static uint flush_divider= 1000; +#undef SKIP_BIG_TESTS +#define SKIP_BIG_TESTS(X) /* no-op */ +#else /*TEST_WRITERS*/ +static uint number_of_readers= 10; +static uint number_of_writers= 10; +static uint number_of_tests= 50000; +static uint record_length_limit= TEST_PAGE_SIZE/200; +static uint number_of_pages= 20000; 
+static uint flush_divider= 1000; +#endif /*TEST_WRITERS*/ +#endif /*TEST_READERS*/ +#endif /*TEST_HIGH_CONCURENCY*/ + + +/* + Get pseudo-random length of the field in (0;limit) + + SYNOPSYS + get_len() + limit limit for generated value + + RETURN + length where length >= 0 & length < limit +*/ + +static uint get_len(uint limit) +{ + return (uint)((ulonglong)rand()*(limit-1)/RAND_MAX); +} + + +/* + Check page's consistency: layout is + 4 bytes: number 'num' of records in this page, then num occurences of + { 4 bytes: record's length 'len'; then 4 bytes unchecked ('tag') then + 'len' bytes each equal to the record's sequential number in this page, + modulo 256 }, then zeroes. + */ +uint check_page(uchar *buff, ulong offset, int page_locked, int page_no, + int tag) +{ + uint end= sizeof(uint); + uint num= uint4korr(buff); + uint i; + DBUG_ENTER("check_page"); + + for (i= 0; i < num; i++) + { + uint len= uint4korr(buff + end); + uint j; + end+= 4 + 4; + if (len + end > TEST_PAGE_SIZE) + { + diag("incorrect field header #%u by offset %lu\n", i, offset + end); + goto err; + } + for(j= 0; j < len; j++) + { + if (buff[end + j] != (uchar)((i+1) % 256)) + { + diag("incorrect %lu byte\n", offset + end + j); + goto err; + } + } + end+= len; + } + for(i= end; i < TEST_PAGE_SIZE; i++) + { + if (buff[i] != 0) + { + int h; + DBUG_PRINT("err", + ("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n", + offset + i, offset, i, page_no, + (page_locked ? "locked" : "unlocked"), + end, num, tag)); + diag("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n", + offset + i, offset, i, page_no, + (page_locked ? 
"locked" : "unlocked"), + end, num, tag); + h= my_open("wrong_page", O_CREAT | O_TRUNC | O_RDWR, MYF(0)); + my_pwrite(h, (uchar*) buff, TEST_PAGE_SIZE, 0, MYF(0)); + my_close(h, MYF(0)); + goto err; + } + } + DBUG_RETURN(end); +err: + DBUG_PRINT("err", ("try to flush")); + if (page_locked) + { + pagecache_delete(&pagecache, &file1, page_no, + PAGECACHE_LOCK_LEFT_WRITELOCKED, 1); + } + else + { + flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE); + } + exit(1); +} + +void put_rec(uchar *buff, uint end, uint len, uint tag) +{ + uint i; + uint num; + num= uint4korr(buff); + if (!len) + len= 1; + if (end + 4*2 + len > TEST_PAGE_SIZE) + return; + int4store(buff + end, len); + end+= 4; + int4store(buff + end, tag); + end+= 4; + num++; + int4store(buff, num); + for (i= end; i < (len + end); i++) + { + buff[i]= (uchar) num % 256; + } +} + +/* + Recreate and reopen a file for test + + SYNOPSIS + reset_file() + file File to reset + file_name Path (and name) of file which should be reset +*/ + +void reset_file(PAGECACHE_FILE file, char *file_name) +{ + flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE); + if (my_close(file1.file, MYF(0)) != 0) + { + diag("Got error during %s closing from close() (errno: %d)\n", + file_name, errno); + exit(1); + } + my_delete(file_name, MYF(0)); + if ((file.file= my_open(file_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag("Got error during %s creation from open() (errno: %d)\n", + file_name, errno); + exit(1); + } +} + + +void reader(int num) +{ + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + uint i; + + for (i= 0; i < number_of_tests; i++) + { + uint page= get_len(number_of_pages); + pagecache_read(&pagecache, &file1, page, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + check_page(buffr, page * TEST_PAGE_SIZE, 0, page, -num); + + } + free(buffr); +} + + +void writer(int num) +{ + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + uint i; + + for (i= 0; i < number_of_tests; i++) + { + 
uint end; + uint page= get_len(number_of_pages); + pagecache_read(&pagecache, &file1, page, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + end= check_page(buffr, page * TEST_PAGE_SIZE, 1, page, num); + put_rec(buffr, end, get_len(record_length_limit), num); + pagecache_write(&pagecache, &file1, page, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + + if (i % flush_divider == 0) + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + } + free(buffr); +} + + +static void *test_thread_reader(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_reader"); + DBUG_PRINT("enter", ("param: %d", param)); + + reader(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "reader%d: done", param); + thread_count--; + pthread_cond_signal(&COND_thread_count); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + + +static void *test_thread_writer(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_writer"); + DBUG_PRINT("enter", ("param: %d", param)); + + writer(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "writer%d: done", param); + thread_count--; + pthread_cond_signal(&COND_thread_count); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + +static char *create_tmpdir(const char *progname) +{ + static char test_dirname[FN_REFLEN]; + char tmp_name[FN_REFLEN]; + size_t length; + + /* Create a temporary directory of name TMP-'executable', but without the -t extension */ + fn_format(tmp_name, progname, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT); + length= strlen(tmp_name); + if (length > 2 && 
tmp_name[length-2] == '-' && tmp_name[length-1] == 't') + tmp_name[length-2]= 0; + strxmov(test_dirname, "TMP-", tmp_name, NullS); + + /* + Don't give an error if we can't create dir, as it may already exist from a previously aborted + run + */ + (void) my_mkdir(test_dirname, 0777, MYF(0)); + return test_dirname; +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error; + size_t pagen; + MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/test_pagecache_consist.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + { + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + plan(number_of_writers + number_of_readers); + + SKIP_BIG_TESTS(number_of_writers + number_of_readers) + { + + char *test_dirname= create_tmpdir(argv[0]); + fn_format(file1_name, base_file1_name, test_dirname, "", MYF(0)); + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag( "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + + pagecache_file_set_null_hooks(&file1); + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (my_chmod(file1_name, 0777, MYF(MY_WME))) + exit(1); + my_pwrite(file1.file, (const uchar *)"test file", 9, 0, MYF(0)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + diag( "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + diag("Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); 
+ exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + diag( + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + thr_setconcurrency(2); +#endif + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TEST_PAGE_SIZE, 0, 0)) == 0) + { + diag("Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %zd pages", pagen)); + { + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + uint i; + memset(buffr, '\0', TEST_PAGE_SIZE); + for (i= 0; i < number_of_pages; i++) + { + pagecache_write(&pagecache, &file1, i, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + } + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + free(buffr); + } + pthread_mutex_lock(&LOCK_thread_count); + while (number_of_readers != 0 || number_of_writers != 0) + { + if (number_of_readers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_readers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_reader, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_readers--; + } + if (number_of_writers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_writers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + } + DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait finishing */ + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count))) + diag("COND_thread_count: %d from 
pthread_cond_wait\n",error); + } + pthread_mutex_unlock(&LOCK_thread_count); + DBUG_PRINT("info", ("thread ended")); + + flush_pagecache_blocks(&pagecache, &file1, FLUSH_IGNORE_CHANGED); + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(0)) != 0) + { + diag( "Got error during file1 closing from close() (errno: %d)\n", + errno); + exit(1); + } + my_delete(file1_name, MYF(0)); + + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + DBUG_PRINT("info", ("Program end")); + + rmdir(test_dirname); + } /* SKIP_BIG_TESTS */ + my_end(0); + + return exit_status(); + } +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_pagecache_rwconsist.c b/storage/maria/unittest/ma_pagecache_rwconsist.c new file mode 100644 index 00000000..24c30245 --- /dev/null +++ b/storage/maria/unittest/ma_pagecache_rwconsist.c @@ -0,0 +1,368 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in + my_atomic-t.c (see BUG#22320). 
+*/ + +#include <tap.h> +#include <my_sys.h> +#include <m_string.h> +#include "test_file.h" +#include <tap.h> + +#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8) + +#ifndef DBUG_OFF +static const char* default_dbug_option; +#endif + + +#define SLEEP my_sleep(5) + +static const char *base_file1_name= "page_cache_test_file_1"; +static char file1_name[FN_REFLEN]; +static PAGECACHE_FILE file1; +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count= 0; +static PAGECACHE pagecache; + +static uint number_of_readers= 5; +static uint number_of_writers= 5; +static uint number_of_read_tests= 2000; +static uint number_of_write_tests= 1000; +static uint read_sleep_limit= 3; +static uint report_divisor= 50; + +/** + @brief Checks page consistency + + @param buff pointer to the page content + @param task task ID +*/ +void check_page(uchar *buff, int task) +{ + uint i; + DBUG_ENTER("check_page"); + + for (i= 1; i < TEST_PAGE_SIZE; i++) + { + if (buff[0] != buff[i]) + goto err; + } + DBUG_VOID_RETURN; +err: + diag("Task %d char #%u '%u' != '%u'", task, i, (uint) buff[0], + (uint) buff[i]); + DBUG_PRINT("err", ("try to flush")); + exit(1); +} + + + +void reader(int num) +{ + unsigned char *buff; + uint i; + PAGECACHE_BLOCK_LINK *link; + + for (i= 0; i < number_of_read_tests; i++) + { + if (i % report_divisor == 0) + diag("Reader %d - %u", num, i); + buff= pagecache_read(&pagecache, &file1, 0, 3, NULL, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_READ, + &link); + check_page(buff, num); + pagecache_unlock_by_link(&pagecache, link, + PAGECACHE_LOCK_READ_UNLOCK, + PAGECACHE_UNPIN, 0, 0, 0, FALSE); + { + int lim= rand() % read_sleep_limit; + int j; + for (j= 0; j < lim; j++) + SLEEP; + } + } +} + + +void writer(int num) +{ + uint i; + uchar *buff; + PAGECACHE_BLOCK_LINK *link; + + for (i= 0; i < number_of_write_tests; i++) + { + uchar c= (uchar) rand() % 256; + + if (i % report_divisor == 0) + diag("Writer %d - %u", num, i); + buff= 
pagecache_read(&pagecache, &file1, 0, 3, NULL, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + &link); + + check_page(buff, num); + bfill(buff, TEST_PAGE_SIZE / 2, c); + SLEEP; + bfill(buff + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE / 2, c); + check_page(buff, num); + pagecache_unlock_by_link(&pagecache, link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, 0, 0, 1, FALSE); + SLEEP; + } +} + + +static void *test_thread_reader(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_reader"); + + DBUG_PRINT("enter", ("param: %d", param)); + + reader(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "reader%d: done", param); + thread_count--; + pthread_cond_signal(&COND_thread_count); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + + +static void *test_thread_writer(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_writer"); + + writer(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "writer%d: done", param); + thread_count--; + pthread_cond_signal(&COND_thread_count); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + +char *create_tmpdir(const char *progname) +{ + static char test_dirname[FN_REFLEN]; + char tmp_name[FN_REFLEN]; + size_t length; + + /* Create a temporary directory of name TMP-'executable', but without the -t extension */ + fn_format(tmp_name, progname, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT); + length= strlen(tmp_name); + if (length > 2 && tmp_name[length-2] == '-' && tmp_name[length-1] == 't') + tmp_name[length-2]= 0; + strxmov(test_dirname, "TMP-", tmp_name, NullS); + + /* + Don't give an error if we can't create dir, as it may already exist from a previously aborted + run + */ + 
(void) my_mkdir(test_dirname, 0777, MYF(0)); + return test_dirname; +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error; + size_t pagen; + + MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace"; +#else + default_dbug_option= "d:t:i:O,/tmp/test_pagecache_consist.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + { + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + plan(number_of_writers + number_of_readers); + SKIP_BIG_TESTS(number_of_writers + number_of_readers) + { + + char *test_dirname= create_tmpdir(argv[0]); + fn_format(file1_name, base_file1_name, test_dirname, "", MYF(0)); + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag( "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + pagecache_file_set_null_hooks(&file1); + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (my_chmod(file1_name, 0777, MYF(MY_WME))) + exit(1); + my_pwrite(file1.file, (const uchar*) "test file", 9, 0, MYF(0)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + diag( "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + diag("Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + diag( + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + 
thr_setconcurrency(2); +#endif + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TEST_PAGE_SIZE, 0, 0)) == 0) + { + diag("Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %zu pages", pagen)); + { + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + memset(buffr, '\0', TEST_PAGE_SIZE); + pagecache_write(&pagecache, &file1, 0, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + } + pthread_mutex_lock(&LOCK_thread_count); + + while (number_of_readers != 0 || number_of_writers != 0) + { + if (number_of_readers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_readers + number_of_writers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_reader, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_readers--; + } + if (number_of_writers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_writers + number_of_readers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + } + DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait finishing */ + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count))) + diag("COND_thread_count: %d from pthread_cond_wait\n", error); + } + pthread_mutex_unlock(&LOCK_thread_count); + DBUG_PRINT("info", ("thread ended")); + + flush_pagecache_blocks(&pagecache, &file1, FLUSH_IGNORE_CHANGED); + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(0)) != 0) + { + diag( "Got error during 
file1 closing from close() (errno: %d)\n", + errno); + exit(1); + } + my_delete(file1_name, MYF(0)); + + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + DBUG_PRINT("info", ("Program end")); + + rmdir(test_dirname); + } /* SKIP_BIG_TESTS */ + my_end(0); + + return exit_status(); + } +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_pagecache_rwconsist2.c b/storage/maria/unittest/ma_pagecache_rwconsist2.c new file mode 100644 index 00000000..c92bec3c --- /dev/null +++ b/storage/maria/unittest/ma_pagecache_rwconsist2.c @@ -0,0 +1,363 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + + +/** + @file this unit tests consistence of long block writing under write lock + and simultaneous reading of this block with read request without read lock + requirement. +*/ + +/* + TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in + my_atomic-t.c (see BUG#22320). 
+*/ + +#include <tap.h> +#include <my_sys.h> +#include <m_string.h> +#include "test_file.h" +#include <tap.h> + +#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8) + +#ifndef DBUG_OFF +static const char* default_dbug_option; +#endif + + +#define SLEEP my_sleep(5) + +static const char *base_file1_name= "page_cache_test_file_1"; +static char file1_name[FN_REFLEN]; +static PAGECACHE_FILE file1; +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count= 0; +static PAGECACHE pagecache; + +static uint number_of_readers= 5; +static uint number_of_writers= 5; +static uint number_of_read_tests= 20000; +static uint number_of_write_tests= 1000; +static uint report_divisor= 50; + +/** + @brief Checks page consistency + + @param buff pointer to the page content + @param task task ID +*/ +void check_page(uchar *buff, int task) +{ + uint i; + DBUG_ENTER("check_page"); + + for (i= 1; i < TEST_PAGE_SIZE; i++) + { + if (buff[0] != buff[i]) + goto err; + } + DBUG_VOID_RETURN; +err: + diag("Task %d char #%u '%u' != '%u'", task, i, (uint) buff[0], + (uint) buff[i]); + DBUG_PRINT("err", ("try to flush")); + exit(1); +} + + + +void reader(int num) +{ + unsigned char buff[TEST_PAGE_SIZE]; + uint i; + + for (i= 0; i < number_of_read_tests; i++) + { + if (i % report_divisor == 0) + diag("Reader %d - %u", num, i); + pagecache_read(&pagecache, &file1, 0, 3, buff, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + NULL); + check_page(buff, num); + } +} + + +void writer(int num) +{ + uint i; + uchar *buff; + PAGECACHE_BLOCK_LINK *link; + + for (i= 0; i < number_of_write_tests; i++) + { + uchar c= (uchar) rand() % 256; + + if (i % report_divisor == 0) + diag("Writer %d - %u", num, i); + buff= pagecache_read(&pagecache, &file1, 0, 3, NULL, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + &link); + + check_page(buff, num); + bfill(buff, TEST_PAGE_SIZE / 2, c); + SLEEP; + bfill(buff + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE / 2, c); + check_page(buff, 
num); + pagecache_unlock_by_link(&pagecache, link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, 0, 0, 1, FALSE); + SLEEP; + } +} + + +static void *test_thread_reader(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_reader"); + + DBUG_PRINT("enter", ("param: %d", param)); + + reader(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "reader%d: done", param); + thread_count--; + pthread_cond_signal(&COND_thread_count); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + + +static void *test_thread_writer(void *arg) +{ + int param=*((int*) arg); + my_thread_init(); + { + DBUG_ENTER("test_writer"); + + writer(param); + + DBUG_PRINT("info", ("Thread %s ended", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "writer%d: done", param); + thread_count--; + pthread_cond_signal(&COND_thread_count); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + } + return 0; +} + +static char *create_tmpdir(const char *progname) +{ + static char test_dirname[FN_REFLEN]; + char tmp_name[FN_REFLEN]; + size_t length; + + /* Create a temporary directory of name TMP-'executable', but without the -t extension */ + fn_format(tmp_name, progname, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT); + length= strlen(tmp_name); + if (length > 2 && tmp_name[length-2] == '-' && tmp_name[length-1] == 't') + tmp_name[length-2]= 0; + strxmov(test_dirname, "TMP-", tmp_name, NullS); + + /* + Don't give an error if we can't create dir, as it may already exist from a previously aborted + run + */ + (void) my_mkdir(test_dirname, 0777, MYF(0)); + return test_dirname; +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error; + size_t pagen; + + 
MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace"; +#else + default_dbug_option= "d:t:i:O,/tmp/test_pagecache_consist.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + { + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + plan(number_of_writers + number_of_readers); + SKIP_BIG_TESTS(number_of_writers + number_of_readers) + { + + char *test_dirname= create_tmpdir(argv[0]); + fn_format(file1_name, base_file1_name, test_dirname, "", MYF(0)); + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag( "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + pagecache_file_set_null_hooks(&file1); + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (my_chmod(file1_name, 0777, MYF(MY_WME))) + exit(1); + my_pwrite(file1.file, (const uchar*) "test file", 9, 0, MYF(0)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + diag( "LOCK_thread_count: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + diag("Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + diag( + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + thr_setconcurrency(2); +#endif + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TEST_PAGE_SIZE, 0, 0)) == 0) + { + diag("Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %zd pages", 
pagen)); + { + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + memset(buffr, '\0', TEST_PAGE_SIZE); + pagecache_write(&pagecache, &file1, 0, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + } + pthread_mutex_lock(&LOCK_thread_count); + + while (number_of_readers != 0 || number_of_writers != 0) + { + if (number_of_readers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_readers + number_of_writers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_reader, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_readers--; + } + if (number_of_writers != 0) + { + param=(int*) malloc(sizeof(int)); + *param= number_of_writers + number_of_readers; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + diag("Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + } + DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait finishing */ + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count))) + diag("COND_thread_count: %d from pthread_cond_wait\n", error); + } + pthread_mutex_unlock(&LOCK_thread_count); + DBUG_PRINT("info", ("thread ended")); + + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(0)) != 0) + { + diag( "Got error during file1 closing from close() (errno: %d)\n", + errno); + exit(1); + } + my_delete(file1_name, MYF(0)); + + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + DBUG_PRINT("info", ("Program end")); + + rmdir(test_dirname); + } /* SKIP_BIG_TESTS */ + my_end(0); + + return exit_status(); + } +} + +#include 
"../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_pagecache_single.c b/storage/maria/unittest/ma_pagecache_single.c new file mode 100644 index 00000000..de2ecaec --- /dev/null +++ b/storage/maria/unittest/ma_pagecache_single.c @@ -0,0 +1,853 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +/* + TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in + my_atomic-t.c (see BUG#22320). + Use diag() instead of fprintf(stderr). 
+*/ +#include <tap.h> +#include <my_sys.h> +#include <m_string.h> +#include "test_file.h" +#include <tap.h> + +#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*10) + +#ifndef DBUG_OFF +static const char* default_dbug_option; +#endif + +#ifndef BIG +#undef SKIP_BIG_TESTS +#define SKIP_BIG_TESTS(X) /* no-op */ +#endif + +static const char *base_file1_name= "page_cache_test_file_1"; +static const char *base_file2_name= "page_cache_test_file_2"; +static char file1_name[FN_REFLEN], file2_name[FN_REFLEN]; +static PAGECACHE_FILE file1; +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count; +static PAGECACHE pagecache; + +/* + File contance descriptors +*/ +static struct file_desc simple_read_write_test_file[]= +{ + { TEST_PAGE_SIZE, '\1'}, + {0, 0} +}; +static struct file_desc simple_read_change_write_read_test_file[]= +{ + { TEST_PAGE_SIZE/2, '\65'}, + { TEST_PAGE_SIZE/2, '\1'}, + {0, 0} +}; +static struct file_desc simple_pin_test_file1[]= +{ + { TEST_PAGE_SIZE*2, '\1'}, + {0, 0} +}; +static struct file_desc simple_pin_test_file2[]= +{ + { TEST_PAGE_SIZE/2, '\1'}, + { TEST_PAGE_SIZE/2, (unsigned char)129}, + { TEST_PAGE_SIZE, '\1'}, + {0, 0} +}; +static struct file_desc simple_pin_no_lock_test_file1[]= +{ + { TEST_PAGE_SIZE, '\4'}, + {0, 0} +}; +static struct file_desc simple_pin_no_lock_test_file2[]= +{ + { TEST_PAGE_SIZE, '\5'}, + {0, 0} +}; +static struct file_desc simple_pin_no_lock_test_file3[]= +{ + { TEST_PAGE_SIZE, '\6'}, + {0, 0} +}; +static struct file_desc simple_delete_forget_test_file[]= +{ + { TEST_PAGE_SIZE, '\1'}, + {0, 0} +}; +static struct file_desc simple_delete_flush_test_file[]= +{ + { TEST_PAGE_SIZE, '\2'}, + {0, 0} +}; + + +/* + Recreate and reopen a file for test + + SYNOPSIS + reset_file() + file File to reset + file_name Path (and name) of file which should be reset +*/ + +void reset_file(PAGECACHE_FILE *file, const char *file_name) +{ + flush_pagecache_blocks(&pagecache, file, FLUSH_RELEASE); + 
if (my_close(file->file, MYF(MY_WME))) + exit(1); + my_delete(file_name, MYF(MY_WME)); + if ((file->file= my_open(file_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + diag("Got error during %s creation from open() (errno: %d)\n", + file_name, my_errno); + exit(1); + } +} + +/* + Write then read page, check file on disk +*/ + +int simple_read_write_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + int res; + DBUG_ENTER("simple_read_write_test"); + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + pagecache_read(&pagecache, &file1, 0, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + ok((res= MY_TEST(memcmp(buffr, buffw, TEST_PAGE_SIZE) == 0)), + "Simple write-read page "); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache\n"); + exit(1); + } + ok((res&= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_read_write_test_file))), + "Simple write-read page file"); + if (res) + reset_file(&file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + + +/* + Prepare page, then read (and lock), change (write new value and unlock), + then check the page in the cache and on the disk +*/ +int simple_read_change_write_read_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + int res, res2; + DBUG_ENTER("simple_read_change_write_read_test"); + + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + 
{ + diag("Got error during flushing pagecache\n"); + exit(1); + } + /* test */ + pagecache_read(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + bfill(buffw, TEST_PAGE_SIZE/2, '\65'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + + pagecache_read(&pagecache, &file1, 0, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + ok((res= MY_TEST(memcmp(buffr, buffw, TEST_PAGE_SIZE) == 0)), + "Simple read-change-write-read page "); + DBUG_ASSERT(pagecache.blocks_changed == 1); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache\n"); + exit(1); + } + DBUG_ASSERT(pagecache.blocks_changed == 0); + ok((res2= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_read_change_write_read_test_file))), + "Simple read-change-write-read page file"); + if (res && res2) + reset_file(&file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res && res2); +} + + +/* + Prepare page, read page 0 (and pin) then write page 1 and page 0. + Flush the file (should flush only page 1 and return 1 (page 0 is + still pinned). + Check file on the disk. + Unpin and flush. + Check file on the disk. 
+*/ +int simple_pin_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + int res; + DBUG_ENTER("simple_pin_test"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* test */ + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache\n"); + exit(1); + } + pagecache_read(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + pagecache_write(&pagecache, &file1, 1, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + bfill(buffw + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE/2, ((unsigned char) 129)); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* + We have to get error because one page of the file is pinned, + other page should be flushed + */ + if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Did not get error in flush_pagecache_blocks\n"); + res= 0; + goto err; + } + ok((res= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE * 2, + TEST_PAGE_SIZE * 2, simple_pin_test_file1))), + "Simple pin page file with pin"); + pagecache_unlock(&pagecache, + &file1, + 0, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + 0, 0, 0); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks\n"); + res= 0; + goto err; + } + ok((res&= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE * 2, + TEST_PAGE_SIZE, simple_pin_test_file2))), + "Simple pin page result file"); + if (res) + reset_file(&file1, file1_name); +err: + free(buffw); + DBUG_RETURN(res); +} + +/* + Prepare page, 
read page 0 (and pin) then write page 1 and page 0. + Flush the file (should flush only page 1 and return 1 (page 0 is + still pinned). + Check file on the disk. + Unpin and flush. + Check file on the disk. +*/ +int simple_pin_test2() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + int res; + DBUG_ENTER("simple_pin_test2"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* test */ + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache\n"); + exit(1); + } + pagecache_read(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + 0); + pagecache_write(&pagecache, &file1, 1, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + bfill(buffw + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE/2, ((unsigned char) 129)); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_TO_READ, + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* + We have to get error because one page of the file is pinned, + other page should be flushed + */ + if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY)) + { + diag("Did not get error in flush_pagecache_blocks 2\n"); + res= 0; + goto err; + } + ok((res= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE * 2, + TEST_PAGE_SIZE * 2, simple_pin_test_file1))), + "Simple pin page file with pin 2"); + + /* Test that a normal flush goes through */ + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks 3\n"); + res= 0; + goto err; + } + pagecache_unlock(&pagecache, + &file1, + 0, + PAGECACHE_LOCK_READ_UNLOCK, + PAGECACHE_UNPIN, + 0, 0, 0); + 
if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks 4\n"); + res= 0; + goto err; + } + ok((res&= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE * 2, + TEST_PAGE_SIZE, simple_pin_test_file2))), + "Simple pin page result file 2"); + if (res) + reset_file(&file1, file1_name); +err: + free(buffw); + DBUG_RETURN(res); +} + +/* + Checks pins without lock. +*/ +int simple_pin_no_lock_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + PAGECACHE_BLOCK_LINK *link; + int res; + DBUG_ENTER("simple_pin_no_lock_test"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\4'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* test */ + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error during flushing pagecache 2\n"); + exit(1); + } + bfill(buffw, TEST_PAGE_SIZE, '\5'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + /* + We have to get error because one page of the file is pinned, + other page should be flushed + */ + if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY)) + { + diag("Did not get error in flush_pagecache_blocks 2\n"); + res= 0; + goto err; + } + ok((res= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_pin_no_lock_test_file1))), + "Simple pin (no lock) page file with pin 2"); + pagecache_unlock(&pagecache, + &file1, + 0, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_UNPIN, + 0, 0, 0); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks 2\n"); + res= 0; + goto err; + } + ok((res&= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_pin_no_lock_test_file2))), + 
"Simple pin (no lock) page result file 2"); + + bfill(buffw, TEST_PAGE_SIZE, '\6'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, + &link, LSN_IMPOSSIBLE); + pagecache_unlock_by_link(&pagecache, link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_PIN_LEFT_PINNED, 0, 0, 1, FALSE); + if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY)) + { + diag("Did not get error in flush_pagecache_blocks 3\n"); + res= 0; + goto err; + } + ok((res= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_pin_no_lock_test_file2))), + "Simple pin (no lock) page file with pin 3"); + pagecache_unpin_by_link(&pagecache, link, 0); + if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) + { + diag("Got error in flush_pagecache_blocks 3\n"); + res= 0; + goto err; + } + ok((res&= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_pin_no_lock_test_file3))), + "Simple pin (no lock) page result file 3"); + if (res) + reset_file(&file1, file1_name); +err: + free(buffw); + DBUG_RETURN(res); +} +/* + Prepare page, write new value, then delete page from cache without flush, + on the disk should be page with old content written during preparation +*/ + +int simple_delete_forget_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + int res; + DBUG_ENTER("simple_delete_forget_test"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + /* test */ + bfill(buffw, TEST_PAGE_SIZE, '\2'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + 
PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + pagecache_delete(&pagecache, &file1, 0, + PAGECACHE_LOCK_WRITE, 0); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + ok((res= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_delete_forget_test_file))), + "Simple delete-forget page file"); + if (res) + reset_file(&file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + +/* + Prepare page with locking, write new content to the page, + delete page with flush and on existing lock, + check that page on disk contain new value. +*/ + +int simple_delete_flush_test() +{ + unsigned char *buffw= malloc(TEST_PAGE_SIZE); + unsigned char *buffr= malloc(TEST_PAGE_SIZE); + PAGECACHE_BLOCK_LINK *link; + int res; + DBUG_ENTER("simple_delete_flush_test"); + /* prepare the file */ + bfill(buffw, TEST_PAGE_SIZE, '\1'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE, + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, + &link, LSN_IMPOSSIBLE); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + /* test */ + bfill(buffw, TEST_PAGE_SIZE, '\2'); + pagecache_write(&pagecache, &file1, 0, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + PAGECACHE_PIN_LEFT_PINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + if (pagecache_delete_by_link(&pagecache, link, + PAGECACHE_LOCK_LEFT_WRITELOCKED, 1)) + { + diag("simple_delete_flush_test: error during delete"); + exit(1); + } + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + ok((res= MY_TEST(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE, + simple_delete_flush_test_file))), + "Simple delete flush (link) page file"); + if (res) + reset_file(&file1, file1_name); + free(buffw); + free(buffr); + DBUG_RETURN(res); +} + + +/* + write then read file bigger then cache +*/ + +int simple_big_test() +{ + unsigned char *buffw= (unsigned char *) my_malloc(PSI_NOT_INSTRUMENTED, TEST_PAGE_SIZE, 
MYF(MY_WME)); + unsigned char *buffr= (unsigned char *) my_malloc(PSI_NOT_INSTRUMENTED, TEST_PAGE_SIZE, MYF(MY_WME)); + struct file_desc *desc= ((struct file_desc *) + my_malloc(PSI_NOT_INSTRUMENTED, + (PCACHE_SIZE/(TEST_PAGE_SIZE/2) + 1) * sizeof(struct file_desc), MYF(MY_WME))); + int res, i; + DBUG_ENTER("simple_big_test"); + + /* prepare the file twice larger then cache */ + for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE/2); i++) + { + bfill(buffw, TEST_PAGE_SIZE, (unsigned char) (i & 0xff)); + desc[i].length= TEST_PAGE_SIZE; + desc[i].content= (i & 0xff); + pagecache_write(&pagecache, &file1, i, 3, buffw, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + } + desc[i].length= 0; + desc[i].content= '\0'; + ok(1, "Simple big file write"); + /* check written pages sequentally read */ + for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE/2); i++) + { + int j; + pagecache_read(&pagecache, &file1, i, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + for(j= 0; j < TEST_PAGE_SIZE; j++) + { + if (buffr[j] != (i & 0xff)) + { + diag("simple_big_test seq: page %u byte %u mismatch\n", i, j); + res= 0; + goto err; + } + } + } + ok(1, "Simple big file sequential read"); + /* chack random reads */ + for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE); i++) + { + int j, page; + page= rand() % (PCACHE_SIZE/(TEST_PAGE_SIZE/2)); + pagecache_read(&pagecache, &file1, page, 3, buffr, + PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + 0); + for(j= 0; j < TEST_PAGE_SIZE; j++) + { + if (buffr[j] != (page & 0xff)) + { + diag("simple_big_test rnd: page %u byte %u mismatch\n", page, j); + res= 0; + goto err; + } + } + } + ok(1, "Simple big file random read"); + flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE); + + ok((res= MY_TEST(test_file(file1, file1_name, PCACHE_SIZE * 2, TEST_PAGE_SIZE, + desc))), + "Simple big file"); + if (res) + reset_file(&file1, file1_name); + +err: + 
my_free(buffw); + my_free(buffr); + my_free(desc); + DBUG_RETURN(res); +} + + +/* + Thread function +*/ + +static void *test_thread(void *arg) +{ + my_thread_init(); + { + DBUG_ENTER("test_thread"); + DBUG_PRINT("enter", ("param: %d", *(int*) arg)); + + if (!simple_read_write_test() || + !simple_read_change_write_read_test() || + !simple_pin_test() || + !simple_pin_test2() || + !simple_pin_no_lock_test() || + !simple_delete_forget_test() || + !simple_delete_flush_test()) + exit(1); + + SKIP_BIG_TESTS(4) + { + if (!simple_big_test()) + exit(1); + } + + DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name())); + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + pthread_cond_signal(&COND_thread_count); /* Tell main we are ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + DBUG_RETURN(0); + } +} + + +static char *create_tmpdir(const char *progname) +{ + static char test_dirname[FN_REFLEN]; + char tmp_name[FN_REFLEN]; + size_t length; + + /* Create a temporary directory of name TMP-'executable', but without the -t extension */ + fn_format(tmp_name, progname, "", "", MY_REPLACE_DIR | MY_REPLACE_EXT); + length= strlen(tmp_name); + if (length > 2 && tmp_name[length-2] == '-' && tmp_name[length-1] == 't') + tmp_name[length-2]= 0; + strxmov(test_dirname, "TMP-", tmp_name, NullS); + + /* + Don't give an error if we can't create dir, as it may already exist from a previously aborted + run + */ + (void) my_mkdir(test_dirname, 0777, MYF(0)); + return test_dirname; +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__((unused))) +{ + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error; + size_t pagen; + File tmp_file; + MY_INIT(argv[0]); + +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\test_pagecache_single.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/test_pagecache_single.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + 
DBUG_SET_INITIAL(default_dbug_option); + } +#endif + { + DBUG_ENTER("main"); + DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name())); + + plan(18); + SKIP_BIG_TESTS(18) + { + char *test_dirname= create_tmpdir(argv[0]); + fn_format(file1_name, base_file1_name, test_dirname, "", MYF(0)); + fn_format(file2_name, base_file2_name, test_dirname, "", MYF(0)); + + if ((tmp_file= my_open(file2_name, O_CREAT | O_TRUNC | O_RDWR, + MYF(MY_WME))) < 0) + exit(1); + + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + pagecache_file_set_null_hooks(&file1); + my_close(tmp_file, MYF(0)); + my_delete(file2_name, MYF(0)); + + DBUG_PRINT("info", ("file1: %d", file1.file)); + if (my_chmod(file1_name, 0777, MYF(MY_WME))) + exit(1); + my_pwrite(file1.file, (const uchar*)"test file", 9, 0, MYF(MY_WME)); + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n", + error, errno); + exit(1); + } + + if ((error= pthread_attr_init(&thr_attr))) + { + fprintf(stderr,"Got error: %d from pthread_attr_init (errno: %d)\n", + error,errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + fprintf(stderr, + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error,errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + thr_setconcurrency(2); +#endif + + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TEST_PAGE_SIZE, 0, MYF(MY_WME))) == 0) + { + fprintf(stderr,"Got error: init_pagecache() (errno: %d)\n", + errno); + exit(1); + } + DBUG_PRINT("info", ("Page cache %zd pages", pagen)); + + 
pthread_mutex_lock(&LOCK_thread_count); + param=(int*) malloc(sizeof(int)); + *param= 1; + if ((error= pthread_create(&tid, &thr_attr, test_thread, (void*) param))) + { + fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n", + error,errno); + exit(1); + } + thread_count++; + DBUG_PRINT("info", ("Thread started")); + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count))) + fprintf(stderr,"Got error: %d from pthread_cond_wait\n",error); + } + pthread_mutex_unlock(&LOCK_thread_count); + DBUG_PRINT("info", ("thread ended")); + + end_pagecache(&pagecache, 1); + DBUG_PRINT("info", ("Page cache ended")); + + if (my_close(file1.file, MYF(MY_WME))) + exit(1); + + my_delete(file1_name, MYF(0)); + rmdir(test_dirname); + + } /* SKIP_BIG_TESTS */ + DBUG_PRINT("info", ("file1 (%d) closed", file1.file)); + DBUG_PRINT("info", ("Program end")); + + my_end(0); + } + return exit_status(); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_test_all-t b/storage/maria/unittest/ma_test_all-t new file mode 100755 index 00000000..8858649f --- /dev/null +++ b/storage/maria/unittest/ma_test_all-t @@ -0,0 +1,769 @@ +#!/usr/bin/env perl +# +# Run various unit tests. 
####
#### Initialise variables, clean temporary files and run the tests
####
#### Parses the command line, locates the test executables, computes the
#### number of tests (TAP plan), runs every test group for each row format
#### and finally removes the temporary files and directories.
#### Never returns: the script exits with $runtime_error (0 if all tests
#### passed, 1 otherwise).
####

sub run_tests
{
  my $nr_tests= 0;
  my $flag_exit= 0;

  # Each entry: [ heading printed in verbose mode, row type option(s) ]
  my @row_formats=
    (["Running tests with dynamic row format\n", ""],
     ["\nRunning tests with static row format\n", "-S"],
     ["\nRunning tests with block row format\n", "-M"],
     ["\nRunning tests with block row format and transactions\n", "-M -T"],
     ["\nRunning tests with block row format, transactions and versioning\n",
      "-M -T -C"]);

  if (!GetOptions("help" => \$opt_help,
                  "version" => \$opt_version,
                  "verbose" => \$opt_verbose,
                  "abort-on-error" => \$opt_abort_on_error,
                  "valgrind=s" => \$opt_valgrind,
                  "silent=s" => \$opt_silent,
                  "tmpdir=s" => \$full_tmpdir,
                  "number-of-tests" => \$opt_number_of_tests,
                  "run-tests=s" => \$opt_run_tests,
                  "start-from=s" => \$opt_run_tests))
  {
    $flag_exit= 1;
  }
  if ($opt_version)
  {
    print "$my_progname version $VER\n";
    exit(0);
  }

  # Show the help text before touching the file system, so that --help
  # neither leaves a stray directory behind nor fails because the test
  # executables have not been built yet.
  usage() if ($opt_help || $flag_exit);

  if (! -d $exec_dir)
  {
    mkdir("$exec_dir") or die("Cannot create directory $exec_dir: $!\n");
  }
  chdir($exec_dir) or die("Cannot change directory to $exec_dir: $!\n");

  $maria_path= "../" . dirname($0) . "/..";

  my $suffix= ( $^O =~ /win/i && $^O !~ /darwin/i ) ? ".exe" : "";

  # Look for the executables in the usual build directories, in this order.
  # we use -f, sometimes -x is unexpectedly false in Cygwin
  $maria_exe_path= undef;
  foreach my $dir ("$maria_path/release", "$maria_path/relwithdebinfo",
                   "$maria_path/debug", $maria_path)
  {
    if (-f "$dir/ma_test1$suffix")
    {
      $maria_exe_path= $dir;
      last;
    }
  }
  if (!defined($maria_exe_path))
  {
    die("Cannot find ma_test1 executable in $maria_path\n");
  }

  if (defined($full_tmpdir))
  {
    $tmpdir= $full_tmpdir;
  }
  else
  {
    $full_tmpdir= $tmpdir;
    $using_internal_tmpdir= 1;
    if (! -d "$full_tmpdir")
    {
      mkdir("$full_tmpdir") or
        die("Cannot create directory $full_tmpdir: $!\n");
    }
  }

  #
  # IMPORTANT: If you modify this file, please read this:
  #
  # Count total number of tests. Make sure that the functions return
  # number of unit tests correctly, e.g. calls to ok(). The last argument
  # for each function is a flag counter and will return the number of
  # unit tests in each. Please see comments on function ok() at the end.
  #
  # If you modify any functions or add any new ones, please make sure the
  # unit tests are appropriately detected here. A wrong count will
  # make the unit test fail during 'make test'. $nr_tests must be right.
  #

  # The first three groups are run once per row format
  $nr_tests+= run_check_tests(0, 0, 0, 0, 1) * scalar(@row_formats);
  $nr_tests+= run_repair_tests(0, 0, 0, 0, 1) * scalar(@row_formats);
  $nr_tests+= run_pack_tests(0, 0, 0, 0, 1) * scalar(@row_formats);
  $nr_tests+= run_tests_on_warnings_and_errors(0, 0, 0, 1);
  $nr_tests+= run_ma_test_recovery(0, 1);
  $nr_tests+= run_tests_on_clrs(0, 0, 1);

  if ($opt_number_of_tests)
  {
    print "Total number of tests is $nr_tests\n";
    exit(0);
  }

  if (defined($opt_run_tests))
  {
    if ($opt_run_tests =~ m/^(\d+)$/ ||
        $opt_run_tests =~ m/^(\d+)\.+$/)
    {
      $test_begin= $1;
    }
    elsif ($opt_run_tests =~ m/^(\d+)\.+(\d+)$/)
    {
      $test_begin= $1;
      $test_end= $2;
    }
    else
    {
      print "Wrong syntax for option --run-tests=$opt_run_tests\n";
      print "Please use --run-tests=<begin>..<end>\nwhere 'begin' is the ";
      print "first test to be run and 'end' is the last.\n";
      exit(1);
    }
    # Reject a start as well as an end beyond the last test; otherwise the
    # adjusted number of tests below would become zero or negative.
    if ($test_end > $nr_tests || $test_begin > $nr_tests)
    {
      print "Test range ($test_begin..$test_end) out of range. ";
      print "There are only $nr_tests in the test suite.\n";
      exit(1);
    }
    $test_begin++ if (!$test_begin); # Handle zero, if user gave that
    if ($test_end && $test_begin > $test_end)
    {
      print "Bad test range ($test_begin..$test_end)\n";
      exit(1);
    }
    # Now adjust number of tests
    $nr_tests= ($test_end ? $test_end : $nr_tests) - $test_begin + 1;
  }

  #
  # clean-up
  #

  unlink_all_possible_tmp_files();

  #
  # Run tests
  #

  if (!$opt_verbose)
  {
    print "1..$nr_tests\n";
  }
  else
  {
    print "Total tests: $nr_tests\n";
  }

  foreach my $format (@row_formats)
  {
    my ($heading, $row_type)= @$format;

    print $heading if ($opt_verbose);
    run_check_tests($suffix, $opt_silent, $row_type, $opt_verbose, 0);
    run_repair_tests($suffix, $opt_silent, $row_type, $opt_verbose, 0);
    run_pack_tests($suffix, $opt_silent, $row_type, $opt_verbose, 0);
  }

  if ($opt_verbose)
  {
    print "\nRunning tests with warnings and recovery\n";
  }
  run_tests_on_warnings_and_errors($suffix, $opt_silent, $opt_verbose, 0);
  run_ma_test_recovery($opt_verbose, 0);
  run_tests_on_clrs($suffix, $opt_verbose, 0);

  unlink_all_possible_tmp_files();
  if ($using_internal_tmpdir)
  {
    rmdir($tmpdir);
  }
  # We are still inside $exec_dir; it can only be removed from its parent
  chdir("..");
  rmdir($exec_dir);
  exit($runtime_error);
}
####
#### repair tests
####
#### Arguments: $suffix:   suffix of the executables ("" or ".exe")
####            $silent:   silent option passed to ma_test1/ma_test2
####            $row_type: row format option(s) passed to ma_test1/ma_test2
####            $verbose:  to be passed to ok()
####            $count:    if set, only return the number of tests
####
#### Returns the number of tests when $count is set, otherwise 0.
####

sub run_repair_tests
{
  my ($suffix, $silent, $row_type, $verbose, $count)= @_;

  # NOTE(review): the last two commands check test1 although the command
  # just before them is ma_test2 - confirm that this is intended.
  my @t= ($NEW_TEST,
          "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix --silent -re --transaction-log test1",
          "$maria_exe_path/aria_chk$suffix -rs test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -rqs test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -rs --correct-checksum test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -rqs --correct-checksum test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -ros --correct-checksum test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -rqos --correct-checksum test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/aria_chk$suffix -sz test1",
          "$maria_exe_path/aria_chk$suffix -se test1",
          "$maria_exe_path/ma_test2$suffix $silent -c -d1 $row_type",
          "$maria_exe_path/aria_chk$suffix -s --parallel-recover test2",
          "$maria_exe_path/aria_chk$suffix -se test2",
          "$maria_exe_path/aria_chk$suffix -s --parallel-recover --quick test2",
          "$maria_exe_path/aria_chk$suffix -se test2",
          "$maria_exe_path/ma_test2$suffix $silent -c $row_type",
          "$maria_exe_path/aria_chk$suffix -se test2",
          "$maria_exe_path/aria_chk$suffix -sr test2",
          "$maria_exe_path/aria_chk$suffix -se test2",
          "$maria_exe_path/ma_test2$suffix $silent -c -t4 -b32768 $row_type",
          "$maria_exe_path/aria_chk$suffix -s --zerofill test1",
          "$maria_exe_path/aria_chk$suffix -se test1"
         );

  return count_tests(\@t) if ($count);
  run_test_bunch(\@t, $verbose, 0);
  return 0;
}
####
#### Tests that gives warnings or errors
####
#### Arguments: $suffix:  suffix of the executables
####            $silent:  silent option passed to ma_test2
####            $verbose: to be passed to ok()
####            $count:   if set, only return the number of tests
####

sub run_tests_on_warnings_and_errors
{
  my ($suffix, $silent, $verbose, $count)= @_;

  # Number of tests in this function, e.g. calls to ok()
  return 9 if ($count);

  my $message_file= "ma_test2_message.txt";
  my $chk_test2= "$maria_exe_path/aria_chk$suffix -h$tmpdir -sm $tmpdir/test2";

  # Each entry: [ command, expected error code (0 = must succeed) ]
  my @steps=
    (["$maria_exe_path/ma_test2$suffix -h$tmpdir $silent -L -K -W -P -S -R1 -m500",
      0],
     [$chk_test2, 0],
     # ma_test2$suffix $silent -L -K -R1 -m2000 ; Should give error 135\n
     # In the following a failure is a success and success is a failure
     ["$maria_exe_path/ma_test2$suffix -h$tmpdir $silent -L -K -R1 -m2000 " .
      ">$message_file 2>&1",
      1],
     ["cat $message_file", 0],
     ["grep \"Error: 135\" $message_file > /dev/null", 0],
     # aria_chk -sm on test2 will warn that Datafile is almost full
     ["$chk_test2 >$message_file 2>&1", 1],
     ["cat $message_file", 0],
     ["grep \"warning: Datafile is almost full\" $message_file>/dev/null", 0]);

  foreach my $step (@steps)
  {
    my ($command, $expected_error)= @$step;
    ok($command, $verbose, 0, $expected_error);
  }

  unlink($message_file);
  ok("$maria_exe_path/aria_chk$suffix -h$tmpdir -ssm $tmpdir/test2",
     $verbose, 0);

  return 0;
}
"$maria_exe_path/aria_read_log$suffix -a -s -h$tmpdir", + "$maria_exe_path/aria_chk$suffix -h$tmpdir -s -e $tmpdir/test2", + "rm $tmpdir/test2.MA?", + $NEW_TEST, + "$maria_exe_path/ma_test2$suffix -h$tmpdir -s -L -K -W -P -M -T -c -b -t2 -A1", + "$maria_exe_path/aria_read_log$suffix -a -s -h$tmpdir ", + "$maria_exe_path/aria_chk$suffix -h$tmpdir -s -e $tmpdir/test2", + "rm $tmpdir/test2.MA?", + "$maria_exe_path/aria_read_log$suffix -a -s -h$tmpdir", + "$maria_exe_path/aria_chk$suffix -h$tmpdir -e -s $tmpdir/test2", + "rm $tmpdir/test2.MA?", + $NEW_TEST, + "$maria_exe_path/ma_test2$suffix -h$tmpdir -s -L -K -W -P -M -T -c -b32768 -t4 -A1", + "$maria_exe_path/aria_read_log$suffix -a -s -h$tmpdir", + "$maria_exe_path/aria_chk$suffix -h$tmpdir -es $tmpdir/test2", + "$maria_exe_path/aria_read_log$suffix -a -s -h$tmpdir ", + "$maria_exe_path/aria_chk$suffix -h$tmpdir -es $tmpdir/test2", + "rm $tmpdir/test2.MA?", + "$maria_exe_path/aria_read_log$suffix -a -s -h$tmpdir", + "$maria_exe_path/aria_chk$suffix -h$tmpdir -es $tmpdir/test2", + "rm $tmpdir/test2.MA?" + ); + + return &count_tests(\@t) if ($count); + &run_test_bunch(\@t, $verbose, 1); + return 0; +} + +# +# Print "ok" on success and "not ok" on error +# +# Note: Every time this function is called it will be counted +# as a unit test. +# +# Args: $com: The actual command run. Will be printed on a failure +# $verbose: Be more verbose. +# $iteration: Number of iterations in a loop when the error +# occurred. If not in loop, this should be blank +# (e.g. send zero). +# $expected_error: Optional; put here expected error code. Test +# will pass with this result only. 
#
# Print "ok" on success and "not ok" on error
#
# Note: Every time this function is called it will be counted
#       as a unit test.
#
# Args: $com:            The actual command run. Will be printed on a failure
#       $verbose:        Be more verbose.
#       $iteration:      Number of iterations in a loop when the error
#                        occurred. If not in loop, this should be blank
#                        (e.g. send zero).
#       $expected_error: Optional; put here expected error code. Test
#                        will pass with this result only.
#
# Return value: Will return 1 on success and 0 on an error. 0 is also
#               returned, without running the command, when the test is
#               before the first test requested with --run-tests.
#               Exits the script when the test is after the last requested
#               test, or on a failure if --abort-on-error was given.
#

sub ok
{
  my ($com, $verbose, $iteration, $expected_error)= @_;
  my ($msg, $output, $err, $errcode, $len, $pad);

  $test_counter++;
  if ($test_begin > $test_counter)
  {
    return 0;
  }
  if ($test_end && $test_end < $test_counter)
  {
    exit(0);
  }

  $msg= "";
  $expected_error= 0 if (!defined($expected_error));

  if ($verbose)
  {
    # Print command with out the long unittest/../ prefix
    my $tmp;
    $tmp= $com;
    $tmp =~ s|^unittest/\.\./||;
    print "$tmp ";
    $len= length($tmp);
  }
  $output= `$com 2>&1`;
  # Save the exit status at once, before anything else can change $?
  $err= $?;
  $errcode= ($err >> 8);
  if ($verbose)
  {
    # Commands are often longer than the column; never repeat a negative
    # number of times (it prints nothing but warns when $^W is set)
    $pad= 62 - $len;
    print " " x ($pad > 0 ? $pad : 0);
  }
  if ((!$err && !$expected_error) ||
      ($errcode == $expected_error && $expected_error))
  {
    print "[ " if ($verbose);
    print "ok";
    if ($verbose)
    {
      print " ]";
      $pad= 5 - length("$test_counter");
      print " " x ($pad > 0 ? $pad : 0);
      print "$test_counter";
    }
    else
    {
      print " $test_counter - $com"
    }
    print "\n";
    return 1;
  }
  print "[ " if ($verbose);
  print "not ok";
  print " ]" if ($verbose);
  print " $test_counter - $com" unless $verbose;
  print "\n";
  if ($verbose && defined($output) && length($output))
  {
    print "$output\n";
  }
  if (!$verbose)
  {
    $msg= "\n"; # Get a nicer output in perl unit test mode
  }
  $msg.= "Failed test '$com' ";
  if ($iteration)
  {
    $msg.= "(loop iteration $iteration.) ";
  }
  $msg.= "at line ";
  $msg.= (caller)[2];
  $msg.= "\n(errcode: $errcode, test: $test_counter)\n";
  if ($expected_error)
  {
    $msg.= "Was expecting errcode: $expected_error\n";
  }
  warn $msg;
  $runtime_error= 1;
  if ($opt_abort_on_error)
  {
    exit 1;
  }
  # Unlink all files so that we can continue on error
  unlink_all_possible_tmp_files();
  return 0;
}
####
#### Count tests
#### Arguments: $t: reference to an array of the tests
####
#### Returns the number of true entries in the array, i.e. the number of
#### commands; false entries (such as the $NEW_TEST group separator) are
#### not counted. Counting stops at the first undefined entry.
####

sub count_tests
{
  my ($t)= @_;
  my $nr_tests= 0;

  foreach my $test (@$t)
  {
    last if (!defined($test));
    $nr_tests++ if ($test);
  }
  return $nr_tests;
}
You can give just + one number, to start tests from this test, or a range. + For example 45..89. To run a specific test alone, + for example test 215, use --run-tests=215..215 + Use this option with caution, because some of the tests + might depend on previous ones. +--start-from=... Alias for --run-tests +--silent=... Silent option passed to ma_test* tests ('$opt_silent') +--tmpdir=... Store tests data in this directory (works for most tests) +--valgrind=... Options for valgrind. + ('$opt_valgrind') +--verbose Be more verbose. Will print each unittest on a line + and result after. This mode cannot be used with unit.pl + when running in normal unit test mode. +--version Show version number and exit. +EOF + exit(0); +} diff --git a/storage/maria/unittest/ma_test_loghandler-t.c b/storage/maria/unittest/ma_test_loghandler-t.c new file mode 100644 index 00000000..ccda66af --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler-t.c @@ -0,0 +1,668 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(const char *testdir); +extern char *create_tmpdir(const char *progname); +extern void example_loghandler_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif +static TRN *trn= &dummy_transaction_object; + +#define PCACHE_SIZE (1024*1024*10) + +#define LONG_BUFFER_SIZE (100 * 1024) + +#ifdef LONG_LOG_TEST +#define LOG_FLAGS 0 +#define LOG_FILE_SIZE (1024L*1024L*8) +#define ITERATIONS (1600*4) + +#else +#undef SKIP_BIG_TESTS +#define SKIP_BIG_TESTS(X) /* no-op */ +#define LOG_FLAGS (TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC) +#define LOG_FILE_SIZE (1024L*1024L*8L) +#define ITERATIONS 1600 +#endif + +/* +#define LOG_FLAGS 0 +#define LOG_FILE_SIZE 1024L*1024L*1024L +#define ITERATIONS 181000 +*/ + +/* +#define LOG_FLAGS 0 +#define LOG_FILE_SIZE 1024L*1024L*3L +#define ITERATIONS 1600 +*/ + +/* +#define LOG_FLAGS 0 +#define LOG_FILE_SIZE 1024L*1024L*100L +#define ITERATIONS 65000 +*/ + +/* + Generate random value in the range (0,LONG_BUFFER_SIZE) +*/ +static uint32 rand_buffer_size() +{ + return (uint32)((ulonglong)rand()*(LONG_BUFFER_SIZE + 1)/RAND_MAX); +} + +/* + Check that the buffer filled correctly + + SYNOPSIS + check_content() + ptr Pointer to the buffer + length length of the buffer + + RETURN + 0 - OK + 1 - Error +*/ + + +static my_bool check_content(uchar *ptr, ulong length) +{ + ulong i; + uchar buff[2]; + for (i= 0; i < length; i++) + { + if (i % 2 == 0) + int2store(buff, i >> 1); + if (ptr[i] != buff[i % 2]) + { + fprintf(stderr, "Byte # %lu is %x instead of %x", + i, (uint) ptr[i], (uint) buff[i % 2]); + return 1; + } + } + return 0; +} + + +/* + Report OK for read 
operation + + SYNOPSIS + read_ok() + rec the record header +*/ + +void read_ok(TRANSLOG_HEADER_BUFFER *rec) +{ + ok(1, "read record type: %u LSN: " LSN_FMT, + rec->type, LSN_IN_PARTS(rec->lsn)); +} + +/* + Read whole record content, and check content (put with offset) + + SYNOPSIS + read_and_check_content() + rec The record header buffer + buffer The buffer to read the record in + skip Skip this number of bytes ot the record content + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec, + uchar *buffer, uint skip) +{ + DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE * 2 + 7 * 2 + 2); + if (translog_read_record(rec->lsn, 0, rec->record_length, buffer, NULL) != + rec->record_length) + return 1; + return check_content(buffer + skip, rec->record_length - skip); +} + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint32 i; + uint32 rec_len; + uchar long_tr_id[6]; + uchar lsn_buff[23]= + { + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + uchar long_buffer[LONG_BUFFER_SIZE * 2 + LSN_STORE_SIZE * 2 + 2]; + PAGECACHE pagecache; + LSN lsn, lsn_base, first_lsn; + TRANSLOG_HEADER_BUFFER rec; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 3]; + struct st_translog_scanner_data scanner; + int rc; + MY_INIT(argv[0]); + + if (my_set_max_open_files(100) < 100) + { + fprintf(stderr, "can't allocate 100 file descriptors\n"); + exit(1); + } + bzero(&pagecache, sizeof(pagecache)); + + maria_data_root= create_tmpdir(argv[0]); + if (maria_log_remove(0)) + exit(1); + + /* We don't need to do physical syncs in this test */ + my_disable_sync= 1; + + for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i+= 2) + { + int2store(long_buffer + i, (i >> 1)); + /* long_buffer[i]= (i & 0xFF); */ + } + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; 
+#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, 0) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + plan(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1 + 1); + + SKIP_BIG_TESTS(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1 + 1) + { + + srand(122334817L); + + long_tr_id[5]= 0xff; + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + trn->short_id= 0; + trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + lsn_base= first_lsn= lsn; + + for (i= 1; i < ITERATIONS; i++) + { + trn->short_id= i % 0xFFFF; + if (i % 2) + { + lsn_store(lsn_buff, lsn_base); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + /* check auto-count feature */ + parts[TRANSLOG_INTERNAL_PARTS + 1].str= NULL; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= 0; + if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_1LSN_EXAMPLE, trn, + NULL, 
LSN_STORE_SIZE, 0, parts, NULL, NULL)) + { + fprintf(stderr, "1 Can't write reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + if ((rec_len= rand_buffer_size()) < 12) + rec_len= 12; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + /* check record length auto-counting */ + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + trn, NULL, 0, TRANSLOG_INTERNAL_PARTS + 2, + parts, NULL, NULL)) + { + fprintf(stderr, "1 Can't write var reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + } + else + { + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 23; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_2LSN_EXAMPLE, + trn, NULL, 23, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "0 Can't write reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + if ((rec_len= rand_buffer_size()) < 19) + rec_len= 19; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 14; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE, + trn, NULL, 14 + 
rec_len, + TRANSLOG_INTERNAL_PARTS + 2, parts, NULL, + NULL)) + { + fprintf(stderr, "0 Can't write var reference defore record #%lu\n", + (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + } + int4store(long_tr_id, i); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + + lsn_base= lsn; + + if ((rec_len= rand_buffer_size()) < 9) + rec_len= 9; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + trn, NULL, rec_len, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + if (translog_flush(lsn)) + { + fprintf(stderr, "Can't flush #%lu\n", (ulong) i); + translog_destroy(); + ok(0, "flush"); + exit(1); + } + ok(1, "flush"); + } + + if (translog_flush(translog_get_horizon())) + { + fprintf(stderr, "Can't flush up to horizon\n"); + translog_destroy(); + ok(0, "flush"); + exit(1); + } + ok(1, "flush"); + + srand(122334817L); + + rc= 1; + + { + int len= translog_read_record_header(first_lsn, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "translog_read_record_header failed (%d)\n", errno); + goto err; + } + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 || + rec.record_length != 6 || 
uint4korr(rec.header) != 0 || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF || + first_lsn != rec.lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(0)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, " + "lsn" LSN_FMT "\n", + (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length, + (uint) uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + goto err; + } + read_ok(&rec); + translog_free_record_header(&rec); + lsn= first_lsn; + if (translog_scanner_init(first_lsn, 1, &scanner, 0)) + { + fprintf(stderr, "scanner init failed\n"); + goto err; + } + for (i= 1;; i++) + { + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + if (i != ITERATIONS) + { + fprintf(stderr, "EOL met at iteration %u instead of %u\n", + i, ITERATIONS); + goto err; + } + break; + } + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 7 || ref != lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE " + "data read(%d) " + "type: %u strid: %u len: %u" + "ref: " LSN_FMT " " LSN_FMT " " + "lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref), LSN_IN_PARTS(lsn), + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 23 || + ref1 != lsn || + ref2 != first_lsn || + ((uchar)rec.header[22]) != 0x55 || + ((uchar)rec.header[21]) != 0xAA || + ((uchar)rec.header[20]) != 0x55 || + ((uchar)rec.header[19]) != 0xAA || + 
((uchar)rec.header[18]) != 0x55 || + ((uchar)rec.header[17]) != 0xAA || + ((uchar)rec.header[16]) != 0x55 || + ((uchar)rec.header[15]) != 0xAA || + ((uchar)rec.header[14]) != 0x55) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %u, ref1" LSN_FMT ", " + "ref2" LSN_FMT " %x%x%x%x%x%x%x%x%x " + "lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + (uint) rec.header[14], (uint) rec.header[15], + (uint) rec.header[16], (uint) rec.header[17], + (uint) rec.header[18], (uint) rec.header[19], + (uint) rec.header[20], (uint) rec.header[21], + (uint) rec.header[22], + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + read_ok(&rec); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header (var) " + "failed (%d)\n", i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration (first var) %u " + "instead of beginning of %u\n", i, ITERATIONS); + goto err; + } + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + if ((rec_len= rand_buffer_size()) < 12) + rec_len= 12; + if (rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE || + len != 12 || ref != lsn || + check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "data read(%d)" + "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), " + "hdr len: %u (%d), " + "ref" LSN_FMT ", lsn" LSN_FMT " (%d), content: %d\n", + i, (uint) rec.type, + rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + (uint) rec.short_trid, + rec.short_trid != (i % 0xFFFF), + (ulong) rec.record_length, (ulong) rec_len, + rec.record_length != rec_len + 
LSN_STORE_SIZE, + (uint) len, + len != 12, + LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn), + (len != 12 || ref != lsn), + check_content(rec.header + LSN_STORE_SIZE, + len - LSN_STORE_SIZE)); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "in whole rec read lsn" LSN_FMT "\n", + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + if ((rec_len= rand_buffer_size()) < 19) + rec_len= 19; + if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE * 2 || + len != 19 || + ref1 != lsn || + ref2 != first_lsn || + check_content(rec.header + LSN_STORE_SIZE * 2, + len - LSN_STORE_SIZE * 2)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, " + "ref1" LSN_FMT ", ref2" LSN_FMT ", " + "lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + LSN_IN_PARTS(rec.lsn)); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn" LSN_FMT "\n", + LSN_IN_PARTS(rec.lsn)); + goto err; + } + } + read_ok(&rec); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration %u " + "instead of beginning of %u\n", i, ITERATIONS); + goto err; + } + if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + 
rec.record_length != 6 || uint4korr(rec.header) != i || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(%d)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u " + "lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + (uint) uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + goto err; + } + lsn= rec.lsn; + read_ok(&rec); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if ((rec_len= rand_buffer_size()) < 9) + rec_len= 9; + if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len || + len != 9 || check_content(rec.header, (uint)len)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %lu != %lu, hdr len: %d, " + "lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, LSN_IN_PARTS(rec.lsn)); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 0)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn" LSN_FMT "\n", + LSN_IN_PARTS(rec.lsn)); + goto err; + } + read_ok(&rec); + translog_free_record_header(&rec); + } + } + + rc= 0; +err: + if (rc) + ok(0, "read record"); + } /* SKIP_BIG_TESTS */ + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove(maria_data_root)) + exit(1); + + my_uuid_end(); + my_free_open_file_info(); + my_end(0); + return(MY_TEST(exit_status())); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c new file mode 100644 index 00000000..21f6b7d7 --- /dev/null +++ 
b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c @@ -0,0 +1,165 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(const char *testdir); +extern char *create_tmpdir(const char *progname); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define LOG_FLAGS 0 + +/* Test of translog_first_lsn_in_log(): it must return LSN_IMPOSSIBLE while the log is empty and must equal translog_first_theoretical_lsn() once one record has been written. */ +int main(int argc __attribute__((unused)), char *argv[]) +{ + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn, first_lsn, theor_lsn; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + MY_INIT(argv[0]); + + plan(2); /* two ok() calls follow: the empty-log check and the one-record check */ + + bzero(&pagecache, sizeof(pagecache)); + /* + Don't give an error if we can't create dir, as it may already exist from a previously aborted + run + */ + maria_data_root= create_tmpdir(argv[0]); + if (maria_log_remove(0)) + exit(1); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) /* any extra command-line argument switches DBUG tracing on */ + { + DBUG_SET(default_dbug_option); + 
DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE,TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0, 0) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + theor_lsn= translog_first_theoretical_lsn(); + if (theor_lsn == 1) /* this test treats the value 1 as a failure to read the first log file */ + { + fprintf(stderr, "Error reading the first log file."); + translog_destroy(); + exit(1); + } + if (theor_lsn == LSN_IMPOSSIBLE) + { + fprintf(stderr, "There is no first log file."); + translog_destroy(); + exit(1); + } + first_lsn= translog_first_lsn_in_log(); + if (first_lsn != LSN_IMPOSSIBLE) /* no record written yet, so no first LSN is expected */ + { + fprintf(stderr, "Incorrect first lsn response " LSN_FMT ".", + LSN_IN_PARTS(first_lsn)); + translog_destroy(); + exit(1); + } + ok(1, "Empty log response"); + + + int4store(long_tr_id, 0); /* payload of the single 6-byte record written below */ + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + theor_lsn= translog_first_theoretical_lsn(); + if (theor_lsn == 1) + { + fprintf(stderr, "Error reading the first log file\n"); + translog_destroy(); + exit(1); + } + if (theor_lsn == LSN_IMPOSSIBLE) + { + fprintf(stderr, "There is no first log file\n"); + translog_destroy(); + exit(1); + } + first_lsn= translog_first_lsn_in_log(); + if (first_lsn != theor_lsn) /* with one record in the log both values must agree */ + { + fprintf(stderr, 
"Incorrect first lsn: " LSN_FMT " " + " theoretical first: " LSN_FMT "\n", + LSN_IN_PARTS(first_lsn), LSN_IN_PARTS(theor_lsn)); + translog_destroy(); + exit(1); + } + + ok(1, "Full log response"); + + translog_destroy(); /* tear down in reverse order of the initialisation above */ + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove(maria_data_root)) + exit(1); + + my_uuid_end(); + my_free_open_file_info(); + my_end(0); + + exit(0); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c new file mode 100644 index 00000000..391d7851 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c @@ -0,0 +1,163 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(const char *testdir); +extern char *create_tmpdir(const char *progname); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (8*1024L*1024L) +#define LOG_FLAGS 0 + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + ulong i; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn, max_lsn, last_lsn= LSN_IMPOSSIBLE; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + MY_INIT(argv[0]); + + plan(2); + + bzero(&pagecache, sizeof(pagecache)); + + maria_data_root= create_tmpdir(argv[0]); + if (maria_log_remove(0)) + exit(1); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0, 0) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= 
TRANSACTION_LOGGED_LONG_ID; + + max_lsn= translog_get_file_max_lsn_stored(1); + if (max_lsn == 1) + { + fprintf(stderr, "Error reading the first log file."); + translog_destroy(); + exit(1); + } + if (max_lsn != LSN_IMPOSSIBLE) + { + fprintf(stderr, "Incorrect first lsn response " LSN_FMT ".", + LSN_IN_PARTS(max_lsn)); + translog_destroy(); + exit(1); + } + ok(1, "Empty log response"); + + + /* write more then 1 file */ + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + for(i= 0; i < LOG_FILE_SIZE/6; i++) + { + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + if (LSN_FILE_NO(lsn) == 1) + last_lsn= lsn; + } + + + max_lsn= translog_get_file_max_lsn_stored(1); + if (max_lsn == 1) + { + fprintf(stderr, "Error reading the first log file\n"); + translog_destroy(); + exit(1); + } + if (max_lsn == LSN_IMPOSSIBLE) + { + fprintf(stderr, "Isn't first file still finished?!!\n"); + translog_destroy(); + exit(1); + } + if (max_lsn != last_lsn) + { + fprintf(stderr, "Incorrect max lsn: " LSN_FMT " " + " last lsn on first file: " LSN_FMT "\n", + LSN_IN_PARTS(max_lsn), LSN_IN_PARTS(last_lsn)); + translog_destroy(); + exit(1); + } + + ok(1, "First file max LSN"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove(maria_data_root)) + exit(1); + + my_uuid_end(); + my_free_open_file_info(); + my_end(0); + + exit(0); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_test_loghandler_multigroup-t.c b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c new file mode 100644 index 00000000..e8e114dd --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c @@ -0,0 +1,771 @@ +/* Copyright (C) 
2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" +#include "sequence_storage.h" +#include <my_getopt.h> + +extern my_bool maria_log_remove(const char *testdir); +extern char *create_tmpdir(const char *progname); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif +static TRN *trn= &dummy_transaction_object; + + +#ifndef READONLY_TEST + +#define PCACHE_SIZE (1024*1024*10) +#define LONG_BUFFER_SIZE ((1024L*1024L*1024L) + (1024L*1024L*512)) +#define MIN_REC_LENGTH (1024L*1024L + 1024L*512L + 1) +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define ITERATIONS 2 +#define READONLY 0 +#define BIG 1 + +#else + +#define PCACHE_SIZE (1024*1024*10) +#define LONG_BUFFER_SIZE (1024L*1024L) +#define MIN_REC_LENGTH (1024L) +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define ITERATIONS 2 +#define READONLY 1 +#undef BIG + +#endif /*READONLY_TEST*/ + + +/* +#define LOG_FILE_SIZE 1024L*1024L*3L +#define ITERATIONS 1600 +*/ +/* +#define LOG_FILE_SIZE 1024L*1024L*100L +#define ITERATIONS 65000 +*/ + + +/* + Check that the buffer filled correctly + + SYNOPSIS + check_content() + ptr Pointer to the buffer + length length of the buffer + + RETURN + 0 - OK + 1 - Error 
+*/ + +static my_bool check_content(uchar *ptr, ulong length) +{ + ulong i; + uchar buff[4]; + DBUG_ENTER("check_content"); + for (i= 0; i < length; i++) + { + if (i % 4 == 0) + int4store(buff, (i >> 2)); /* expected pattern: each aligned 4-byte group stores its own index (i / 4) */ + if (ptr[i] != buff[i % 4]) + { + fprintf(stderr, "Byte # %lu is %x instead of %x", + i, (uint) ptr[i], (uint) buff[i % 4]); + DBUG_DUMP("mem", ptr +(ulong) (i > 16 ? i - 16 : 0), + (i > 16 ? 16 : i) + (i + 16 < length ? 16 : length - i)); /* dump up to 16 bytes on each side of the mismatch */ + DBUG_RETURN(1); + } + } + DBUG_RETURN(0); +} + + +/* + Read whole record content, and check content (put with offset) + + SYNOPSIS + read_and_check_content() + rec The record header buffer + buffer The buffer to read the record in + skip Skip this number of bytes of the record content + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec, + uchar *buffer, uint skip) +{ + int res= 0; + translog_size_t len; + DBUG_ENTER("read_and_check_content"); + DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); + if ((len= translog_read_record(rec->lsn, 0, rec->record_length, + buffer, NULL)) != rec->record_length) + { + fprintf(stderr, "Requested %lu byte, read %lu\n", + (ulong) rec->record_length, (ulong) len); + res= 1; + } + res|= check_content(buffer + skip, rec->record_length - skip); /* the content check still runs after a short read */ + DBUG_RETURN(res); +} + +static const char *load_default_groups[]= {"ma_unit_loghandler", 0}; +#ifndef DBUG_OFF +static const char *default_dbug_option= + IF_WIN("d:t:i:O,\\ma_test_loghandler.trace", + "d:t:i:o,/tmp/ma_test_loghandler.trace"); +#endif +static const char *opt_wfile= NULL; +static const char *opt_rfile= NULL; +static struct my_option my_long_options[] = +{ +#ifndef DBUG_OFF + {"debug", '#', "Output debug log. 
Often the argument is 'd:t:o,filename'.", + 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0}, +#endif + {"write-seq", 'w', "Path to file in which \"random\" sequence used in the test will be written", + (uchar**) &opt_wfile, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"read-seq", 'r', "Path to file from which \"random\" sequence used in the test will be read", + (uchar**) &opt_rfile, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"help", '?', "Display this help and exit.", + 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; +static SEQ_STORAGE seq; + +static uint32 get_len() +{ /* Returns the next record length: read from seq when --read-seq was given, otherwise derived from rand() and, when --write-seq was given, also saved to that file. */ + uint32 res; + DBUG_ENTER("get_len"); + if (opt_rfile) + res= seq_storage_next(&seq); /* presumably the next stored value of the sequence; confirm in sequence_storage.h */ + else + { + res= (uint32) + ((ulonglong) rand() * + (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1) / RAND_MAX) + MIN_REC_LENGTH; /* scales rand() into MIN_REC_LENGTH .. LONG_BUFFER_SIZE - 1 */ + if (opt_wfile && + seq_storage_write(opt_wfile, res)) + exit(1); + } + DBUG_PRINT("info", ("length value : %lu", (ulong) res)); + DBUG_RETURN(res); +} + +static void usage(void) +{ /* Prints the copyright banner and usage line, then calls my_print_help(), print_defaults() and my_print_variables(). */ + puts("Copyright (C) 2008 MySQL AB"); + puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,"); + puts("and you are welcome to modify and redistribute it under the GPL license\n"); + + puts("Unit test of maria engine"); + printf("\nUsage: %s [OPTIONS]\n", my_progname_short); + my_print_help(my_long_options); + print_defaults("my", load_default_groups); + my_print_variables(my_long_options); +} + + +static my_bool +get_one_option(const struct my_option *opt, + const char *argument __attribute__((unused)), + const char *filename __attribute__((unused))) +{ /* Callback handed to handle_options() by get_options(): '?' prints usage and exits, '#' (debug builds only) sets the DBUG options; always returns 0 otherwise. */ + switch (opt->id) { + case '?': + usage(); + exit(0); +#ifndef DBUG_OFF + case '#': + DBUG_SET_INITIAL(argument ? 
argument : default_dbug_option); + break; +#endif + } + return 0; +} + + +static void get_options(int *argc,char ***argv) +{ /* Runs handle_options() over the command line; exits with its error code on failure, and exits with 1 after printing usage when both --read-seq and --write-seq are given. */ + int ho_error; + + if ((ho_error= handle_options(argc, argv, my_long_options, get_one_option))) + exit(ho_error); + + if (opt_rfile && opt_wfile) + { + usage(); + exit(1); + } +} + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uint32 i; + uint32 rec_len; + uchar long_tr_id[6]; + uchar lsn_buff[23]= + { + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, + 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55 + }; + uchar *long_buffer; + char **default_argv; + PAGECACHE pagecache; + LSN lsn, lsn_base, first_lsn; + TRANSLOG_HEADER_BUFFER rec; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 2]; + struct st_translog_scanner_data scanner; + const char *progname=argv[0]; + int rc; + MY_INIT(argv[0]); + + plan(0); // read configuration (MYTAP_CONFIG) +#ifdef BIG + if (skip_big_tests) + { + plan(1); + ok(1, "skipped as big test"); + my_end(0); + return 0; + } +#endif + + long_buffer= malloc(LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); + load_defaults_or_exit("my", load_default_groups, &argc, &argv); + default_argv= argv; + get_options(&argc, &argv); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= create_tmpdir(progname); + if (maria_log_remove(0)) + exit(1); + + /* We don't need to do physical syncs in this test */ + my_disable_sync= 1; + + { + uchar buff[4]; + for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i++) + { + if (i % 4 == 0) + int4store(buff, (i >> 2)); + long_buffer[i]= buff[i % 4]; + } + } + + bzero(long_tr_id, 6); + + if (ma_control_file_open(TRUE, TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, 0) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, 
LOG_FILE_SIZE, 50112, 0, &pagecache, + 0, 0, &translog_example_table_init, 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + plan(((ITERATIONS - 1) * 4 + 1) * 2); + + if (opt_rfile && + seq_storage_reader_init(&seq, opt_rfile)) + exit(1); + srand(122334817L); + + long_tr_id[5]= 0xff; + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + trn->short_id= 0; + trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1, parts, + NULL, NULL)) + { + fprintf(stderr, "Can't write record #%u\n", 0); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + lsn_base= first_lsn= lsn; + + for (i= 1; i < ITERATIONS; i++) + { + if (i % 2) + { + lsn_store(lsn_buff, lsn_base); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_1LSN_EXAMPLE, trn, NULL, + LSN_STORE_SIZE, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "1 Can't write reference before record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + rec_len= get_len(); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + trn, NULL, LSN_STORE_SIZE + rec_len, + 
TRANSLOG_INTERNAL_PARTS + 2, + parts, NULL, NULL)) + { + fprintf(stderr, "1 Can't write var reference before record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE"); + } + else + { + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + parts[TRANSLOG_INTERNAL_PARTS + 1].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= 23; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_2LSN_EXAMPLE, + trn, NULL, 23, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "0 Can't write reference before record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE"); + lsn_store(lsn_buff, lsn_base); + lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn); + rec_len= get_len(); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE * 2; + parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE, + trn, NULL, LSN_STORE_SIZE * 2 + rec_len, + TRANSLOG_INTERNAL_PARTS + 2, + parts, NULL, NULL)) + { + fprintf(stderr, "0 Can't write var reference before record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE"); + } + int4store(long_tr_id, i); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + trn, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%u\n", i); + translog_destroy(); + ok(0, "write 
LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE"); + + lsn_base= lsn; + + rec_len= get_len(); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len; + trn->short_id= i % 0xFFFF; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + trn, NULL, rec_len, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL)) + { + fprintf(stderr, "Can't write variable record #%u\n", i); + translog_destroy(); + ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + exit(1); + } + ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE"); + } + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + + if (ma_control_file_open(TRUE,TRUE,TRUE)) + { + fprintf(stderr, "pass2: Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, 0) == 0) + { + fprintf(stderr, "pass2: Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + 0, READONLY, &translog_example_table_init, 0)) + { + fprintf(stderr, "pass2: Can't init loghandler (%d)\n", errno); + exit(1); + } + + + /* If we were writing sequence we need it only once */ + opt_wfile= NULL; + if (opt_rfile) + seq_storage_rewind(&seq); + srand(122334817L); + + rc= 1; + + { + int len= translog_read_record_header(first_lsn, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "translog_read_record_header failed (%d)\n", errno); + translog_free_record_header(&rec); + goto err; + } + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 || + rec.record_length != 6 || uint4korr(rec.header) != 0 || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF || + first_lsn != rec.lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(0)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, 
" + LSN_FMT "\n", + (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length, + (uint)uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + ok(1, "read record"); + translog_free_record_header(&rec); + lsn= first_lsn; + if (translog_scanner_init(first_lsn, 1, &scanner, 0)) + { + fprintf(stderr, "scanner init failed\n"); + goto err; + } + for (i= 1;; i++) + { + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + translog_free_record_header(&rec); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + if (i != ITERATIONS) + { + fprintf(stderr, "EOL met at iteration %u instead of %u\n", + i, ITERATIONS); + translog_free_record_header(&rec); + goto err; + } + break; + } + + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != LSN_STORE_SIZE || ref != lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE " + "data read(%d)" + "type %u, strid %u, len %u, ref" LSN_FMT ", lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 23 || + ref1 != lsn || + ref2 != first_lsn || + ((uchar)rec.header[22]) != 0x55 || + ((uchar)rec.header[21]) != 0xAA || + ((uchar)rec.header[20]) != 0x55 || + ((uchar)rec.header[19]) != 0xAA || + ((uchar)rec.header[18]) != 0x55 || + ((uchar)rec.header[17]) != 0xAA || + ((uchar)rec.header[16]) != 0x55 || + ((uchar)rec.header[15]) != 0xAA || + 
((uchar)rec.header[14]) != 0x55) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %u, ref1" LSN_FMT ", " + "ref2" LSN_FMT " %x%x%x%x%x%x%x%x%x " + "lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + (uint) rec.header[14], (uint) rec.header[15], + (uint) rec.header[16], (uint) rec.header[17], + (uint) rec.header[18], (uint) rec.header[19], + (uint) rec.header[20], (uint) rec.header[21], + (uint) rec.header[22], + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + DBUG_ASSERT(0); + goto err; + } + } + ok(1, "read record"); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header (var) " + "failed (%d)\n", i, errno); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration (first var) %u " + "instead of beginning of %u\n", i, ITERATIONS); + goto err; + } + if (i % 2) + { + LSN ref; + ref= lsn_korr(rec.header); + rec_len= get_len(); + if (rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE || + len != 12 || ref != lsn || + check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "data read(%d)" + "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), " + "hdr len: %d (%d), " + "ref" LSN_FMT ", lsn" LSN_FMT " (%d), content: %d\n", + i, (uint) rec.type, + rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE, + (uint) rec.short_trid, + rec.short_trid != (i % 0xFFFF), + (ulong) rec.record_length, (ulong) rec_len, + rec.record_length != rec_len + LSN_STORE_SIZE, + len, + len != 12, + LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn), + (ref != lsn), + check_content(rec.header + 
LSN_STORE_SIZE, + len - LSN_STORE_SIZE)); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE " + "in whole rec read lsn" LSN_FMT "\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + else + { + LSN ref1, ref2; + ref1= lsn_korr(rec.header); + ref2= lsn_korr(rec.header + LSN_STORE_SIZE); + rec_len= get_len(); + if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len + LSN_STORE_SIZE * 2 || + len != 19 || + ref1 != lsn || + ref2 != first_lsn || + check_content(rec.header + LSN_STORE_SIZE * 2, + len - LSN_STORE_SIZE * 2)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + " data read(%d) " + "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, " + "ref1" LSN_FMT ", ref2" LSN_FMT ", " + "lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, + LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2), + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn" LSN_FMT "\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + ok(1, "read record"); + translog_free_record_header(&rec); + + len= translog_read_next_record_header(&scanner, &rec); + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + translog_free_record_header(&rec); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + fprintf(stderr, "EOL met at the middle of iteration %u " + "instead of beginning of %u\n", i, ITERATIONS); + translog_free_record_header(&rec); + goto err; + } + if (rec.type != 
LOGREC_FIXED_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != 6 || uint4korr(rec.header) != i || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(%d)\n" + "type %u, strid %u, len %u, i: %u, 4: %u 5: %u " + "lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (uint) rec.record_length, + (uint)uint4korr(rec.header), (uint) rec.header[4], + (uint) rec.header[5], + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + ok(1, "read record"); + translog_free_record_header(&rec); + + lsn= rec.lsn; + + len= translog_read_next_record_header(&scanner, &rec); + rec_len= get_len(); + if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE || + rec.short_trid != (i % 0xFFFF) || + rec.record_length != rec_len || + len != 9 || check_content(rec.header, len)) + { + fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "data read(%d) " + "type %u, strid %u, len %lu != %lu, hdr len: %d, " + "lsn" LSN_FMT "\n", + i, (uint) rec.type, (uint) rec.short_trid, + (ulong) rec.record_length, (ulong) rec_len, + len, LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 0)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE " + "in whole rec read lsn" LSN_FMT "\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + ok(1, "read record"); + translog_free_record_header(&rec); + } + } + + rc= 0; +err: + if (rc) + ok(0, "read record"); + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + free_defaults(default_argv); + seq_storage_destroy(&seq); + if (maria_log_remove(maria_data_root)) + exit(1); + + free(long_buffer); + + my_uuid_end(); + my_free_open_file_info(); + my_end(0); + + return (MY_TEST(exit_status())); +} + +#include "../ma_check_standalone.h" diff --git 
a/storage/maria/unittest/ma_test_loghandler_multithread-t.c b/storage/maria/unittest/ma_test_loghandler_multithread-t.c new file mode 100644 index 00000000..ec097ede --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_multithread-t.c @@ -0,0 +1,557 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(const char *testdir); +extern char *create_tmpdir(const char *progname); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) + +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +/*#define LOG_FLAGS TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC */ +#define LOG_FLAGS 0 +/*#define LONG_BUFFER_SIZE (1024L*1024L*1024L + 1024L*1024L*512)*/ + +#ifdef MULTIFLUSH_TEST + +#define LONG_BUFFER_SZ (16384L) +#define MIN_REC_LENGTH 10 +#define SHOW_DIVIDER 20 +#define ITERATIONS 10000 +#define FLUSH_ITERATIONS 1000 +#define WRITERS 2 +#define FLUSHERS 10 + +#else + +#define LONG_BUFFER_SZ (512L*1024L*1024L) +#define MIN_REC_LENGTH 30 +#define SHOW_DIVIDER 10 +#define ITERATIONS 3 +#define FLUSH_ITERATIONS 0 +#define WRITERS 3 +#define FLUSHERS 0 + +#endif + +#define LONG_BUFFER_SIZE (LONG_BUFFER_SZ >> (skip_big_tests ? 
4 : 0)) + +static uint number_of_writers= WRITERS; +static uint number_of_flushers= FLUSHERS; + +static pthread_cond_t COND_thread_count; +static pthread_mutex_t LOCK_thread_count; +static uint thread_count; + +static ulong lens[WRITERS][ITERATIONS]; +static LSN lsns1[WRITERS][ITERATIONS]; +static LSN lsns2[WRITERS][ITERATIONS]; +static uchar *long_buffer; + + +static LSN last_lsn; /* For test purposes the variable allow dirty read/write */ + +/* + Get pseudo-random length of the field in + limits [MIN_REC_LENGTH..LONG_BUFFER_SIZE] + + SYNOPSIS + get_len() + + RETURN + length - length >= 0 length <= LONG_BUFFER_SIZE +*/ + +static uint32 get_len() +{ + return MIN_REC_LENGTH + + (uint32)(((ulonglong)rand())* + (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1)/RAND_MAX); +} + + +/* + Check that the buffer filled correctly + + SYNOPSIS + check_content() + ptr Pointer to the buffer + length length of the buffer + + RETURN + 0 - OK + 1 - Error +*/ + +static my_bool check_content(uchar *ptr, ulong length) +{ + ulong i; + for (i= 0; i < length; i++) + { + if (((uchar)ptr[i]) != (i & 0xFF)) + { + fprintf(stderr, "Byte # %lu is %x instead of %x", + i, (uint) ptr[i], (uint) (i & 0xFF)); + return 1; + } + } + return 0; +} + + +/* + Read whole record content, and check content (put with offset) + + SYNOPSIS + read_and_check_content() + rec The record header buffer + buffer The buffer to read the record in + skip Skip this number of bytes ot the record content + + RETURN + 0 - OK + 1 - Error +*/ + + +static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec, + uchar *buffer, uint skip) +{ + int res= 0; + translog_size_t len; + + if ((len= translog_read_record(rec->lsn, 0, rec->record_length, + buffer, NULL)) != rec->record_length) + { + fprintf(stderr, "Requested %lu byte, read %lu\n", + (ulong) rec->record_length, (ulong) len); + res= 1; + } + res|= check_content(buffer + skip, rec->record_length - skip); + return(res); +} + +void writer(int num) +{ + LSN lsn; + TRN trn; + uchar 
long_tr_id[6]; + uint i; + + trn.short_id= num; + trn.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + for (i= 0; i < ITERATIONS; i++) + { + uint len= get_len(); + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + lens[num][i]= len; + + int2store(long_tr_id, num); + int4store(long_tr_id + 2, i); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write LOGREC_FIXED_RECORD_0LSN_EXAMPLE record #%lu " + "thread %i\n", (ulong) i, num); + translog_destroy(); + pthread_mutex_lock(&LOCK_thread_count); + ok(0, "write records"); + pthread_mutex_unlock(&LOCK_thread_count); + return; + } + lsns1[num][i]= lsn; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= len; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + &trn, NULL, + len, TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i); + translog_destroy(); + pthread_mutex_lock(&LOCK_thread_count); + ok(0, "write records"); + pthread_mutex_unlock(&LOCK_thread_count); + return; + } + lsns2[num][i]= lsn; + last_lsn= lsn; + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "write records"); + pthread_mutex_unlock(&LOCK_thread_count); + } + return; +} + + +static void *test_thread_writer(void *arg) +{ + int param= *((int*) arg); + + my_thread_init(); + + writer(param); + + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + ok(1, "writer finished"); /* just to show progress */ + pthread_cond_signal(&COND_thread_count); /* Tell main we are + ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + return(0); +} + + +static void *test_thread_flusher(void *arg) +{ + int param= *((int*) arg); + int i; + + my_thread_init(); + + for(i= 0; i 
< FLUSH_ITERATIONS; i++) + { + translog_flush(last_lsn); + pthread_mutex_lock(&LOCK_thread_count); + ok(1, "-- flush %d", param); + pthread_mutex_unlock(&LOCK_thread_count); + } + + pthread_mutex_lock(&LOCK_thread_count); + thread_count--; + ok(1, "flusher finished"); /* just to show progress */ + pthread_cond_signal(&COND_thread_count); /* Tell main we are + ready */ + pthread_mutex_unlock(&LOCK_thread_count); + free((uchar*) arg); + my_thread_end(); + return(0); +} + + +int main(int argc __attribute__((unused)), + char **argv __attribute__ ((unused))) +{ + uint32 i; + PAGECACHE pagecache; + LSN first_lsn; + TRANSLOG_HEADER_BUFFER rec; + struct st_translog_scanner_data scanner; + pthread_t tid; + pthread_attr_t thr_attr; + int *param, error; + int rc; + MY_INIT(argv[0]); + + // plan read MYTAP_CONFIG so skip_big_tests will be set before using + plan(WRITERS + FLUSHERS + + ITERATIONS * WRITERS * 3 + FLUSH_ITERATIONS * FLUSHERS ); + /* We don't need to do physical syncs in this test */ + my_disable_sync= 1; + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= create_tmpdir(argv[0]); + if (maria_log_remove(0)) + exit(1); + + long_buffer= malloc(LONG_BUFFER_SIZE + 7 * 2 + 2); + if (long_buffer == 0) + { + fprintf(stderr, "End of memory\n"); + exit(1); + } + for (i= 0; i < (uint32)(LONG_BUFFER_SIZE + 7 * 2 + 2); i++) + long_buffer[i]= (i & 0xFF); + +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + + if ((error= pthread_cond_init(&COND_thread_count, NULL))) + { + fprintf(stderr, "COND_thread_count: %d from pthread_cond_init " + "(errno: %d)\n", error, errno); + exit(1); + } + if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST))) + { + fprintf(stderr, "LOCK_thread_count: %d from pthread_cond_init " + "(errno: 
%d)\n", error, errno); + exit(1); + } + if ((error= pthread_attr_init(&thr_attr))) + { + fprintf(stderr, "Got error: %d from pthread_attr_init " + "(errno: %d)\n", error, errno); + exit(1); + } + if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED))) + { + fprintf(stderr, + "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n", + error, errno); + exit(1); + } + +#ifdef HAVE_THR_SETCONCURRENCY + thr_setconcurrency(2); +#endif + + if (ma_control_file_open(TRUE, TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + TRANSLOG_PAGE_SIZE, 0, 0) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + srand(122334817L); + { + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + uchar long_tr_id[6]= + { + 0x11, 0x22, 0x33, 0x44, 0x55, 0x66 + }; + + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&first_lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write the first record\n"); + translog_destroy(); + exit(1); + } + } + + + pthread_mutex_lock(&LOCK_thread_count); + while (number_of_writers != 0 || number_of_flushers != 0) + { + if (number_of_writers) + { + param= (int*) malloc(sizeof(int)); + *param= number_of_writers - 1; + if ((error= pthread_create(&tid, &thr_attr, test_thread_writer, + (void*) param))) + { + 
fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n", + error, errno); + exit(1); + } + thread_count++; + number_of_writers--; + } + if (number_of_flushers) + { + param= (int*) malloc(sizeof(int)); + *param= number_of_flushers - 1; + if ((error= pthread_create(&tid, &thr_attr, test_thread_flusher, + (void*) param))) + { + fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n", + error, errno); + exit(1); + } + thread_count++; + number_of_flushers--; + } + } + pthread_mutex_unlock(&LOCK_thread_count); + + pthread_attr_destroy(&thr_attr); + + /* wait finishing */ + pthread_mutex_lock(&LOCK_thread_count); + while (thread_count) + { + if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count))) + fprintf(stderr, "COND_thread_count: %d from pthread_cond_wait\n", error); + } + pthread_mutex_unlock(&LOCK_thread_count); + + /* Find last LSN and flush up to it (all our log) */ + { + LSN max= 0; + for (i= 0; i < WRITERS; i++) + { + if (cmp_translog_addr(lsns2[i][ITERATIONS - 1], max) > 0) + max= lsns2[i][ITERATIONS - 1]; + } + translog_flush(max); + } + + rc= 1; + + { + uint indeces[WRITERS]; + uint index, stage; + int len; + bzero(indeces, sizeof(uint) * WRITERS); + + bzero(indeces, sizeof(indeces)); + + if (translog_scanner_init(first_lsn, 1, &scanner, 0)) + { + fprintf(stderr, "scanner init failed\n"); + goto err; + } + for (i= 0;; i++) + { + len= translog_read_next_record_header(&scanner, &rec); + + if (len == RECHEADER_READ_ERROR) + { + fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n", + i, errno); + translog_free_record_header(&rec); + goto err; + } + if (len == RECHEADER_READ_EOF) + { + if (i != WRITERS * ITERATIONS * 2) + { + fprintf(stderr, "EOL met at iteration %u instead of %u\n", + i, ITERATIONS * WRITERS * 2); + translog_free_record_header(&rec); + goto err; + } + break; + } + index= indeces[rec.short_trid] / 2; + stage= indeces[rec.short_trid] % 2; + if (stage == 0) + { + if (rec.type 
!=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || + rec.record_length != 6 || + uint2korr(rec.header) != rec.short_trid || + index != uint4korr(rec.header + 2) || + cmp_translog_addr(lsns1[rec.short_trid][index], rec.lsn) != 0) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(%d)\n" + "type %u, strid %u %u, len %u, i: %u %u, " + "lsn" LSN_FMT " " LSN_FMT "\n", + i, (uint) rec.type, + (uint) rec.short_trid, (uint) uint2korr(rec.header), + (uint) rec.record_length, + (uint) index, (uint) uint4korr(rec.header + 2), + LSN_IN_PARTS(rec.lsn), + LSN_IN_PARTS(lsns1[rec.short_trid][index])); + translog_free_record_header(&rec); + goto err; + } + } + else + { + if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE || + len != 9 || + rec.record_length != lens[rec.short_trid][index] || + cmp_translog_addr(lsns2[rec.short_trid][index], rec.lsn) != 0 || + check_content(rec.header, (uint)len)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "data read(%d) " + "thread: %d, iteration %d, stage %d\n" + "type %u (%d), len %d, length %lu %lu (%d) " + "lsn" LSN_FMT " " LSN_FMT "\n", + i, (uint) rec.short_trid, index, stage, + (uint) rec.type, (rec.type != + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE), + len, + (ulong) rec.record_length, lens[rec.short_trid][index], + (rec.record_length != lens[rec.short_trid][index]), + LSN_IN_PARTS(rec.lsn), + LSN_IN_PARTS(lsns2[rec.short_trid][index])); + translog_free_record_header(&rec); + goto err; + } + if (read_and_check_content(&rec, long_buffer, 0)) + { + fprintf(stderr, + "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE " + "in whole rec read lsn" LSN_FMT "\n", + LSN_IN_PARTS(rec.lsn)); + translog_free_record_header(&rec); + goto err; + } + } + ok(1, "record read"); + translog_free_record_header(&rec); + indeces[rec.short_trid]++; + } + } + + rc= 0; +err: + if (rc) + ok(0, "record read"); + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove(maria_data_root)) 
+ exit(1); + + my_uuid_end(); + my_free_open_file_info(); + my_end(0); + return(exit_status()); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_test_loghandler_noflush-t.c b/storage/maria/unittest/ma_test_loghandler_noflush-t.c new file mode 100644 index 00000000..46b3a8e7 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_noflush-t.c @@ -0,0 +1,147 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(const char *testdir); +extern char *create_tmpdir(const char *progname); +extern void translog_example_table_init(); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define LOG_FLAGS 0 + +int main(int argc __attribute__((unused)), char *argv[]) +{ + int rc= 1; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN first_lsn; + TRANSLOG_HEADER_BUFFER rec; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + translog_size_t len; + + MY_INIT(argv[0]); + + plan(1); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= create_tmpdir(argv[0]); + if (maria_log_remove(0)) + exit(1); + + 
bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0, 0) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + int4store(long_tr_id, 0); + long_tr_id[5]= 0xff; + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&first_lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + len= translog_read_record_header(first_lsn, &rec); + if (len == 0) + { + fprintf(stderr, "translog_read_record_header failed (%d)\n", errno); + goto err; + } + if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 || + rec.record_length != 6 || uint4korr(rec.header) != 0 || + ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF || + first_lsn != rec.lsn) + { + fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE " + "data read(0)\n" + "type: %u (%d) strid: %u (%d) len: %u (%d) i: %u (%d), " + "4: %u (%d) 5: %u (%d) " + "lsn" LSN_FMT " (%d)\n", + (uint) rec.type, (rec.type 
!=LOGREC_FIXED_RECORD_0LSN_EXAMPLE), + (uint) rec.short_trid, (rec.short_trid != 0), + (uint) rec.record_length, (rec.record_length != 6), + (uint) uint4korr(rec.header), (uint4korr(rec.header) != 0), + (uint) rec.header[4], (((uchar)rec.header[4]) != 0), + (uint) rec.header[5], (((uchar)rec.header[5]) != 0xFF), + LSN_IN_PARTS(rec.lsn), (first_lsn != rec.lsn)); + goto err; + } + + ok(1, "read OK"); + rc= 0; + +err: + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove(maria_data_root)) + exit(1); + + my_uuid_end(); + my_free_open_file_info(); + my_end(0); + + exit(rc); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_test_loghandler_nologs-t.c b/storage/maria/unittest/ma_test_loghandler_nologs-t.c new file mode 100644 index 00000000..b95d8bee --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_nologs-t.c @@ -0,0 +1,203 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(const char *testdir); +extern char *create_tmpdir(const char *progname); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (8*1024L*1024L) +#define LOG_FLAGS 0 +#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2) + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + ulong i; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + uchar *long_buffer= malloc(LONG_BUFFER_SIZE); + + MY_INIT(argv[0]); + + plan(2); + + bzero(&pagecache, sizeof(pagecache)); + bzero(long_buffer, LONG_BUFFER_SIZE); + maria_data_root= create_tmpdir(argv[0]); + if (maria_log_remove(0)) + exit(1); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0, 0) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing 
*/ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + /* write more then 1 file */ + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #0\n"); + translog_destroy(); + exit(1); + } + + for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++) + { + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #0\n"); + translog_destroy(); + exit(1); + } + } + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + + { + char file_name[FN_REFLEN]; + for (i= 1; i <= 2; i++) + { + translog_filename_by_fileno(i, file_name); + if (my_access(file_name, W_OK)) + { + fprintf(stderr, "No file '%s'\n", file_name); + exit(1); + } + if (my_delete(file_name, MYF(MY_WME)) != 0) + { + fprintf(stderr, "Error %d during removing file'%s'\n", + errno, file_name); + exit(1); + } + } + } + + if (ma_control_file_open(TRUE, TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0, 0) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 1)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + ok(1, "Log init OK"); + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + 
parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #0\n"); + translog_destroy(); + exit(1); + } + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + + if (!translog_is_file(3)) + { + fprintf(stderr, "No file #3\n"); + exit(1); + } + + ok(1, "New log is OK"); + + if (maria_log_remove(maria_data_root)) + exit(1); + + free(long_buffer); + + my_uuid_end(); + my_free_open_file_info(); + my_end(0); + + exit(0); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_test_loghandler_pagecache-t.c b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c new file mode 100644 index 00000000..892a773b --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c @@ -0,0 +1,183 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(const char *testdir); +extern char *create_tmpdir(const char *progname); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512) +#define LOG_FLAGS 0 + +static const char *base_first_translog_file= "aria_log.00000001"; +static const char *base_file1_name= "page_cache_test_file_1"; +static char file1_name[FN_REFLEN], first_translog_file[FN_REFLEN]; + +static PAGECACHE_FILE file1; + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn; + my_off_t file_size; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + MY_INIT(argv[0]); + + plan(1); + + bzero(&pagecache, sizeof(pagecache)); + maria_data_root= create_tmpdir(argv[0]); + if (maria_log_remove(0)) + exit(1); + fn_format(first_translog_file, base_first_translog_file, maria_data_root, "", MYF(0)); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler_pagecache.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler_pagecache.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if (init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0, 0) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if 
(translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + if ((file1.file= my_open(first_translog_file, O_RDONLY, MYF(MY_WME))) < 0) + { + fprintf(stderr, "There is no %s (%d)\n", first_translog_file, errno); + exit(1); + } + file_size= my_seek(file1.file, 0, SEEK_END, MYF(MY_WME)); + if (file_size != TRANSLOG_PAGE_SIZE) + { + fprintf(stderr, + "incorrect initial size of %s: %ld instead of %ld\n", + first_translog_file, (long)file_size, (long)TRANSLOG_PAGE_SIZE); + exit(1); + } + my_close(file1.file, MYF(MY_WME)); + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + fn_format(file1_name, base_file1_name, maria_data_root, "", MYF(0)); + if ((file1.file= my_open(file1_name, + O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1) + { + fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n", + errno); + exit(1); + } + pagecache_file_set_null_hooks(&file1); + file1.flush_log_callback= maria_flush_log_for_page; + + if (my_chmod(file1_name, 0777, MYF(MY_WME))) + exit(1); + + { + uchar page[PCACHE_PAGE]; + + bzero(page, PCACHE_PAGE); + lsn_store(page, lsn); + pagecache_write(&pagecache, &file1, 0, 3, page, + PAGECACHE_LSN_PAGE, + PAGECACHE_LOCK_LEFT_UNLOCKED, + PAGECACHE_PIN_LEFT_UNPINNED, + PAGECACHE_WRITE_DELAY, + 0, LSN_IMPOSSIBLE); + flush_pagecache_blocks(&pagecache, &file1, 
FLUSH_RELEASE); + } + my_close(file1.file, MYF(MY_WME)); + if ((file1.file= my_open(first_translog_file, O_RDONLY, MYF(MY_WME))) < 0) + { + fprintf(stderr, "can't open %s (%d)\n", first_translog_file, errno); + exit(1); + } + file_size= my_seek(file1.file, 0, SEEK_END, MYF(MY_WME)); + if (file_size != TRANSLOG_PAGE_SIZE * 2) + { + fprintf(stderr, + "incorrect initial size of %s: %ld instead of %ld\n", + first_translog_file, + (long)file_size, (long)(TRANSLOG_PAGE_SIZE * 2)); + ok(0, "log triggered"); + exit(1); + } + my_close(file1.file, MYF(MY_WME)); + ok(1, "log triggered"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + my_delete(file1_name, MYF(MY_WME)); + + if (maria_log_remove(maria_data_root)) + exit(1); + + my_uuid_end(); + my_free_open_file_info(); + my_end(0); + + exit(0); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_test_loghandler_purge-t.c b/storage/maria/unittest/ma_test_loghandler_purge-t.c new file mode 100644 index 00000000..07b50f19 --- /dev/null +++ b/storage/maria/unittest/ma_test_loghandler_purge-t.c @@ -0,0 +1,199 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include <stdio.h> +#include <errno.h> +#include <tap.h> +#include "../trnman.h" + +extern my_bool maria_log_remove(const char *testdir); +extern char *create_tmpdir(const char *progname); + +#ifndef DBUG_OFF +static const char *default_dbug_option; +#endif + +#define PCACHE_SIZE (1024*1024*10) +#define PCACHE_PAGE TRANSLOG_PAGE_SIZE +#define LOG_FILE_SIZE (8*1024L*1024L) +#define LOG_FLAGS 0 +#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2) + + +int main(int argc __attribute__((unused)), char *argv[]) +{ + ulong i; + size_t pagen; + uchar long_tr_id[6]; + PAGECACHE pagecache; + LSN lsn; + LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1]; + uchar *long_buffer= malloc(LONG_BUFFER_SIZE); + + MY_INIT(argv[0]); + + plan(4); + + bzero(&pagecache, sizeof(pagecache)); + bzero(long_buffer, LONG_BUFFER_SIZE); + maria_data_root= create_tmpdir(argv[0]); + if (maria_log_remove(0)) + exit(1); + + bzero(long_tr_id, 6); +#ifndef DBUG_OFF +#if defined(_WIN32) + default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace"; +#else + default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace"; +#endif + if (argc > 1) + { + DBUG_SET(default_dbug_option); + DBUG_SET_INITIAL(default_dbug_option); + } +#endif + + if (ma_control_file_open(TRUE, TRUE, TRUE)) + { + fprintf(stderr, "Can't init control file (%d)\n", errno); + exit(1); + } + if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0, + PCACHE_PAGE, 0, 0)) == 0) + { + fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno); + exit(1); + } + if (translog_init_with_table(maria_data_root, LOG_FILE_SIZE, 50112, 0, &pagecache, + LOG_FLAGS, 0, &translog_example_table_init, + 0)) + { + fprintf(stderr, "Can't init loghandler (%d)\n", errno); + exit(1); + } + /* Suppressing of 
automatic record writing */ + dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; + + /* write more then 1 file */ + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + + translog_purge(lsn); + if (!translog_is_file(1)) + { + fprintf(stderr, "First file was removed after first record\n"); + translog_destroy(); + exit(1); + } + ok(1, "First is not removed"); + + for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++) + { + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write record #%lu\n", (ulong) 0); + translog_destroy(); + exit(1); + } + } + + translog_purge(lsn); + if (translog_is_file(1)) + { + fprintf(stderr, "First file was not removed.\n"); + translog_destroy(); + exit(1); + } + + ok(1, "First file is removed"); + + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer; + parts[TRANSLOG_INTERNAL_PARTS + 0].length= LONG_BUFFER_SIZE; + if (translog_write_record(&lsn, + LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, LONG_BUFFER_SIZE, + TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL)) + { + fprintf(stderr, "Can't write variable record\n"); + translog_destroy(); + exit(1); + } + + translog_purge(lsn); + if (!translog_is_file(2) || !translog_is_file(3)) + { + fprintf(stderr, "Second file (%d) or third file (%d) is not present.\n", + translog_is_file(2), translog_is_file(3)); + translog_destroy(); + exit(1); + } + + ok(1, "Second and third files are not removed"); + + int4store(long_tr_id, 0); + parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id; + 
parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6; + if (translog_write_record(&lsn, + LOGREC_FIXED_RECORD_0LSN_EXAMPLE, + &dummy_transaction_object, NULL, 6, + TRANSLOG_INTERNAL_PARTS + 1, + parts, NULL, NULL)) + { + fprintf(stderr, "Can't write last record\n"); + translog_destroy(); + exit(1); + } + + translog_purge(lsn); + if (translog_is_file(2)) + { + fprintf(stderr, "Second file is not removed\n"); + translog_destroy(); + exit(1); + } + + ok(1, "Second file is removed"); + + translog_destroy(); + end_pagecache(&pagecache, 1); + ma_control_file_end(); + if (maria_log_remove(maria_data_root)) + exit(1); + + my_uuid_end(); + my_free_open_file_info(); + my_end(0); + + exit(0); +} + +#include "../ma_check_standalone.h" diff --git a/storage/maria/unittest/ma_test_recovery.expected b/storage/maria/unittest/ma_test_recovery.expected new file mode 100644 index 00000000..38e8e4d8 --- /dev/null +++ b/storage/maria/unittest/ma_test_recovery.expected @@ -0,0 +1,1578 @@ +Testing the REDO PHASE ALONE +TEST WITH ma_test1 -s -M -T -c +applying log +testing idempotency +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -d500 +applying log +testing idempotency +applying log +TEST WITH ma_test2 -s -M -T -c -b65000 +applying log +testing idempotency +applying log +TEST WITH ma_test2 -s -M -T -c -b65000 -d800 +applying log +testing idempotency +applying log +TEST WITH ma_test1 -s -M -T -c -C +applying log +testing idempotency +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -d500 -C +applying log +testing idempotency +applying log +Testing the REDO AND UNDO PHASE +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) 
+Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=2 (additional 
aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request 
without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=4 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=4 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=4 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=4 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -H1 --testflag=2 --test-undo=4 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -H2 --versioning --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -H2 --testflag=3 --test-undo=4 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t2 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -H1 -t6 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=1 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=1 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=1 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not 
yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A1 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=2 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=2 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=2 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not 
yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A2 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=3 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=3 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=3 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not 
yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A3 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end) +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=4 (additional aborted work) +Terminating after inserts +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end) +Terminating after updates +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=4 (additional aborted work) +Terminating after deletes +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end) +Terminating after inserts +TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=4 (additional aborted work) +Terminating after updates +Dying on request without maria_commit()/maria_close() +applying log +testing idempotency +applying log +testing applying of CLRs to recreate table +applying log +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not 
yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end) +TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A4 (additional aborted work) +Dying on request without maria_commit()/maria_close() +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing idempotency +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! +========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= +testing applying of CLRs to recreate table +applying log +Differences in aria_chk -dvv, recovery not yet perfect ! 
+========DIFF START======= +7c7 +< Status: checked,analyzed,optimized keys,sorted index pages,zerofilled +--- +> Status: changed +========DIFF END======= diff --git a/storage/maria/unittest/ma_test_recovery.pl b/storage/maria/unittest/ma_test_recovery.pl new file mode 100755 index 00000000..fdc7ee68 --- /dev/null +++ b/storage/maria/unittest/ma_test_recovery.pl @@ -0,0 +1,496 @@ +#!/usr/bin/env perl + +use Getopt::Long; +use File::Copy; +use File::Compare; +use File::Basename; +use Digest::MD5; + +$|= 1; +$^W = 1; # warnings, because env cannot parse 'perl -w' +$VER= "1.2"; + +$opt_version= 0; +$opt_help= 0; +$opt_verbose= 0; +$opt_abort_on_error=0; + +my $silent= "-s"; +my $maria_path; # path to "storage/maria" +my $maria_exe_path; # path to executables (ma_test1, aria_chk etc) +my $tmp= "./tmp"; +my $my_progname= $0; +my $suffix; +my $zerofilled_tables= 0; + +$my_progname=~ s/.*[\/]//; +$maria_path= dirname($0) . "/.."; + +main(); + +#### +#### main function +#### + +sub main +{ + my ($res, $table); + + if (!GetOptions("abort-on-error", "help", "version", "verbose")) + { + $flag_exit= 1; + } + if ($opt_version) + { + print "$my_progname version $VER\n"; + exit(0); + } + usage() if ($opt_help || $flag_exit); + + $suffix= ( $^O =~ /win/i && $^O !~ /darwin/i ) ? ".exe" : ""; + $maria_exe_path= "$maria_path/release"; + # we use -f, sometimes -x is unexpectedly false in Cygwin + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + $maria_exe_path= "$maria_path/relwithdebinfo"; + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + $maria_exe_path= "$maria_path/debug"; + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + $maria_exe_path= $maria_path; + if ( ! -f "$maria_exe_path/ma_test1$suffix" ) + { + die("Cannot find ma_test1 executable\n"); + } + } + } + } + + # test data is always put in the current directory or a tmp subdirectory + # of it + + if (! 
-d "$tmp") + { + mkdir $tmp; + } + print "ARIA RECOVERY TESTS\n"; + + # To not flood the screen, we redirect all the commands below to a text file + # and just give a final error if their output is not as expected + + open (MY_LOG, ">$tmp/ma_test_recovery.output") or die "Can't open log file\n"; + print MY_LOG "Testing the REDO PHASE ALONE\n"; + + # runs a program inserting/deleting rows, then moves the resulting table + # elsewhere; applies the log and checks that the data file is + # identical to the saved original. + + my @t= ("ma_test1$suffix $silent -M -T -c", + "ma_test2$suffix $silent -L -K -W -P -M -T -c -d500", + "ma_test2$suffix $silent -M -T -c -b65000", + "ma_test2$suffix $silent -M -T -c -b65000 -d800", + "ma_test1$suffix $silent -M -T -c -C", + "ma_test2$suffix $silent -L -K -W -P -M -T -c -d500 -C", + #"ma_rt_test$suffix $silent -M -T -c -C", + # @todo: also add to @t2 + ); + + foreach my $prog (@t) + { + unlink <aria_log.* aria_log_control>; + my $prog_no_suffix= $prog; + $prog_no_suffix=~ s/$suffix// if ($suffix); + print MY_LOG "TEST WITH $prog_no_suffix\n"; + $res= my_exec("$maria_exe_path/$prog"); + print MY_LOG $res; + # derive table's name from program's name + if ($prog =~ m/^ma_(\S+)\s.*/) + { + $table= $1; + } + else + { + die("can't guess table name"); + } + $com= "$maria_exe_path/aria_chk$suffix -dvv $table "; + $com.= "| grep -v \"Creation time:\" | grep -v \"recover time:\" | grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\""; + $com.= "> $tmp/aria_chk_message.good.txt 2>&1"; + my_exec($com); + my $checksum= my_exec("$maria_exe_path/aria_chk$suffix -dss $table"); + move("$table.MAD", "$tmp/$table-good.MAD") || + die "Can't move $table.MAD to $tmp/$table-good.MAD\n"; + move("$table.MAI", "$tmp/$table-good.MAI") || + die "Can't move $table.MAI to $tmp/$table-good.MAI\n"; + apply_log($table, "shouldnotchangelog"); + check_table_is_same($table, $checksum); + $res= physical_cmp($table, "$tmp/$table-good"); + print MY_LOG $res; 
+ print MY_LOG "testing idempotency\n"; + apply_log($table, "shouldnotchangelog"); + check_table_is_same($table, $checksum); + $res= physical_cmp($table, "$tmp/$table-good"); + print MY_LOG $res; + } + + print MY_LOG "Testing the REDO AND UNDO PHASE\n"; + # The test programs look like: + # work; commit (time T1); work; exit-without-commit (time T2) + # We first run the test program and let it exit after T1's commit. + # Then we run it again and let it exit at T2. Then we compare + # and expect identity. + + my @take_checkpoints= ("no", "yes"); + my @blobs= ("", "-b32768"); + my @test_undo= (1, 2, 3, 4); + my @t2= ("ma_test1$suffix $silent -M -T -c -N blob -H1", + "--testflag=1", + "--testflag=2 --test-undo=", + "ma_test1$suffix $silent -M -T -c -N blob -H2", + "--testflag=3", + "--testflag=4 --test-undo=", + "ma_test1$suffix $silent -M -T -c -N blob -H2 --versioning", + "--testflag=3", + "--testflag=4 --test-undo=", + "ma_test1$suffix $silent -M -T -c -N blob -H2", + "--testflag=2", + "--testflag=3 --test-undo=", + "ma_test2$suffix $silent -L -K -W -P -M -T -c blob -H1", + "-t1", + "-t2 -A", + "ma_test2$suffix $silent -L -K -W -P -M -T -c blob -H1", + "-t1", + "-t6 -A"); + + foreach my $take_checkpoint (@take_checkpoints) + { + my ($i, $j, $k, $commit_run_args, $abort_run_args); + # we test table without blobs and then table with blobs + for ($i= 0; defined($blobs[$i]); $i++) + { + for ($j= 0; defined($test_undo[$j]); $j++) + { + # first iteration tests rollback of insert, second tests rollback of delete + # -N (create NULL fields) is needed because --test-undo adds it anyway + for ($k= 0; defined($t2[$k]); $k+= 3) + { + $prog= $t2[$k]; + $prog=~ s/blob/$blobs[$i]/; + if ("$take_checkpoint" eq "no") { + $prog=~ s/\s+\-H[0-9]+//; + } + $commit_run_args= $t2[$k + 1]; + $abort_run_args= $t2[$k + 2]; + unlink <aria_log.* aria_log_control>; + my $prog_no_suffix= $prog; + $prog_no_suffix=~ s/$suffix// if ($suffix); + print MY_LOG "TEST WITH $prog_no_suffix 
$commit_run_args (commit at end)\n"; + $res= my_exec("$maria_exe_path/$prog $commit_run_args"); + print MY_LOG $res; + # derive table's name from program's name + if ($prog =~ m/^ma_(\S+)\s.*/) + { + $table= $1; + } + else + { + die("can't guess table name"); + } + $com= "$maria_exe_path/aria_chk$suffix -dvv $table "; + $com.= "| grep -v \"Creation time:\" | grep -v \"recover time:\" | grep -v \"recover time:\" |grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\" "; + $com.= "> $tmp/aria_chk_message.good.txt 2>&1"; + $res= my_exec($com); + print MY_LOG $res; + $checksum= my_exec("$maria_exe_path/aria_chk$suffix -dss $table"); + move("$table.MAD", "$tmp/$table-good.MAD") || + die "Can't move $table.MAD to $tmp/$table-good.MAD\n"; + move("$table.MAI", "$tmp/$table-good.MAI") || + die "Can't move $table.MAI to $tmp/$table-good.MAI\n"; + unlink <aria_log.* aria_log_control>; + print MY_LOG "TEST WITH $prog_no_suffix $abort_run_args$test_undo[$j] (additional aborted work)\n"; + $res= my_exec("$maria_exe_path/$prog $abort_run_args$test_undo[$j]"); + print MY_LOG $res; + copy("$table.MAD", "$tmp/$table-before_undo.MAD") || + die "Can't copy $table.MAD to $tmp/$table-before_undo.MAD\n"; + copy("$table.MAI", "$tmp/$table-before_undo.MAI") || + die "Can't copy $table.MAI to $tmp/$table-before_undo.MAI\n"; + + # The lines below seem unneeded, will be removed soon + # We have to copy and restore logs, as running aria_read_log will + # change the aria_control_file + # rm -f $tmp/aria_log.* $tmp/aria_log_control + # cp $maria_path/aria_log* $tmp + + if ($test_undo[$j] != 3) { + apply_log($table, "shouldchangelog"); # should undo aborted work + } else { + # probably nothing to undo went to log or data file + apply_log($table, "dontknow"); + } + copy("$table.MAD", "$tmp/$table-after_undo.MAD") || + die "Can't copy $table.MAD to $tmp/$table-after_undo.MAD\n"; + copy("$table.MAI", "$tmp/$table-after_undo.MAI") || + die "Can't copy $table.MAI to 
$tmp/$table-after_undo.MAI\n"; + + # It is impossible to do a "cmp" between .good and .after_undo, + # because the UNDO phase generated log + # records whose LSN tagged pages. Another reason is that rolling back + # INSERT only marks the rows free, does not empty them + # (optimization), so traces of the INSERT+rollback remain. + + check_table_is_same($table, $checksum); + print MY_LOG "testing idempotency\n"; + apply_log($table, "shouldnotchangelog"); + check_table_is_same($table, $checksum); + $res= physical_cmp($table, "$tmp/$table-after_undo"); + print MY_LOG $res; + print MY_LOG "testing applying of CLRs to recreate table\n"; + unlink <$table.MA?>; + # cp $tmp/aria_log* $maria_path #unneeded + apply_log($table, "shouldnotchangelog"); + check_table_is_same($table, $checksum); + $res= physical_cmp($table, "$tmp/$table-after_undo"); + print MY_LOG $res; + } + unlink <$table.* $tmp/$table* $tmp/aria_chk_*.txt $tmp/aria_read_log_$table.txt>; + } + } + } + + if ($? >> 8) { + print "Some test failed\n"; + exit(1); + } + + close(MY_LOG); + # also note that aria_chk -dvv shows differences for ma_test2 in UNDO phase, + # this is normal: removing records does not shrink the data/key file, + # does not put back the "analyzed,optimized keys"(etc) index state. + `diff -b $maria_path/unittest/ma_test_recovery.expected $tmp/ma_test_recovery.output`; + if ($? 
>> 8) { + print "UNEXPECTED OUTPUT OF TESTS, FAILED"; + print " (zerofilled $zerofilled_tables tables)\n"; + print "For more info, do diff -b $maria_path/unittest/ma_test_recovery.expected "; + print "$tmp/ma_test_recovery.output\n"; + exit(1); + } + print "ALL RECOVERY TESTS OK (zerofilled $zerofilled_tables tables)\n"; +} +

####
#### check_table_is_same
####
#### Compares the current state of $table with the reference captured before
#### recovery: the "aria_chk -dss" output must equal $checksum, and any
#### difference between the filtered "aria_chk -dvv" outputs is copied into
#### MY_LOG between DIFF START / DIFF END markers.
####
#### Returns 1 if the checksum differs, 0 otherwise.
####

sub check_table_is_same
{
  my ($table, $checksum)= @_;
  my ($com, $checksum2, $res);

  # Computes checksum of new table and compares to checksum of old table
  # Shows any difference in table's state (info from the index's header)
  # Data/key file length is random in ma_test2 (as it uses srand() which
  # may differ between machines).

  if ($opt_verbose)
  {
    print "checking if table $table has changed\n";
  }

  # Volatile lines (timestamps, lengths, LSNs, UUID) are filtered out so that
  # only meaningful state differences remain in the message file.
  $com= "$maria_exe_path/aria_chk$suffix -dvv $table | grep -v \"Creation time:\" | grep -v \"recover time:\"";
  $com.= "| grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\" > $tmp/aria_chk_message.txt 2>&1";
  $res= my_exec2($com);
  print MY_LOG $res;
  $res= my_exec2("$maria_exe_path/aria_chk$suffix -ss -e --read-only $table");
  print MY_LOG $res;
  $checksum2= my_exec2("$maria_exe_path/aria_chk$suffix -dss $table");
  if ("$checksum" ne "$checksum2")
  {
    print MY_LOG "checksum differs for $table before and after recovery\n";
    return 1;
  }

  # "|| true": diff exits non-zero when the files differ, which is not an
  # execution error here; the difference is reported through the log instead.
  $com= "diff $tmp/aria_chk_message.good.txt $tmp/aria_chk_message.txt ";
  $com.= "> $tmp/aria_chk_diff.txt || true";
  $res= my_exec2($com);
  print MY_LOG $res;

  if (-s "$tmp/aria_chk_diff.txt")
  {
    print MY_LOG "Differences in aria_chk -dvv, recovery not yet perfect !\n";
    print MY_LOG "========DIFF START=======\n";
    open(MY_FILE, "<$tmp/aria_chk_diff.txt") || die "Can't open file aria_chk_diff.txt\n";
    while (<MY_FILE>)
    {
      print MY_LOG $_;
    }
    close(MY_FILE);
    print MY_LOG "========DIFF END=======\n";
  }
  return 0;
}

####
#### apply_log
####
#### Runs "aria_read_log -a" and, by comparing MD5 digests of the aria_log.*
#### files taken before and after, verifies whether the log was modified.
####
#### $shouldchangelog is one of:
####   "shouldnotchangelog"  complain if the log changed
####   "shouldchangelog"     complain if the log did not change
####   "dontknow"            accept both outcomes
####
#### Returns 1 on a bad argument or an unexpected outcome, 0 otherwise.
####

sub apply_log
{
  my ($table, $shouldchangelog)= @_;
  # Start from empty strings: when no aria_log.* file exists the loops below
  # do not run, and comparing undefined values would warn under $^W.
  my ($log_md5, $log_md5_2)= ("", "");

  # applies log, can verify if applying did write to log or not

  if ("$shouldchangelog" ne "shouldnotchangelog" &&
      "$shouldchangelog" ne "shouldchangelog" &&
      "$shouldchangelog" ne "dontknow" )
  {
    print MY_LOG "bad argument '$shouldchangelog'\n";
    return 1;
  }
  foreach (<aria_log.*>)
  {
    $log_md5.= md5_conv($_);
  }
  print MY_LOG "applying log\n";
  my_exec("$maria_exe_path/aria_read_log$suffix -a > $tmp/aria_read_log_$table.txt");
  foreach (<aria_log.*>)
  {
    $log_md5_2.= md5_conv($_);
  }
  if ("$log_md5" ne "$log_md5_2" )
  {
    if ("$shouldchangelog" eq "shouldnotchangelog")
    {
      print MY_LOG "aria_read_log should not have modified the log\n";
      return 1;
    }
  }
  elsif ("$shouldchangelog" eq "shouldchangelog")
  {
    print MY_LOG "aria_read_log should have modified the log\n";
    return 1;
  }
  return 0;
}

####
#### md5_conv
####
#### Returns the hexadecimal MD5 digest of $file followed by a newline.
#### Dies if the file cannot be opened.
####

sub md5_conv
{
  my ($file)= @_;

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as an open mode, and the handle cannot clash with a global.
  open(my $fh, '<', $file) or die "Can't open '$file': $!\n";
  binmode($fh);
  my $md5= Digest::MD5->new;
  $md5->addfile($fh);
  close($fh);
  return $md5->hexdigest . "\n";
}

####
#### physical_cmp: compares two tables (MAI and MAD) physically;
#### uses zerofill-keep-lsn to reduce irrelevant differences.
####

# Compares "$table1.MAD"/"$table1.MAI" with "$table2.MAD"/"$table2.MAI".
# On the first difference both tables are zerofilled (keeping LSNs) and the
# comparison is repeated; the original files are restored before returning.
# Returns "" when the files match, otherwise one "... differ" line per
# differing file pair.
sub physical_cmp
{
  my ($table1, $table2)= @_;
  my ($zerofilled, $ret_text)= (0, "");
  #return `cmp $table1.MAD $table2.MAD`.`cmp $table1.MAI $table2.MAI`;
  foreach my $file_suffix ("MAD", "MAI")
  {
    my $file1= "$table1.$file_suffix";
    my $file2= "$table2.$file_suffix";
    my $res= File::Compare::compare($file1, $file2);
    die("Can't compare $file1 and $file2\n") if ($res == -1);
    if ($res == 1 # they differ
        and !$zerofilled)
    {
      # let's try with --zerofill-keep-lsn
      $zerofilled= 1; # but no need to do it twice
      $zerofilled_tables= $zerofilled_tables + 1;
      my $table_no= 1;
      foreach my $table ($table1, $table2)
      {
        # save original tables to restore them later
        copy("$table.MAD", "$tmp/before_zerofill$table_no.MAD") ||
          die("Can't copy $table.MAD to $tmp/before_zerofill$table_no.MAD\n");
        copy("$table.MAI", "$tmp/before_zerofill$table_no.MAI") ||
          die("Can't copy $table.MAI to $tmp/before_zerofill$table_no.MAI\n");
        # Lexical command/output variables: the comparison result in $res is
        # no longer reused for command output, and no global $com is touched.
        my $com= "$maria_exe_path/aria_chk$suffix -ss --zerofill-keep-lsn --skip-update-state $table";
        my $output= `$com`;
        print MY_LOG $output;
        $table_no= $table_no + 1;
      }
      $res= File::Compare::compare($file1, $file2);
      die("Can't compare $file1 and $file2\n") if ($res == -1);
    }
    $ret_text.= "$file1 and $file2 differ\n" if ($res != 0);
  }
  if ($zerofilled)
  {
    my $table_no= 1;
    foreach my $table ($table1, $table2)
    {
      move("$tmp/before_zerofill$table_no.MAD", "$table.MAD") ||
        die("Can't restore $table.MAD\n");
      move("$tmp/before_zerofill$table_no.MAI", "$table.MAI") ||
        die("Can't restore $table.MAI\n");
      $table_no= $table_no + 1;
    }
  }
  return $ret_text;
}


# Runs $command through the shell and returns its standard output.
# With --verbose the command is echoed first; with --abort-on-error the
# script exits with status 1 when the command fails.
sub my_exec
{
  my($command)= @_;
  my $res;
  if ($opt_verbose)
  {
    print "$command\n";
  }
  $res= `$command`;
  if ($? != 0 && $opt_abort_on_error)
  {
    exit(1);
  }
  return $res;
}

# Like my_exec, but never echoes the command up front; with
# --abort-on-error a failing command is printed together with its status
# before the script exits with status 1.
sub my_exec2
{
  my($command)= @_;
  # "my $res, $err;" declared only $res; the parentheses make both lexical.
  my ($res, $err);
  $res= `$command`;
  if ($? != 0 && $opt_abort_on_error)
  {
    $err= $?;
    print "$command\n";
    print "failed with error: $err\n";
    exit(1);
  }
  return $res;
}
+ + +#### +#### usage +#### + +sub usage +{ + print <<EOF; +$my_progname version $VER + +Description: + +Run various Aria recovery tests and print the results + +Options +--help Show this help and exit.
+ +--abort-on-error Abort at once in case of error. +--verbose Show commands while there are executing. +--version Show version number and exit. + +EOF + exit(0); +} diff --git a/storage/maria/unittest/sequence_storage.c b/storage/maria/unittest/sequence_storage.c new file mode 100644 index 00000000..1e6b3fcb --- /dev/null +++ b/storage/maria/unittest/sequence_storage.c @@ -0,0 +1,111 @@ +/* Copyright (C) 2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include "../maria_def.h" +#include "sequence_storage.h" + + +/** + @brief Initializes the sequence from the sequence file. + + @param seq Reference on the sequence storage. + @param file Path to the file where to write the sequence + + @retval 0 OK + @retval 1 Error +*/ + +my_bool seq_storage_reader_init(SEQ_STORAGE *seq, const char *file) +{ + FILE *fd; + seq->pos= 0; + if ((fd= my_fopen(file, O_RDONLY, MYF(MY_WME))) == NULL) + return 1; + if (my_init_dynamic_array(PSI_NOT_INSTRUMENTED, &seq->seq, sizeof(ulong), 10, + 10, MYF(0))) + return 1; + + for(;;) + { + ulong num; + char line[22]; + if (fgets(line, sizeof(line), fd) == NULL) + break; + num= atol(line); + if (insert_dynamic(&seq->seq, (uchar*) &num)) + return 1; + } + fclose(fd); + return 0; +} + + +/** + @brief Gets next number from the sequence storage + + @param seq Reference on the sequence storage. 
+ + @return Next number from the sequence. +*/ + +ulong seq_storage_next(SEQ_STORAGE *seq) +{ + DBUG_ASSERT(seq->seq.elements > 0); + DBUG_ASSERT(seq->pos < seq->seq.elements); + return (*(dynamic_element(&seq->seq, seq->pos++, ulong *))); +} + + +/** + @brief Frees resources allocated for the storage + + @param seq Reference on the sequence storage. +*/ + +void seq_storage_destroy(SEQ_STORAGE *seq) +{ + delete_dynamic(&seq->seq); +} + + +/** + @brief Starts the sequence from beginning + + @param seq Reference on the sequence storage. +*/ + +void seq_storage_rewind(SEQ_STORAGE *seq) +{ + seq->pos= 0; +} + +/** + @brief Writes a number to the sequence file. + + @param file Path to the file where to write the sequence + @pagem num Number to be written + + @retval 0 OK + @retval 1 Error +*/ + +my_bool seq_storage_write(const char *file, ulong num) +{ + FILE *fd; + return ((fd= my_fopen(file, O_CREAT | O_APPEND | O_WRONLY, MYF(MY_WME))) == + NULL || + fprintf(fd, "%lu\n", num) < 0 || + fclose(fd) != 0); +} diff --git a/storage/maria/unittest/sequence_storage.h b/storage/maria/unittest/sequence_storage.h new file mode 100644 index 00000000..17490256 --- /dev/null +++ b/storage/maria/unittest/sequence_storage.h @@ -0,0 +1,28 @@ +/* Copyright (C) 2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + + +typedef struct st_seq_storage +{ + uint pos; + DYNAMIC_ARRAY seq; +} SEQ_STORAGE; + +extern my_bool seq_storage_reader_init(SEQ_STORAGE *seq, const char *file); +extern ulong seq_storage_next(SEQ_STORAGE *seq); +extern void seq_storage_destroy(SEQ_STORAGE *seq); +extern void seq_storage_rewind(SEQ_STORAGE *seq); +extern my_bool seq_storage_write(const char *file, ulong num); + diff --git a/storage/maria/unittest/test_file.c b/storage/maria/unittest/test_file.c new file mode 100644 index 00000000..853f5352 --- /dev/null +++ b/storage/maria/unittest/test_file.c @@ -0,0 +1,118 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include <tap.h> /* Includes my_global.h */ +#include <my_sys.h> +#include <my_dir.h> +#include "test_file.h" + + +/* + Check that file contance correspond to descriptor + + SYNOPSIS + test_file() + file File to test + file_name Path (and name) of file which is tested + size size of file + buff_size size of buffer which is enought to check the file + desc file descriptor to check with + + RETURN + 1 file if OK + 0 error +*/ + +int test_file(PAGECACHE_FILE file, char *file_name, + off_t size, size_t buff_size, struct file_desc *desc) +{ + unsigned char *buffr= my_malloc(PSI_NOT_INSTRUMENTED, buff_size, MYF(0)); + off_t pos= 0; + size_t byte; + int step= 0; + int res= 1; /* ok */ + +#ifdef _WIN32 + /* + On Windows, the info returned by stat(), specifically file length + is not necessarily current, because this is the behavior of + underlying FindFirstFile() function. 
+ */ + WIN32_FILE_ATTRIBUTE_DATA file_attr; + LARGE_INTEGER li; + if(GetFileAttributesEx(file_name, GetFileExInfoStandard, &file_attr) == 0) + { + diag("Can't GetFileAttributesEx %s (errno: %lu)\n", file_name, + GetLastError()); + res= 0; + goto err; + } + li.HighPart= file_attr.nFileSizeHigh; + li.LowPart= file_attr.nFileSizeLow; + if(li.QuadPart != size) + { + diag("file %s size is %llu (should be %llu)\n", + file_name, (ulonglong)size, (ulonglong)li.QuadPart); + res= 0; /* failed */ + /* continue to get more information */ + } +#else + MY_STAT stat_buff, *stat; + if ((stat= my_stat(file_name, &stat_buff, MYF(0))) == NULL) + { + diag("Can't stat() %s (errno: %d)\n", file_name, errno); + res= 0; + goto err; + } + if (stat->st_size != size) + { + diag("file %s size is %lu (should be %lu)\n", + file_name, (ulong) stat->st_size, (ulong) size); + res= 0; /* failed */ + /* continue to get more information */ + } +#endif + + /* check content */ + my_seek(file.file, 0, SEEK_SET, MYF(MY_WME)); + while (desc[step].length != 0) + { + if (my_read(file.file, buffr, desc[step].length, MYF(0)) != + desc[step].length) + { + diag("Can't read %u bytes from %s (file: %d errno: %d)\n", + (uint)desc[step].length, file_name, file.file, errno); + res= 0; + goto err; + } + for (byte= 0; byte < desc[step].length; byte++) + { + if (buffr[byte] != desc[step].content) + { + diag("content of %s mismatch 0x%x in position %lu instead of 0x%x\n", + file_name, (uint) buffr[byte], (ulong) (pos + byte), + desc[step].content); + res= 0; + goto err; + } + } + pos+= desc[step].length; + step++; + } + +err: + my_free(buffr); + return res; +} diff --git a/storage/maria/unittest/test_file.h b/storage/maria/unittest/test_file.h new file mode 100644 index 00000000..22337b7a --- /dev/null +++ b/storage/maria/unittest/test_file.h @@ -0,0 +1,31 @@ +/* Copyright (C) 2006-2008 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public 
License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include <m_string.h> +#include "../ma_pagecache.h" +#ifdef _WIN32 +#include <direct.h> +#endif +/* + File content descriptor +*/ +struct file_desc +{ + unsigned int length; + unsigned char content; +}; + +int test_file(PAGECACHE_FILE file, char *file_name, + off_t size, size_t buff_size, struct file_desc *desc); diff --git a/storage/maria/unittest/trnman-t.c b/storage/maria/unittest/trnman-t.c new file mode 100644 index 00000000..43bca725 --- /dev/null +++ b/storage/maria/unittest/trnman-t.c @@ -0,0 +1,172 @@ +/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include <tap.h> + +#include <my_global.h> +#include <my_sys.h> +#include <lf.h> +#include <m_string.h> +#include "../trnman.h" + +pthread_mutex_t rt_mutex; +pthread_attr_t attr; +size_t stacksize= 0; +#define STACK_SIZE (((int)stacksize-2048)*STACK_DIRECTION) + +int rt_num_threads; +int litmus; + +/* + create and end (commit or rollback) transactions randomly +*/ +#define MAX_ITER 100 +pthread_handler_t test_trnman(void *arg) +{ + uint x, y, i, n; + TRN *trn[MAX_ITER]; + int m= (*(int *)arg); + + if (my_thread_init()) + BAIL_OUT("my_thread_init failed!"); + + for (x= ((int)(intptr)(&m)); m > 0; ) + { + y= x= (x*3628273133LL + 1500450271LL) % 9576890767LL; /* three prime numbers */ + m-= n= x % MAX_ITER; + for (i= 0; i < n; i++) + { + trn[i]= trnman_new_trn(0); + if (!trn[i]) + { + diag("trnman_new_trn() failed"); + litmus++; + } + } + for (i= 0; i < n; i++) + { + y= (y*19 + 7) % 31; + trnman_end_trn(trn[i], y & 1); + } + } + pthread_mutex_lock(&rt_mutex); + rt_num_threads--; + pthread_mutex_unlock(&rt_mutex); + + my_thread_end(); + + return 0; +} +#undef MAX_ITER + +void run_test(const char *test, pthread_handler handler, int n, int m) +{ + pthread_t *threads; + ulonglong now= microsecond_interval_timer(); + int i; + + litmus= 0; + + threads= (pthread_t *)my_malloc(PSI_NOT_INSTRUMENTED, sizeof(void *)*n, MYF(0)); + if (!threads) + { + diag("Out of memory"); + abort(); + } + + diag("Testing %s with %d threads, %d iterations... 
", test, n, m); + rt_num_threads= n; + for (i= 0; i < n ; i++) + if (pthread_create(threads+i, &attr, handler, &m)) + { + diag("Could not create thread"); + abort(); + } + for (i= 0 ; i < n ; i++) + pthread_join(threads[i], 0); + now= microsecond_interval_timer() - now; + ok(litmus == 0, "Tested %s in %g secs (%d)", test, ((double)now)/1e6, litmus); + my_free(threads); +} + +#define ok_read_from(T1, T2, RES) \ + i= trnman_can_read_from(trn[T1], trid[T2]); \ + ok(i == RES, "trn" #T1 " %s read from trn" #T2, i ? "can" : "cannot") +#define start_transaction(T) \ + trn[T]= trnman_new_trn(0); \ + trid[T]= trn[T]->trid +#define commit(T) trnman_commit_trn(trn[T]) +#define abort(T) trnman_abort_trn(trn[T]) + +#define Ntrns 4 +void test_trnman_read_from() +{ + TRN *trn[Ntrns]; + TrID trid[Ntrns]; + int i; + + start_transaction(0); /* start trn1 */ + start_transaction(1); /* start trn2 */ + ok_read_from(1, 0, 0); + commit(0); /* commit trn1 */ + start_transaction(2); /* start trn4 */ + abort(2); /* abort trn4 */ + start_transaction(3); /* start trn5 */ + ok_read_from(3, 0, 1); + ok_read_from(3, 1, 0); + ok_read_from(3, 2, 0); + ok_read_from(3, 3, 1); + commit(1); /* commit trn2 */ + ok_read_from(3, 1, 0); + commit(3); /* commit trn5 */ + +} + +int main(int argc __attribute__((unused)), char **argv) +{ + MY_INIT(argv[0]); + + plan(7); + + pthread_mutex_init(&rt_mutex, 0); + pthread_attr_init(&attr); +#ifdef HAVE_PTHREAD_ATTR_GETSTACKSIZE + pthread_attr_getstacksize(&attr, &stacksize); + if (stacksize == 0) +#endif + stacksize= PTHREAD_STACK_MIN; + +#define CYCLES 10000 +#define THREADS 10 + + trnman_init(0); + + test_trnman_read_from(); + run_test("trnman", test_trnman, THREADS, CYCLES); + + diag("mallocs: %d", trnman_allocated_transactions); + { + ulonglong now= microsecond_interval_timer(); + trnman_destroy(); + now= microsecond_interval_timer() - now; + diag("trnman_destroy: %g", ((double)now)/1e6); + } + + pthread_mutex_destroy(&rt_mutex); + my_end(0); + return 
exit_status(); +} + +#include "../ma_check_standalone.h" |