diff options
Diffstat (limited to 'client/mariadb-conv.cc')
-rw-r--r-- | client/mariadb-conv.cc | 484 |
1 files changed, 484 insertions, 0 deletions
diff --git a/client/mariadb-conv.cc b/client/mariadb-conv.cc new file mode 100644 index 00000000..1774debe --- /dev/null +++ b/client/mariadb-conv.cc @@ -0,0 +1,484 @@ +/* + Copyright (c) 2001, 2013, Oracle and/or its affiliates. + Copyright (c) 2010, 2019, MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +/* + Character set conversion utility +*/ + +#include "mariadb.h" +#include "client_priv.h" +#include "sql_string.h" +#include "my_dir.h" + +#define CONV_VERSION "1.0" + + +class CmdOpt +{ +public: + const char *m_charset_from; + const char *m_charset_to; + const char *m_delimiter; + my_bool m_continue; + CmdOpt() + :m_charset_from("latin1"), + m_charset_to("latin1"), + m_delimiter(NULL), + m_continue(FALSE) + { } + static CHARSET_INFO *csinfo_by_name(const char *csname) + { + return get_charset_by_csname(csname, MY_CS_PRIMARY, MYF(MY_UTF8_IS_UTF8MB3)); + } + CHARSET_INFO *csinfo_from() const + { + return m_charset_from ? csinfo_by_name(m_charset_from) : NULL; + } + CHARSET_INFO *csinfo_to() const + { + return m_charset_to ? csinfo_by_name(m_charset_to) : NULL; + } +}; + + +static CmdOpt opt; + + +static struct my_option long_options[] = +{ + {"from", 'f', "Specifies the encoding of the input.", &opt.m_charset_from, + &opt.m_charset_from, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"to", 't', "Specifies the encoding of the output.", &opt.m_charset_to, + &opt.m_charset_to, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {"continue", 'c', "Silently ignore conversion errors.", + &opt.m_continue, &opt.m_continue, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + {"delimiter", 0, "Treat the specified characters as delimiters.", + &opt.m_delimiter, &opt.m_delimiter, 0, GET_STR, REQUIRED_ARG, + 0, 0, 0, 0, 0, 0}, + {"character-sets-dir", OPT_CHARSETS_DIR, + "Directory for character set files.", &charsets_dir, + &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} +}; + + +my_bool +get_one_option(const struct my_option *opt, + const char *value, const char *filename) +{ + return 0; +} + + +class File_buffer: public Binary_string +{ +public: + bool load_binary_stream(FILE *file); + bool load_binary_file_by_name(const char *file); +}; + + +/* + Load data from a binary stream whose length is not known in advance, + e.g. from stdin. +*/ +bool File_buffer::load_binary_stream(FILE *file) +{ + for ( ; ; ) + { + char buf[1024]; + if (length() + sizeof(buf) > UINT_MAX32 || reserve(sizeof(buf))) + { + fprintf(stderr, "Input data is too large\n"); + return true; + } + size_t nbytes= my_fread(file, (uchar *) end(), sizeof(buf), MYF(0)); + if (!nbytes || nbytes == (size_t) -1) + return false; + str_length+= (uint32) nbytes; + } + return false; +} + + +/* + Load data from a file by name. + The file size is know. +*/ +bool File_buffer::load_binary_file_by_name(const char *filename) +{ + MY_STAT sbuf; + File fd; + + if (!my_stat(filename, &sbuf, MYF(0))) + { + fprintf(stderr, "my_stat failed for '%s'\n", filename); + return true; + } + + if (!MY_S_ISREG(sbuf.st_mode)) + { + fprintf(stderr, "'%s' is not a regular file\n", filename); + return true; + } + + if ((size_t) sbuf.st_size > UINT_MAX32) + { + fprintf(stderr, "File '%s' is too large\n", filename); + return true; + } + + if (alloc((uint32) sbuf.st_size)) + { + fprintf(stderr, "Failed to allocate read buffer\n"); + return true; + } + + if ((fd= my_open(filename, O_RDONLY, MYF(0))) == -1) + { + fprintf(stderr, "Could not open '%s'\n", filename); + return true; + } + + size_t nbytes= my_read(fd, (uchar*) Ptr, (size_t)sbuf.st_size, MYF(0)); + my_close(fd, MYF(0)); + length((uint32) nbytes); + + return false; +} + + +class Delimiter +{ +protected: + bool m_delimiter[127]; + bool m_has_delimiter_cached; + bool has_delimiter_slow() const + { + for (size_t i= 0; i < sizeof(m_delimiter); i++) + { + if (m_delimiter[i]) + return true; + } + return false; + } + bool unescape(char *to, char from) const + { + switch (from) { + case '\\': *to= '\\'; return false; + case 'r': *to= '\r'; return false; + case 'n': *to= '\n'; return false; + case 't': *to= '\t'; return false; + case '0': *to= '\0'; return false; + } + *to= '\0'; + return true; + } + bool is_delimiter(char ch) const + { + return (signed char) ch < 0 ? false : m_delimiter[(uint32) ch]; + } +public: + Delimiter() + :m_has_delimiter_cached(false) + { + bzero(&m_delimiter, sizeof(m_delimiter)); + } + bool has_delimiter() const + { + return m_has_delimiter_cached; + } + bool set_delimiter_unescape(const char *str) + { + m_has_delimiter_cached= false; + for ( ; *str; str++) + { + if ((signed char) *str < 0) + return true; + if (*str == '\\') + { + char unescaped; + str++; + if (!*str || unescape(&unescaped, *str)) + return true; + m_delimiter[(uint) unescaped]= true; + } + else + m_delimiter[(uint) *str]= true; + } + m_has_delimiter_cached= has_delimiter_slow(); + return false; + } + size_t get_delimiter_length(const char *str, const char *end) const + { + const char *str0= str; + for ( ; str < end; str++) + { + if (!is_delimiter(*str)) + break; + } + return str - str0; + } + size_t get_data_length(const char *str, const char *end) const + { + const char *str0= str; + for ( ; str < end; str++) + { + if (is_delimiter(*str)) + break; + } + return str - str0; + } +}; + + +class Conv_inbuf +{ + const char *m_ptr; + const char *m_end; +public: + Conv_inbuf(const char *from, size_t length) + :m_ptr(from), m_end(from + length) + { } + const char *ptr() const { return m_ptr; } + const char *end() const { return m_end; } + size_t length() const + { + return m_end - m_ptr; + } +private: + LEX_CSTRING get_prefix(size_t len) + { + LEX_CSTRING res; + res.str= ptr(); + res.length= len; + m_ptr+= len; + return res; + } + LEX_CSTRING get_empty_string() const + { + static LEX_CSTRING str= {NULL, 0}; + return str; + } +public: + LEX_CSTRING get_delimiter_chunk(const Delimiter &delimiter) + { + if (!delimiter.has_delimiter()) + return get_empty_string(); + size_t len= delimiter.get_delimiter_length(ptr(), end()); + return get_prefix(len); + } + LEX_CSTRING get_data_chunk(const Delimiter &delimiter) + { + if (!delimiter.has_delimiter()) + return get_prefix(length()); + size_t len= delimiter.get_data_length(ptr(), end()); + return get_prefix(len); + } +}; + + +class Conv_outbuf: public Binary_string +{ +public: + bool alloc(size_t out_max_length) + { + if (out_max_length >= UINT_MAX32) + { + fprintf(stderr, "The data needs a too large output buffer\n"); + return true; + } + if (Binary_string::alloc((uint32) out_max_length)) + { + fprintf(stderr, "Failed to allocate the output buffer\n"); + return true; + } + return false; + } +}; + + +class Conv: public String_copier, public Delimiter +{ + CHARSET_INFO *m_tocs; + CHARSET_INFO *m_fromcs; + bool m_continue; +public: + Conv(CHARSET_INFO *tocs, CHARSET_INFO *fromcs, bool opt_continue) + :m_tocs(tocs), m_fromcs(fromcs), m_continue(opt_continue) + { } + size_t out_buffer_max_length(size_t from_length) const + { + return from_length / m_fromcs->mbminlen * m_tocs->mbmaxlen; + } + bool convert_data(const char *from, size_t length); + bool convert_binary_stream(FILE *file) + { + File_buffer buf; + return buf.load_binary_stream(file) || + convert_data(buf.ptr(), buf.length()); + } + bool convert_binary_file_by_name(const char *filename) + { + File_buffer buf; + return buf.load_binary_file_by_name(filename)|| + convert_data(buf.ptr(), buf.length()); + } +private: + void report_error(const char *from) const + { + if (well_formed_error_pos()) + { + fflush(stdout); + fprintf(stderr, + "Illegal %s byte sequence at position %d\n", + m_fromcs->cs_name.str, + (uint) (well_formed_error_pos() - from)); + } + else if (cannot_convert_error_pos()) + { + fflush(stdout); + fprintf(stderr, + "Conversion from %s to %s failed at position %d\n", + m_fromcs->cs_name.str, m_tocs->cs_name.str, + (uint) (cannot_convert_error_pos() - from)); + } + } + size_t write(const char *str, size_t length) const + { + return my_fwrite(stdout, (uchar *) str, length, MY_WME); + } +}; + + +bool Conv::convert_data(const char *from, size_t from_length) +{ + Conv_inbuf inbuf(from, from_length); + Conv_outbuf outbuf; + + if (outbuf.alloc(out_buffer_max_length(from_length))) + return true; + + for ( ; ; ) + { + LEX_CSTRING delim, data; + + delim= inbuf.get_delimiter_chunk(*this); + if (delim.length) + write(delim.str, delim.length); + + data= inbuf.get_data_chunk(*this); + if (!data.length) + break; + size_t length= well_formed_copy(m_tocs, + (char *) outbuf.ptr(), + outbuf.alloced_length(), + m_fromcs, data.str, data.length); + outbuf.length((uint32) length); + + if (most_important_error_pos() && !m_continue) + { + report_error(from); + return true; + } + write(outbuf.ptr(), outbuf.length()); + } + return false; +} + + +class Session +{ +public: + Session(const char *prog) + { + MY_INIT(prog); + } + ~Session() + { + my_end(0); + } + void usage(void) + { + printf("%s Ver %s Distrib %s for %s on %s\n", my_progname, CONV_VERSION, + MYSQL_SERVER_VERSION, SYSTEM_TYPE, MACHINE_TYPE); + puts("Character set conversion utility for MariaDB"); + puts("Usage:"); + printf("%s [OPTION...] [FILE...]\n", my_progname); + my_print_help(long_options); + } +}; + + +int main(int argc, char *argv[]) +{ + Session session(argv[0]); + CHARSET_INFO *charset_info_from= NULL; + CHARSET_INFO *charset_info_to= NULL; + + if (handle_options(&argc, &argv, long_options, get_one_option)) + { + session.usage(); + return 1; + } + + if (!(charset_info_from= opt.csinfo_from())) + { + fprintf(stderr, "Character set %s is not supported\n", opt.m_charset_from); + return 1; + } + + if (!(charset_info_to= opt.csinfo_to())) + { + fprintf(stderr, "Character set %s is not supported\n", opt.m_charset_to); + return 1; + } + + Conv conv(charset_info_to, charset_info_from, opt.m_continue); + if (opt.m_delimiter) + { + if (charset_info_from->mbminlen > 1 || + charset_info_to->mbminlen > 1) + { + fprintf(stderr, "--delimiter cannot be used with %s to %s conversion\n", + charset_info_from->cs_name.str, charset_info_to->cs_name.str); + return 1; + } + if (conv.set_delimiter_unescape(opt.m_delimiter)) + { + fprintf(stderr, "Bad --delimiter value\n"); + return 1; + } + } + + if (argc == 0) + { + if (conv.convert_binary_stream(stdin)) + return 1; + } + else + { + for (int i= 0; i < argc; i++) + { + if (conv.convert_binary_file_by_name(argv[i])) + return 1; + } + } + + return 0; +} /* main */ |