summaryrefslogtreecommitdiffstats
path: root/client/mariadb-conv.cc
diff options
context:
space:
mode:
Diffstat (limited to 'client/mariadb-conv.cc')
-rw-r--r--client/mariadb-conv.cc484
1 files changed, 484 insertions, 0 deletions
diff --git a/client/mariadb-conv.cc b/client/mariadb-conv.cc
new file mode 100644
index 00000000..1774debe
--- /dev/null
+++ b/client/mariadb-conv.cc
@@ -0,0 +1,484 @@
+/*
+ Copyright (c) 2001, 2013, Oracle and/or its affiliates.
+ Copyright (c) 2010, 2019, MariaDB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+/*
+ Character set conversion utility
+*/
+
+#include "mariadb.h"
+#include "client_priv.h"
+#include "sql_string.h"
+#include "my_dir.h"
+
+#define CONV_VERSION "1.0"
+
+
+class CmdOpt
+{
+public:
+ const char *m_charset_from;
+ const char *m_charset_to;
+ const char *m_delimiter;
+ my_bool m_continue;
+ CmdOpt()
+ :m_charset_from("latin1"),
+ m_charset_to("latin1"),
+ m_delimiter(NULL),
+ m_continue(FALSE)
+ { }
+ static CHARSET_INFO *csinfo_by_name(const char *csname)
+ {
+ return get_charset_by_csname(csname, MY_CS_PRIMARY, MYF(MY_UTF8_IS_UTF8MB3));
+ }
+ CHARSET_INFO *csinfo_from() const
+ {
+ return m_charset_from ? csinfo_by_name(m_charset_from) : NULL;
+ }
+ CHARSET_INFO *csinfo_to() const
+ {
+ return m_charset_to ? csinfo_by_name(m_charset_to) : NULL;
+ }
+};
+
+
+static CmdOpt opt;
+
+
+static struct my_option long_options[] =
+{
+ {"from", 'f', "Specifies the encoding of the input.", &opt.m_charset_from,
+ &opt.m_charset_from, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"to", 't', "Specifies the encoding of the output.", &opt.m_charset_to,
+ &opt.m_charset_to, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"continue", 'c', "Silently ignore conversion errors.",
+ &opt.m_continue, &opt.m_continue, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"delimiter", 0, "Treat the specified characters as delimiters.",
+ &opt.m_delimiter, &opt.m_delimiter, 0, GET_STR, REQUIRED_ARG,
+ 0, 0, 0, 0, 0, 0},
+ {"character-sets-dir", OPT_CHARSETS_DIR,
+ "Directory for character set files.", &charsets_dir,
+ &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+my_bool
+get_one_option(const struct my_option *opt,
+ const char *value, const char *filename)
+{
+ return 0;
+}
+
+
+class File_buffer: public Binary_string
+{
+public:
+ bool load_binary_stream(FILE *file);
+ bool load_binary_file_by_name(const char *file);
+};
+
+
+/*
+ Load data from a binary stream whose length is not known in advance,
+ e.g. from stdin.
+*/
+bool File_buffer::load_binary_stream(FILE *file)
+{
+ for ( ; ; )
+ {
+ char buf[1024];
+ if (length() + sizeof(buf) > UINT_MAX32 || reserve(sizeof(buf)))
+ {
+ fprintf(stderr, "Input data is too large\n");
+ return true;
+ }
+ size_t nbytes= my_fread(file, (uchar *) end(), sizeof(buf), MYF(0));
+ if (!nbytes || nbytes == (size_t) -1)
+ return false;
+ str_length+= (uint32) nbytes;
+ }
+ return false;
+}
+
+
+/*
+ Load data from a file by name.
+ The file size is know.
+*/
+bool File_buffer::load_binary_file_by_name(const char *filename)
+{
+ MY_STAT sbuf;
+ File fd;
+
+ if (!my_stat(filename, &sbuf, MYF(0)))
+ {
+ fprintf(stderr, "my_stat failed for '%s'\n", filename);
+ return true;
+ }
+
+ if (!MY_S_ISREG(sbuf.st_mode))
+ {
+ fprintf(stderr, "'%s' is not a regular file\n", filename);
+ return true;
+ }
+
+ if ((size_t) sbuf.st_size > UINT_MAX32)
+ {
+ fprintf(stderr, "File '%s' is too large\n", filename);
+ return true;
+ }
+
+ if (alloc((uint32) sbuf.st_size))
+ {
+ fprintf(stderr, "Failed to allocate read buffer\n");
+ return true;
+ }
+
+ if ((fd= my_open(filename, O_RDONLY, MYF(0))) == -1)
+ {
+ fprintf(stderr, "Could not open '%s'\n", filename);
+ return true;
+ }
+
+ size_t nbytes= my_read(fd, (uchar*) Ptr, (size_t)sbuf.st_size, MYF(0));
+ my_close(fd, MYF(0));
+ length((uint32) nbytes);
+
+ return false;
+}
+
+
+class Delimiter
+{
+protected:
+ bool m_delimiter[127];
+ bool m_has_delimiter_cached;
+ bool has_delimiter_slow() const
+ {
+ for (size_t i= 0; i < sizeof(m_delimiter); i++)
+ {
+ if (m_delimiter[i])
+ return true;
+ }
+ return false;
+ }
+ bool unescape(char *to, char from) const
+ {
+ switch (from) {
+ case '\\': *to= '\\'; return false;
+ case 'r': *to= '\r'; return false;
+ case 'n': *to= '\n'; return false;
+ case 't': *to= '\t'; return false;
+ case '0': *to= '\0'; return false;
+ }
+ *to= '\0';
+ return true;
+ }
+ bool is_delimiter(char ch) const
+ {
+ return (signed char) ch < 0 ? false : m_delimiter[(uint32) ch];
+ }
+public:
+ Delimiter()
+ :m_has_delimiter_cached(false)
+ {
+ bzero(&m_delimiter, sizeof(m_delimiter));
+ }
+ bool has_delimiter() const
+ {
+ return m_has_delimiter_cached;
+ }
+ bool set_delimiter_unescape(const char *str)
+ {
+ m_has_delimiter_cached= false;
+ for ( ; *str; str++)
+ {
+ if ((signed char) *str < 0)
+ return true;
+ if (*str == '\\')
+ {
+ char unescaped;
+ str++;
+ if (!*str || unescape(&unescaped, *str))
+ return true;
+ m_delimiter[(uint) unescaped]= true;
+ }
+ else
+ m_delimiter[(uint) *str]= true;
+ }
+ m_has_delimiter_cached= has_delimiter_slow();
+ return false;
+ }
+ size_t get_delimiter_length(const char *str, const char *end) const
+ {
+ const char *str0= str;
+ for ( ; str < end; str++)
+ {
+ if (!is_delimiter(*str))
+ break;
+ }
+ return str - str0;
+ }
+ size_t get_data_length(const char *str, const char *end) const
+ {
+ const char *str0= str;
+ for ( ; str < end; str++)
+ {
+ if (is_delimiter(*str))
+ break;
+ }
+ return str - str0;
+ }
+};
+
+
+class Conv_inbuf
+{
+ const char *m_ptr;
+ const char *m_end;
+public:
+ Conv_inbuf(const char *from, size_t length)
+ :m_ptr(from), m_end(from + length)
+ { }
+ const char *ptr() const { return m_ptr; }
+ const char *end() const { return m_end; }
+ size_t length() const
+ {
+ return m_end - m_ptr;
+ }
+private:
+ LEX_CSTRING get_prefix(size_t len)
+ {
+ LEX_CSTRING res;
+ res.str= ptr();
+ res.length= len;
+ m_ptr+= len;
+ return res;
+ }
+ LEX_CSTRING get_empty_string() const
+ {
+ static LEX_CSTRING str= {NULL, 0};
+ return str;
+ }
+public:
+ LEX_CSTRING get_delimiter_chunk(const Delimiter &delimiter)
+ {
+ if (!delimiter.has_delimiter())
+ return get_empty_string();
+ size_t len= delimiter.get_delimiter_length(ptr(), end());
+ return get_prefix(len);
+ }
+ LEX_CSTRING get_data_chunk(const Delimiter &delimiter)
+ {
+ if (!delimiter.has_delimiter())
+ return get_prefix(length());
+ size_t len= delimiter.get_data_length(ptr(), end());
+ return get_prefix(len);
+ }
+};
+
+
+class Conv_outbuf: public Binary_string
+{
+public:
+ bool alloc(size_t out_max_length)
+ {
+ if (out_max_length >= UINT_MAX32)
+ {
+ fprintf(stderr, "The data needs a too large output buffer\n");
+ return true;
+ }
+ if (Binary_string::alloc((uint32) out_max_length))
+ {
+ fprintf(stderr, "Failed to allocate the output buffer\n");
+ return true;
+ }
+ return false;
+ }
+};
+
+
+class Conv: public String_copier, public Delimiter
+{
+ CHARSET_INFO *m_tocs;
+ CHARSET_INFO *m_fromcs;
+ bool m_continue;
+public:
+ Conv(CHARSET_INFO *tocs, CHARSET_INFO *fromcs, bool opt_continue)
+ :m_tocs(tocs), m_fromcs(fromcs), m_continue(opt_continue)
+ { }
+ size_t out_buffer_max_length(size_t from_length) const
+ {
+ return from_length / m_fromcs->mbminlen * m_tocs->mbmaxlen;
+ }
+ bool convert_data(const char *from, size_t length);
+ bool convert_binary_stream(FILE *file)
+ {
+ File_buffer buf;
+ return buf.load_binary_stream(file) ||
+ convert_data(buf.ptr(), buf.length());
+ }
+ bool convert_binary_file_by_name(const char *filename)
+ {
+ File_buffer buf;
+ return buf.load_binary_file_by_name(filename)||
+ convert_data(buf.ptr(), buf.length());
+ }
+private:
+ void report_error(const char *from) const
+ {
+ if (well_formed_error_pos())
+ {
+ fflush(stdout);
+ fprintf(stderr,
+ "Illegal %s byte sequence at position %d\n",
+ m_fromcs->cs_name.str,
+ (uint) (well_formed_error_pos() - from));
+ }
+ else if (cannot_convert_error_pos())
+ {
+ fflush(stdout);
+ fprintf(stderr,
+ "Conversion from %s to %s failed at position %d\n",
+ m_fromcs->cs_name.str, m_tocs->cs_name.str,
+ (uint) (cannot_convert_error_pos() - from));
+ }
+ }
+ size_t write(const char *str, size_t length) const
+ {
+ return my_fwrite(stdout, (uchar *) str, length, MY_WME);
+ }
+};
+
+
+bool Conv::convert_data(const char *from, size_t from_length)
+{
+ Conv_inbuf inbuf(from, from_length);
+ Conv_outbuf outbuf;
+
+ if (outbuf.alloc(out_buffer_max_length(from_length)))
+ return true;
+
+ for ( ; ; )
+ {
+ LEX_CSTRING delim, data;
+
+ delim= inbuf.get_delimiter_chunk(*this);
+ if (delim.length)
+ write(delim.str, delim.length);
+
+ data= inbuf.get_data_chunk(*this);
+ if (!data.length)
+ break;
+ size_t length= well_formed_copy(m_tocs,
+ (char *) outbuf.ptr(),
+ outbuf.alloced_length(),
+ m_fromcs, data.str, data.length);
+ outbuf.length((uint32) length);
+
+ if (most_important_error_pos() && !m_continue)
+ {
+ report_error(from);
+ return true;
+ }
+ write(outbuf.ptr(), outbuf.length());
+ }
+ return false;
+}
+
+
+class Session
+{
+public:
+ Session(const char *prog)
+ {
+ MY_INIT(prog);
+ }
+ ~Session()
+ {
+ my_end(0);
+ }
+ void usage(void)
+ {
+ printf("%s Ver %s Distrib %s for %s on %s\n", my_progname, CONV_VERSION,
+ MYSQL_SERVER_VERSION, SYSTEM_TYPE, MACHINE_TYPE);
+ puts("Character set conversion utility for MariaDB");
+ puts("Usage:");
+ printf("%s [OPTION...] [FILE...]\n", my_progname);
+ my_print_help(long_options);
+ }
+};
+
+
+int main(int argc, char *argv[])
+{
+ Session session(argv[0]);
+ CHARSET_INFO *charset_info_from= NULL;
+ CHARSET_INFO *charset_info_to= NULL;
+
+ if (handle_options(&argc, &argv, long_options, get_one_option))
+ {
+ session.usage();
+ return 1;
+ }
+
+ if (!(charset_info_from= opt.csinfo_from()))
+ {
+ fprintf(stderr, "Character set %s is not supported\n", opt.m_charset_from);
+ return 1;
+ }
+
+ if (!(charset_info_to= opt.csinfo_to()))
+ {
+ fprintf(stderr, "Character set %s is not supported\n", opt.m_charset_to);
+ return 1;
+ }
+
+ Conv conv(charset_info_to, charset_info_from, opt.m_continue);
+ if (opt.m_delimiter)
+ {
+ if (charset_info_from->mbminlen > 1 ||
+ charset_info_to->mbminlen > 1)
+ {
+ fprintf(stderr, "--delimiter cannot be used with %s to %s conversion\n",
+ charset_info_from->cs_name.str, charset_info_to->cs_name.str);
+ return 1;
+ }
+ if (conv.set_delimiter_unescape(opt.m_delimiter))
+ {
+ fprintf(stderr, "Bad --delimiter value\n");
+ return 1;
+ }
+ }
+
+ if (argc == 0)
+ {
+ if (conv.convert_binary_stream(stdin))
+ return 1;
+ }
+ else
+ {
+ for (int i= 0; i < argc; i++)
+ {
+ if (conv.convert_binary_file_by_name(argv[i]))
+ return 1;
+ }
+ }
+
+ return 0;
+} /* main */