diff options
Diffstat (limited to 'src/manconv.c')
-rw-r--r-- | src/manconv.c | 352 |
1 files changed, 352 insertions, 0 deletions
diff --git a/src/manconv.c b/src/manconv.c new file mode 100644 index 0000000..9759a28 --- /dev/null +++ b/src/manconv.c @@ -0,0 +1,352 @@ +/* + * manconv.c: convert manual page from one encoding to another + * + * Copyright (C) 2007, 2008, 2009, 2010, 2012 Colin Watson. + * Based loosely on parts of glibc's iconv_prog.c, which is: + * Copyright (C) 1998-2004, 2005, 2006, 2007 Free Software Foundation, Inc. + * + * This file is part of man-db. + * + * man-db is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * man-db is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with man-db; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* This program arose during a discussion with Adam Borowski. See: + * https://lists.debian.org/debian-mentors/2007/09/msg00245.html + * It behaves like iconv, but allows multiple source encodings and + * attempts to guess the first one that works. An Emacs-style + * "-*- coding:" declaration overrides this. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <unistd.h> + +#ifdef HAVE_ICONV +# include <iconv.h> +#endif /* HAVE_ICONV */ + +#include "argp.h" + +#include "gettext.h" +#include <locale.h> +#define _(String) gettext (String) + +#include "manconfig.h" + +#include "error.h" +#include "pipeline.h" +#include "encodings.h" + +#include "manconv.h" + +#ifdef HAVE_ICONV + +/* When converting text containing an invalid multibyte sequence to + * UTF-8//IGNORE, GNU libc's iconv returns EILSEQ but sets *inbuf to the end + * of the input buffer. I'm not sure whether this is a bug or not (it seems + * to contradict the documentation), but work around it anyway by recoding + * to UTF-8 so that we can accurately position the error. + */ +static off_t locate_error (const char *try_from_code, + const char *input, size_t input_size, + char *utf8, size_t utf8_size) +{ + iconv_t cd_utf8_strict; + char *inptr = (char *) input, *utf8ptr = utf8; + size_t inleft = input_size, utf8left = utf8_size; + size_t n; + off_t ret; + + cd_utf8_strict = iconv_open ("UTF-8", try_from_code); + if (cd_utf8_strict == (iconv_t) -1) { + error (0, errno, "iconv_open (\"UTF-8\", \"%s\")", + try_from_code); + return 0; + } + + n = iconv (cd_utf8_strict, (ICONV_CONST char **) &inptr, &inleft, + &utf8ptr, &utf8left); + if (n == (size_t) -1) + ret = inptr - input; + else + ret = 0; + + iconv_close (cd_utf8_strict); + + return ret; +} + +static int try_iconv (pipeline *p, const char *try_from_code, const char *to, + int last) +{ + char *try_to_code = xstrdup (to); + static const size_t buf_size = 65536; + size_t input_size = buf_size; + off_t input_pos = 0; + const char *input; + static char *utf8 = NULL, *output = NULL; + size_t utf8left = 0; + iconv_t cd_utf8, cd = NULL; + int to_utf8 = STREQ (try_to_code, "UTF-8") || + STRNEQ (try_to_code, "UTF-8//", 7); + const char *utf8_target = last ? "UTF-8//IGNORE" : "UTF-8"; + int ignore_errors = (strstr (try_to_code, "//IGNORE") != NULL);; + int ret = 0; + + debug ("trying encoding %s -> %s\n", try_from_code, try_to_code); + + cd_utf8 = iconv_open (utf8_target, try_from_code); + if (cd_utf8 == (iconv_t) -1) { + error (0, errno, "iconv_open (\"%s\", \"%s\")", + utf8_target, try_from_code); + free (try_to_code); + return -1; + } + + if (!to_utf8) { + cd = iconv_open (try_to_code, "UTF-8"); + if (cd == (iconv_t) -1) { + error (0, errno, "iconv_open (\"%s\", \"UTF-8\")", + try_to_code); + free (try_to_code); + return -1; + } + } + + input = pipeline_peek (p, &input_size); + if (input_size < buf_size) { + /* End of file, error, or just a short read? Repeat until we + * have either a full buffer or EOF/error. + */ + while (input_size < buf_size) { + size_t old_input_size = input_size; + input_size = buf_size; + input = pipeline_peek (p, &input_size); + if (input_size == old_input_size) + break; + } + } + + if (!utf8) + utf8 = xmalloc (buf_size); + if (!output) + output = xmalloc (buf_size); + + while (input_size || utf8left) { + int handle_iconv_errors = 0; + char *inptr = (char *) input, *utf8ptr = utf8; + char *outptr = output; + size_t inleft = input_size, outleft; + size_t n, n2 = -1; + + if (!utf8left) { + /* First, convert the text to UTF-8. By assumption, + * all validly-encoded text can be converted to + * UTF-8 assuming that we picked the correct + * encoding. Any errors at this stage are due to + * selecting an incorrect encoding, or due to + * misencoded source text. + */ + utf8left = buf_size; + n = iconv (cd_utf8, (ICONV_CONST char **) &inptr, + &inleft, &utf8ptr, &utf8left); + utf8left = buf_size - utf8left; + + /* If we need to try the next encoding, do that + * before writing anything. + */ + if (!last && n == (size_t) -1 && + (errno == EILSEQ || + (errno == EINVAL && input_size < buf_size))) { + ret = -1; + break; + } else if (n == (size_t) -1) + handle_iconv_errors = errno; + } + + /* If the target encoding is UTF-8 (the common case), then + * we can just write out what we've got. Otherwise, we need + * to convert to the target encoding. Any errors at this + * stage are due to characters that are not representable in + * the target encoding. + */ + if (handle_iconv_errors) + /* Fall back to error handling below. If we have + * anything to write out, we'll do it next time + * round the loop. + */ + ; + else if (to_utf8) { + memcpy (output, utf8, utf8left); + outptr += utf8left; + outleft = utf8left; + utf8left = 0; + } else if (utf8left) { + outptr = output; + outleft = buf_size; + utf8ptr = utf8; + n2 = iconv ( + cd, (ICONV_CONST char **) &utf8ptr, &utf8left, + &outptr, &outleft); + outleft = buf_size - outleft; + if (n2 == (size_t) -1) + handle_iconv_errors = errno; + + if (n2 == (size_t) -1 && + errno == EILSEQ && ignore_errors) + errno = 0; + } else + /* We appear to have converted some input text, but + * not actually ended up with any UTF-8 text. This + * is odd. However, we can at least continue round + * the loop, skip the input text we converted, and + * then we should get a different result next time. + */ + outptr = output; + + if (outptr != output) { + /* We have something to write out. */ + int errno_save = errno; + size_t w; + w = fwrite (output, 1, outleft, stdout); + if (w < (size_t) outleft || ferror (stdout)) + error (FATAL, 0, _("can't write to " + "standard output")); + errno = errno_save; + } + + if (!to_utf8 && n2 != (size_t) -1) { + /* All the UTF-8 text we have so far was processed. + * For state-dependent character sets we have to + * flush the state now. + */ + outptr = output; + outleft = buf_size; + iconv (cd, NULL, NULL, &outptr, &outleft); + outleft = buf_size - outleft; + + if (outptr != output) { + /* We have something to write out. */ + int errno_save = errno; + size_t w; + w = fwrite (output, 1, outleft, stdout); + if (w < (size_t) outleft || ferror (stdout)) + error (FATAL, 0, _("can't write to " + "standard output")); + errno = errno_save; + } + } else if (handle_iconv_errors) { + intmax_t error_pos; + + if (handle_iconv_errors == EILSEQ && !ignore_errors) { + if (!quiet) { + error_pos = input_pos + locate_error ( + try_from_code, + input, input_size, + utf8, buf_size); + error (0, handle_iconv_errors, + "byte %jd: iconv", error_pos); + } + exit (FATAL); + } else if (handle_iconv_errors == EINVAL && + input_size < buf_size) { + if (!quiet) { + error_pos = input_pos + locate_error ( + try_from_code, + input, input_size, + utf8, buf_size); + error (FATAL, 0, "byte %jd: %s", + error_pos, + _("iconv: incomplete character " + "at end of buffer")); + } + exit (FATAL); + } + } + + if (inptr != input) { + pipeline_peek_skip (p, input_size - inleft); + input_pos += input_size - inleft; + } + + /* Unless we have some UTF-8 text left (which will only + * happen if the output encoding is more verbose than UTF-8, + * so is unlikely for legacy encodings), we need to fetch + * more input text now. + */ + if (!utf8left) { + input_size = buf_size; + input = pipeline_peek (p, &input_size); + while (input_size < buf_size) { + size_t old_input_size = input_size; + input_size = buf_size; + input = pipeline_peek (p, &input_size); + if (input_size == old_input_size) + break; + } + } + } + + if (!to_utf8) + iconv_close (cd); + iconv_close (cd_utf8); + free (try_to_code); + + return ret; +} + +void manconv (pipeline *p, char * const *from, const char *to) +{ + char *pp_encoding; + char * const *try_from_code; + + pp_encoding = check_preprocessor_encoding (p); + if (pp_encoding) { + try_iconv (p, pp_encoding, to, 1); + free (pp_encoding); + } else { + for (try_from_code = from; *try_from_code; ++try_from_code) + if (try_iconv (p, *try_from_code, to, + !*(try_from_code + 1)) == 0) + break; + } +} + +#else /* !HAVE_ICONV */ + +/* If we don't have iconv, there isn't much we can do; just pass everything + * through unchanged. + */ +void manconv (pipeline *p, char * const *from ATTRIBUTE_UNUSED, + const char *to ATTRIBUTE_UNUSED) +{ + for (;;) { + size_t len = 4096; + const char *buffer = pipeline_read (p, &len); + if (len == 0) + break; + if (fwrite (buffer, 1, len, stdout) < len || ferror (stdout)) + error (FATAL, 0, _("can't write to standard output")); + } +} + +#endif /* HAVE_ICONV */ |