summaryrefslogtreecommitdiffstats
path: root/src/manconv.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:37:10 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:37:10 +0000
commitc9addba5cc770d2d231b34f6739f32c6be8690f1 (patch)
treec643da154a95a1d163137135050bb47858a1654e /src/manconv.c
parentInitial commit. (diff)
downloadman-db-c9addba5cc770d2d231b34f6739f32c6be8690f1.tar.xz
man-db-c9addba5cc770d2d231b34f6739f32c6be8690f1.zip
Adding upstream version 2.12.0.upstream/2.12.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/manconv.c')
-rw-r--r--src/manconv.c570
1 files changed, 570 insertions, 0 deletions
diff --git a/src/manconv.c b/src/manconv.c
new file mode 100644
index 0000000..e775b1b
--- /dev/null
+++ b/src/manconv.c
@@ -0,0 +1,570 @@
+/*
+ * manconv.c: convert manual page from one encoding to another
+ *
+ * Copyright (C) 2007, 2008, 2009, 2010, 2012 Colin Watson.
+ * Based loosely on parts of glibc's iconv_prog.c, which is:
+ * Copyright (C) 1998-2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+ *
+ * This file is part of man-db.
+ *
+ * man-db is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * man-db is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with man-db; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* This program arose during a discussion with Adam Borowski. See:
+ * https://lists.debian.org/debian-mentors/2007/09/msg00245.html
+ * It behaves like iconv, but allows multiple source encodings and
+ * attempts to guess the first one that works. An Emacs-style
+ * "-*- coding:" declaration overrides this.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <assert.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#ifdef HAVE_ICONV
+# include <iconv.h>
+#endif /* HAVE_ICONV */
+
+#include "argp.h"
+#include "attribute.h"
+#include "error.h"
+#include "gl_list.h"
+#include "xalloc.h"
+#include "xstrndup.h"
+#include "xvasprintf.h"
+
+#include "gettext.h"
+#include <locale.h>
+#define _(String) gettext (String)
+
+#include "manconfig.h"
+
+#include "debug.h"
+#include "fatal.h"
+#include "glcontainers.h"
+
+#include "decompress.h"
+#include "manconv.h"
+
+/* Encoding conversions from groff-1.20/src/preproc/preconv/preconv.cpp.
+ * I've only included those not already recognised by GNU libiconv.
+ */
+struct conversion_entry {
+ const char *from;
+ const char *to;
+};
+
+static struct conversion_entry conversion_table[] = {
+ { "chinese-big5", "Big5" },
+ { "chinese-euc", "GB2312" },
+ { "chinese-iso-8bit", "GB2312" },
+ { "cn-gb-2312", "GB2312" },
+ { "cp878", "KOI8-R" },
+ { "cyrillic-iso-8bit", "ISO-8859-5" },
+ { "cyrillic-koi8", "KOI8-R" },
+ { "euc-china", "GB2312" },
+ { "euc-japan", "EUC-JP" },
+ { "euc-japan-1990", "EUC-JP" },
+ { "euc-kr", "EUC-KR" },
+ { "greek-iso-8bit", "ISO-8859-7" },
+ { "iso-latin-1", "ISO-8859-1" },
+ { "iso-latin-2", "ISO-8859-2" },
+ { "iso-latin-5", "ISO-8859-9" },
+ { "iso-latin-7", "ISO-8859-13" },
+ { "iso-latin-9", "ISO-8859-15" },
+ { "japanese-iso-8bit", "EUC-JP" },
+ { "japanese-euc", "EUC-JP" },
+ { "jis8", "EUC-JP" },
+ { "korean-euc", "EUC-KR" },
+ { "korean-iso-8bit", "EUC-KR" },
+ { "latin-0", "ISO-8859-15" },
+ { "latin-1", "ISO-8859-1" },
+ { "latin-2", "ISO-8859-2" },
+ { "latin-5", "ISO-8859-9" },
+ { "latin-7", "ISO-8859-13" },
+ { "mule-utf-16", "UTF-16" },
+ { "mule-utf-16be", "UTF-16BE" },
+ { "mule-utf-16-be", "UTF-16BE" },
+ { "mule-utf-16be-with-signature", "UTF-16" },
+ { "mule-utf-16le", "UTF-16LE" },
+ { "mule-utf-16-le", "UTF-16LE" },
+ { "mule-utf-16le-with-signature", "UTF-16" },
+ { "mule-utf-8", "UTF-8" },
+ { "utf-16-be", "UTF-16BE" },
+ { "utf-16be-with-signature", "UTF-16" },
+ { "utf-16-be-with-signature", "UTF-16" },
+ { "utf-16-le", "UTF-16LE" },
+ { "utf-16le-with-signature", "UTF-16" },
+ { "utf-16-le-with-signature", "UTF-16" },
+ { NULL, NULL }
+};
+
+/* Convert Emacs-style coding tags to ones that libiconv understands. */
+static char *convert_encoding (char *encoding)
+{
+ size_t encoding_len = strlen (encoding);
+ const struct conversion_entry *entry;
+
+#define STRIP(s, l) do { \
+ if (encoding_len > (l) && \
+ !strcasecmp (encoding + encoding_len - (l), (s))) \
+ encoding[encoding_len - (l)] = '\0'; \
+} while (0)
+
+ STRIP ("-dos", 4);
+ STRIP ("-mac", 4);
+ STRIP ("-unix", 5);
+
+#undef STRIP
+
+ for (entry = conversion_table; entry->from; ++entry)
+ if (!strcasecmp (entry->from, encoding)) {
+ free (encoding);
+ return xstrdup (entry->to);
+ }
+
+ return encoding;
+}
+
+/* Inspect the first line of data from a decompressor for preprocessor
+ * encoding declarations.
+ *
+ * If to_encoding and modified_line are both non-NULL, and if the encoding
+ * declaration in the input does not match to_encoding, then return an
+ * encoding declaration line modified to refer to the given to_encoding in
+ * *modified_line. The caller should free *modified_line.
+ */
+char *check_preprocessor_encoding (decompress *decomp, const char *to_encoding,
+ char **modified_line)
+{
+ char *pp_encoding = NULL;
+ const char *line = decompress_peekline (decomp);
+ const char *directive = NULL, *directive_end = NULL, *pp_search = NULL;
+ size_t pp_encoding_len = 0;
+
+ /* Some people use .\" incorrectly. We allow it for encoding
+ * declarations but not for preprocessor declarations.
+ */
+ if (line &&
+ (STRNEQ (line, PP_COOKIE, 4) || STRNEQ (line, ".\\\" ", 4))) {
+ const char *newline = strchr (line, '\n');
+
+ directive = line + 4;
+ directive_end = newline ? newline : strchr (directive, '\0');
+ pp_search = memmem (directive, directive_end - directive,
+ "-*-", 3);
+ }
+
+ if (directive && pp_search) {
+ pp_search += 3;
+ while (pp_search && pp_search < directive_end && *pp_search) {
+ while (*pp_search == ' ')
+ ++pp_search;
+ if (STRNEQ (pp_search, "coding:", 7)) {
+ const char *pp_encoding_allow;
+ pp_search += 7;
+ while (*pp_search == ' ')
+ ++pp_search;
+ pp_encoding_allow = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789-_/:.()";
+ pp_encoding_len = strspn (pp_search,
+ pp_encoding_allow);
+ pp_encoding = xstrndup (pp_search,
+ pp_encoding_len);
+ pp_encoding = convert_encoding (pp_encoding);
+ debug ("preprocessor encoding: %s\n",
+ pp_encoding);
+ break;
+ } else {
+ pp_search = memchr (pp_search, ';',
+ directive_end - pp_search);
+ if (pp_search)
+ ++pp_search;
+ }
+ }
+ }
+
+ if (to_encoding && modified_line &&
+ pp_encoding && strcasecmp (pp_encoding, to_encoding)) {
+ assert (directive_end);
+ assert (pp_search);
+ *modified_line = xasprintf
+ ("%.*s%s%.*s\n",
+ (int) (pp_search - line), line,
+ to_encoding,
+ (int) (directive_end - (pp_search + pp_encoding_len)),
+ pp_search + pp_encoding_len);
+ }
+
+ return pp_encoding;
+}
+
+static int add_output (const char *inbuf, size_t inlen,
+ struct manconv_outbuf *outbuf)
+{
+ int ret = 0;
+
+ if (outbuf) {
+ if (outbuf->len + inlen >= outbuf->max)
+ fatal (0, "out of space in output buffer");
+ memcpy (outbuf->buf + outbuf->len, inbuf, inlen);
+ outbuf->len += inlen;
+ } else {
+ int errno_save = errno;
+ if (fwrite (inbuf, 1, inlen, stdout) < inlen ||
+ ferror (stdout)) {
+ error (0, 0, _("can't write to standard output"));
+ ret = -1;
+ }
+ errno = errno_save;
+ }
+
+ return ret;
+}
+
+#ifdef HAVE_ICONV
+
+/* When converting text containing an invalid multibyte sequence to
+ * UTF-8//IGNORE, GNU libc's iconv returns EILSEQ but sets *inbuf to the end
+ * of the input buffer. I'm not sure whether this is a bug or not (it seems
+ * to contradict the documentation), but work around it anyway by recoding
+ * to UTF-8 so that we can accurately position the error.
+ */
+static off_t locate_error (const char *try_from_code,
+ const char *input, size_t input_size,
+ char *utf8, size_t utf8_size)
+{
+ iconv_t cd_utf8_strict;
+ char *inptr = (char *) input, *utf8ptr = utf8;
+ size_t inleft = input_size, utf8left = utf8_size;
+ size_t n;
+ off_t ret;
+
+ cd_utf8_strict = iconv_open ("UTF-8", try_from_code);
+ if (cd_utf8_strict == (iconv_t) -1) {
+ error (0, errno, "iconv_open (\"UTF-8\", \"%s\")",
+ try_from_code);
+ return 0;
+ }
+
+ n = iconv (cd_utf8_strict, (ICONV_CONST char **) &inptr, &inleft,
+ &utf8ptr, &utf8left);
+ if (n == (size_t) -1)
+ ret = inptr - input;
+ else
+ ret = 0;
+
+ iconv_close (cd_utf8_strict);
+
+ return ret;
+}
+
+typedef enum {
+ TRIED_ICONV_OK = 0,
+ TRIED_ICONV_ERROR = -1, /* can continue with another encoding */
+ TRIED_ICONV_FATAL = -2 /* must give up */
+} tried_iconv;
+
+static tried_iconv try_iconv (decompress *decomp, const char *try_from_code,
+ const char *to, bool last,
+ struct manconv_outbuf *outbuf)
+{
+ char *try_to_code = xstrdup (to);
+ static const size_t buf_size = 65536;
+ size_t input_size = buf_size;
+ off_t input_pos = 0;
+ const char *input;
+ static char *utf8 = NULL, *output = NULL;
+ size_t utf8left = 0;
+ iconv_t cd_utf8, cd = NULL;
+ bool to_utf8 = STREQ (try_to_code, "UTF-8") ||
+ STRNEQ (try_to_code, "UTF-8//", 7);
+ const char *utf8_target = last ? "UTF-8//IGNORE" : "UTF-8";
+ bool ignore_errors = (strstr (try_to_code, "//IGNORE") != NULL);
+ tried_iconv ret = TRIED_ICONV_OK;
+
+ debug ("trying encoding %s -> %s\n", try_from_code, try_to_code);
+
+ cd_utf8 = iconv_open (utf8_target, try_from_code);
+ if (cd_utf8 == (iconv_t) -1) {
+ error (0, errno, "iconv_open (\"%s\", \"%s\")",
+ utf8_target, try_from_code);
+ free (try_to_code);
+ return TRIED_ICONV_ERROR;
+ }
+
+ if (!to_utf8) {
+ cd = iconv_open (try_to_code, "UTF-8");
+ if (cd == (iconv_t) -1) {
+ error (0, errno, "iconv_open (\"%s\", \"UTF-8\")",
+ try_to_code);
+ free (try_to_code);
+ return TRIED_ICONV_ERROR;
+ }
+ }
+
+ input = decompress_peek (decomp, &input_size);
+ if (input_size < buf_size) {
+ /* End of file, error, or just a short read? Repeat until we
+ * have either a full buffer or EOF/error.
+ */
+ while (input_size < buf_size) {
+ size_t old_input_size = input_size;
+ input_size = buf_size;
+ input = decompress_peek (decomp, &input_size);
+ if (input_size == old_input_size)
+ break;
+ }
+ }
+
+ if (!utf8)
+ utf8 = xmalloc (buf_size);
+ if (!output)
+ output = xmalloc (buf_size);
+
+ while (input_size || utf8left) {
+ int handle_iconv_errors = 0;
+ char *inptr = (char *) input, *utf8ptr = utf8, *outptr;
+ size_t inleft = input_size, outleft;
+ size_t n, n2 = -1;
+
+ if (!utf8left) {
+ /* First, convert the text to UTF-8. By assumption,
+ * all validly-encoded text can be converted to
+ * UTF-8 assuming that we picked the correct
+ * encoding. Any errors at this stage are due to
+ * selecting an incorrect encoding, or due to
+ * misencoded source text.
+ */
+ utf8left = buf_size;
+ n = iconv (cd_utf8, (ICONV_CONST char **) &inptr,
+ &inleft, &utf8ptr, &utf8left);
+ utf8left = buf_size - utf8left;
+
+ /* If we need to try the next encoding, do that
+ * before writing anything.
+ */
+ if (!last && n == (size_t) -1 &&
+ (errno == EILSEQ ||
+ (errno == EINVAL && input_size < buf_size))) {
+ ret = TRIED_ICONV_ERROR;
+ break;
+ } else if (n == (size_t) -1)
+ handle_iconv_errors = errno;
+ }
+
+ /* If the target encoding is UTF-8 (the common case), then
+ * we can just write out what we've got. Otherwise, we need
+ * to convert to the target encoding. Any errors at this
+ * stage are due to characters that are not representable in
+ * the target encoding.
+ */
+ if (handle_iconv_errors)
+ /* Fall back to error handling below. If we have
+ * anything to write out, we'll do it next time
+ * round the loop.
+ */
+ outptr = output;
+ else if (to_utf8) {
+ memcpy (output, utf8, utf8left);
+ outptr = output + utf8left;
+ outleft = utf8left;
+ utf8left = 0;
+ } else if (utf8left) {
+ outptr = output;
+ outleft = buf_size;
+ utf8ptr = utf8;
+ n2 = iconv (
+ cd, (ICONV_CONST char **) &utf8ptr, &utf8left,
+ &outptr, &outleft);
+ outleft = buf_size - outleft;
+ if (n2 == (size_t) -1)
+ handle_iconv_errors = errno;
+
+ if (n2 == (size_t) -1 &&
+ errno == EILSEQ && ignore_errors)
+ errno = 0;
+ } else
+ /* We appear to have converted some input text, but
+ * not actually ended up with any UTF-8 text. This
+ * is odd. However, we can at least continue round
+ * the loop, skip the input text we converted, and
+ * then we should get a different result next time.
+ */
+ outptr = output;
+
+ if (outptr != output) {
+ /* We have something to write out. */
+ if (add_output (output, outleft, outbuf) != 0) {
+ ret = TRIED_ICONV_FATAL;
+ goto out;
+ }
+ }
+
+ if (!to_utf8 && n2 != (size_t) -1) {
+ /* All the UTF-8 text we have so far was processed.
+ * For state-dependent character sets we have to
+ * flush the state now.
+ */
+ outptr = output;
+ outleft = buf_size;
+ iconv (cd, NULL, NULL, &outptr, &outleft);
+ outleft = buf_size - outleft;
+
+ if (outptr != output) {
+ /* We have something to write out. */
+ if (add_output (output, outleft,
+ outbuf) != 0) {
+ ret = TRIED_ICONV_FATAL;
+ goto out;
+ }
+ }
+ } else if (handle_iconv_errors) {
+ intmax_t error_pos;
+
+ if (handle_iconv_errors == EILSEQ && !ignore_errors) {
+ if (!quiet) {
+ error_pos = input_pos + locate_error (
+ try_from_code,
+ input, input_size,
+ utf8, buf_size);
+ error (0, handle_iconv_errors,
+ "byte %jd: iconv", error_pos);
+ }
+ ret = TRIED_ICONV_FATAL;
+ goto out;
+ } else if (handle_iconv_errors == EINVAL &&
+ input_size < buf_size) {
+ if (!quiet) {
+ error_pos = input_pos + locate_error (
+ try_from_code,
+ input, input_size,
+ utf8, buf_size);
+ error (0, 0, "byte %jd: %s", error_pos,
+ _("iconv: incomplete character "
+ "at end of buffer"));
+ }
+ ret = TRIED_ICONV_FATAL;
+ goto out;
+ }
+ }
+
+ if (inptr != input) {
+ decompress_peek_skip (decomp, input_size - inleft);
+ input_pos += input_size - inleft;
+ }
+
+ /* Unless we have some UTF-8 text left (which will only
+ * happen if the output encoding is more verbose than UTF-8,
+ * so is unlikely for legacy encodings), we need to fetch
+ * more input text now.
+ */
+ if (!utf8left) {
+ input_size = buf_size;
+ input = decompress_peek (decomp, &input_size);
+ while (input_size < buf_size) {
+ size_t old_input_size = input_size;
+ input_size = buf_size;
+ input = decompress_peek (decomp, &input_size);
+ if (input_size == old_input_size)
+ break;
+ }
+ }
+ }
+
+out:
+ if (!to_utf8)
+ iconv_close (cd);
+ iconv_close (cd_utf8);
+ free (try_to_code);
+
+ return ret;
+}
+
+int manconv (decompress *decomp, gl_list_t from, const char *to,
+ struct manconv_outbuf *outbuf)
+{
+ char *pp_encoding;
+ const char *try_from_code;
+ char *plain_to, *modified_pp_line = NULL;
+ tried_iconv tried;
+ int ret = 0;
+
+ plain_to = xstrndup (to, strcspn (to, "/"));
+ pp_encoding = check_preprocessor_encoding
+ (decomp, plain_to, &modified_pp_line);
+ if (pp_encoding) {
+ if (modified_pp_line) {
+ size_t len = strlen (modified_pp_line);
+ decompress_readline (decomp);
+ if (add_output (modified_pp_line, len, outbuf) != 0) {
+ ret = -1;
+ goto out;
+ }
+ }
+ tried = try_iconv (decomp, pp_encoding, to, 1, outbuf);
+ if (tried == TRIED_ICONV_FATAL)
+ ret = -1;
+ } else {
+ GL_LIST_FOREACH (from, try_from_code) {
+ bool last = !gl_list_next_node (from, from_node);
+ tried = try_iconv (decomp, try_from_code, to, last,
+ outbuf);
+ if (tried == TRIED_ICONV_OK)
+ break;
+ else if (tried == TRIED_ICONV_FATAL) {
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+
+out:
+ free (modified_pp_line);
+ free (pp_encoding);
+ free (plain_to);
+ return ret;
+}
+
+#else /* !HAVE_ICONV */
+
+/* If we don't have iconv, there isn't much we can do; just pass everything
+ * through unchanged.
+ */
+int manconv (decompress *decomp, gl_list_t from MAYBE_UNUSED,
+ const char *to MAYBE_UNUSED, struct manconv_outbuf *outbuf)
+{
+ for (;;) {
+ size_t len = 4096;
+ const char *buffer = decompress_read (decomp, &len);
+ if (len == 0)
+ break;
+ if (add_output (buffer, len, outbuf) != 0)
+ return -1;
+ }
+ return 0;
+}
+
+#endif /* HAVE_ICONV */