/*
 * manconv.c: convert manual page from one encoding to another
 *
 * Copyright (C) 2007, 2008, 2009, 2010, 2012 Colin Watson.
 * Based loosely on parts of glibc's iconv_prog.c, which is:
 * Copyright (C) 1998-2004, 2005, 2006, 2007 Free Software Foundation, Inc.
 *
 * This file is part of man-db.
 *
 * man-db is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * man-db is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with man-db; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

/* This program arose during a discussion with Adam Borowski. See:
 *   https://lists.debian.org/debian-mentors/2007/09/msg00245.html
 * It behaves like iconv, but allows multiple source encodings and
 * attempts to guess the first one that works. An Emacs-style
 * "-*- coding:" declaration overrides this.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif /* HAVE_CONFIG_H */

#include <assert.h>
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>

#ifdef HAVE_ICONV
#  include <iconv.h>
#endif /* HAVE_ICONV */

#include "argp.h"
#include "attribute.h"
#include "error.h"
#include "gl_list.h"
#include "xalloc.h"
#include "xstrndup.h"
#include "xvasprintf.h"

#include "gettext.h"
#include <locale.h>
#define _(String) gettext (String)

#include "manconfig.h"

#include "debug.h"
#include "fatal.h"
#include "glcontainers.h"

#include "decompress.h"
#include "manconv.h"

/* Encoding conversions from groff-1.20/src/preproc/preconv/preconv.cpp.
 * I've only included those not already recognised by GNU libiconv.
 *
 * Each entry pairs an Emacs-style coding tag ("from") with the encoding
 * name that should be used in its place ("to").
 */
struct conversion_entry {
	const char *from;	/* Emacs-style coding tag */
	const char *to;		/* replacement encoding name */
};

/* Read-only lookup table, terminated by an entry whose "from" is NULL.
 * Nothing writes to it, so it is declared const.
 */
static const struct conversion_entry conversion_table[] = {
	{ "chinese-big5",			"Big5" },
	{ "chinese-euc",			"GB2312" },
	{ "chinese-iso-8bit",			"GB2312" },
	{ "cn-gb-2312",				"GB2312" },
	{ "cp878",				"KOI8-R" },
	{ "cyrillic-iso-8bit",			"ISO-8859-5" },
	{ "cyrillic-koi8",			"KOI8-R" },
	{ "euc-china",				"GB2312" },
	{ "euc-japan",				"EUC-JP" },
	{ "euc-japan-1990",			"EUC-JP" },
	{ "euc-kr",				"EUC-KR" },
	{ "greek-iso-8bit",			"ISO-8859-7" },
	{ "iso-latin-1",			"ISO-8859-1" },
	{ "iso-latin-2",			"ISO-8859-2" },
	{ "iso-latin-5",			"ISO-8859-9" },
	{ "iso-latin-7",			"ISO-8859-13" },
	{ "iso-latin-9",			"ISO-8859-15" },
	{ "japanese-iso-8bit",			"EUC-JP" },
	{ "japanese-euc",			"EUC-JP" },
	{ "jis8",				"EUC-JP" },
	{ "korean-euc",				"EUC-KR" },
	{ "korean-iso-8bit",			"EUC-KR" },
	{ "latin-0",				"ISO-8859-15" },
	{ "latin-1",				"ISO-8859-1" },
	{ "latin-2",				"ISO-8859-2" },
	{ "latin-5",				"ISO-8859-9" },
	{ "latin-7",				"ISO-8859-13" },
	{ "mule-utf-16",			"UTF-16" },
	{ "mule-utf-16be",			"UTF-16BE" },
	{ "mule-utf-16-be",			"UTF-16BE" },
	{ "mule-utf-16be-with-signature",	"UTF-16" },
	{ "mule-utf-16le",			"UTF-16LE" },
	{ "mule-utf-16-le",			"UTF-16LE" },
	{ "mule-utf-16le-with-signature",	"UTF-16" },
	{ "mule-utf-8",				"UTF-8" },
	{ "utf-16-be",				"UTF-16BE" },
	{ "utf-16be-with-signature",		"UTF-16" },
	{ "utf-16-be-with-signature",		"UTF-16" },
	{ "utf-16-le",				"UTF-16LE" },
	{ "utf-16le-with-signature",		"UTF-16" },
	{ "utf-16-le-with-signature",		"UTF-16" },
	{ NULL,					NULL }
};

/* Translate an Emacs-style coding tag into a name libiconv understands.
 *
 * A trailing "-dos", "-mac" or "-unix" (compared case-insensitively) is
 * cut off in place, but only when the tag is strictly longer than the
 * suffix.  Suffix positions are always measured from the length the tag
 * had on entry.
 *
 * The result of that is then looked up case-insensitively in
 * conversion_table.  On a match, ENCODING is freed and a freshly
 * allocated copy of the replacement name is returned; otherwise ENCODING
 * itself (possibly shortened) is returned.  Either way the caller owns
 * the returned string.
 */
static char *convert_encoding (char *encoding)
{
	static const char *const eol_suffixes[] = { "-dos", "-mac", "-unix" };
	const size_t suffix_count =
		sizeof eol_suffixes / sizeof eol_suffixes[0];
	const size_t original_len = strlen (encoding);
	const struct conversion_entry *candidate;
	size_t i;

	for (i = 0; i < suffix_count; ++i) {
		const size_t suffix_len = strlen (eol_suffixes[i]);
		char *tail;

		if (original_len <= suffix_len)
			continue;

		tail = encoding + original_len - suffix_len;
		if (strcasecmp (tail, eol_suffixes[i]) == 0)
			*tail = '\0';
	}

	candidate = conversion_table;
	while (candidate->from) {
		if (strcasecmp (candidate->from, encoding) == 0) {
			free (encoding);
			return xstrdup (candidate->to);
		}
		++candidate;
	}

	return encoding;
}

/* Look at the first line available from DECOMP (without consuming it) for
 * an Emacs-style "-*- ... coding: NAME ... -*-" declaration.
 *
 * The line must begin with PP_COOKIE or with the four characters `.\" `.
 * (Some people use .\" incorrectly; we allow it for encoding declarations
 * but not for preprocessor declarations.)  The ';'-separated fields after
 * the first "-*-" are scanned for one starting with "coding:".
 *
 * Returns the declared encoding, passed through convert_encoding, as a
 * string the caller should free; or NULL if no declaration was found.
 *
 * If to_encoding and modified_line are both non-NULL, and the declared
 * encoding does not match to_encoding (ignoring case), then
 * *modified_line is set to a copy of the declaration line rewritten to
 * name to_encoding instead.  The caller should free *modified_line.  In
 * all other cases *modified_line is left untouched.
 */
char *check_preprocessor_encoding (decompress *decomp, const char *to_encoding,
				   char **modified_line)
{
	static const char name_chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
					 "abcdefghijklmnopqrstuvwxyz"
					 "0123456789-_/:.()";
	const char *line = decompress_peekline (decomp);
	const char *line_end, *cursor;
	char *pp_encoding = NULL;
	size_t name_len = 0;

	if (!line)
		return NULL;
	if (!STRNEQ (line, PP_COOKIE, 4) && !STRNEQ (line, ".\\\" ", 4))
		return NULL;

	/* The directive runs from just after the cookie up to the first
	 * newline, or to the end of the string if there is none.
	 */
	line_end = strchr (line, '\n');
	if (!line_end)
		line_end = strchr (line + 4, '\0');

	cursor = memmem (line + 4, line_end - (line + 4), "-*-", 3);
	if (!cursor)
		return NULL;
	cursor += 3;

	while (cursor && cursor < line_end && *cursor) {
		while (*cursor == ' ')
			++cursor;

		if (!STRNEQ (cursor, "coding:", 7)) {
			/* Not the field we want; move on to whatever
			 * follows the next ';', if there is one.
			 */
			cursor = memchr (cursor, ';', line_end - cursor);
			if (cursor)
				++cursor;
			continue;
		}

		cursor += 7;
		while (*cursor == ' ')
			++cursor;

		/* cursor now stays on the first character of the encoding
		 * name so that the line can be rewritten below.
		 */
		name_len = strspn (cursor, name_chars);
		pp_encoding = convert_encoding (xstrndup (cursor, name_len));
		debug ("preprocessor encoding: %s\n",
		       pp_encoding);
		break;
	}

	if (pp_encoding && to_encoding && modified_line &&
	    strcasecmp (pp_encoding, to_encoding)) {
		const char *after_name;

		assert (line_end);
		assert (cursor);
		after_name = cursor + name_len;
		/* Everything before the name, then the new name, then the
		 * rest of the directive up to (not including) line_end.
		 */
		*modified_line = xasprintf
			("%.*s%s%.*s\n",
			 (int) (cursor - line), line,
			 to_encoding,
			 (int) (line_end - after_name), after_name);
	}

	return pp_encoding;
}

/* Emit INLEN bytes starting at INBUF.
 *
 * With a non-NULL OUTBUF the bytes are appended to outbuf->buf and
 * outbuf->len is advanced; fatal() is called if len + INLEN would reach
 * outbuf->max.  With a NULL OUTBUF the bytes are written to stdout, and
 * errno is restored to its value on entry afterwards.
 *
 * Returns 0 on success, or -1 (after reporting the problem) if writing to
 * stdout failed.
 */
static int add_output (const char *inbuf, size_t inlen,
		       struct manconv_outbuf *outbuf)
{
	int saved_errno;
	size_t written;

	if (outbuf) {
		if (outbuf->len + inlen >= outbuf->max)
			fatal (0, "out of space in output buffer");
		memcpy (outbuf->buf + outbuf->len, inbuf, inlen);
		outbuf->len += inlen;
		return 0;
	}

	saved_errno = errno;
	written = fwrite (inbuf, 1, inlen, stdout);
	if (written < inlen || ferror (stdout)) {
		error (0, 0, _("can't write to standard output"));
		errno = saved_errno;
		return -1;
	}
	errno = saved_errno;

	return 0;
}

#ifdef HAVE_ICONV

/* Find how far into INPUT a strict conversion to UTF-8 gets before failing.
 *
 * When converting text containing an invalid multibyte sequence to
 * UTF-8//IGNORE, GNU libc's iconv returns EILSEQ but sets *inbuf to the end
 * of the input buffer.  I'm not sure whether this is a bug or not (it seems
 * to contradict the documentation), but work around it anyway by recoding
 * to plain UTF-8 so that we can accurately position the error.
 *
 * UTF8/UTF8_SIZE describe a scratch buffer that receives the converted
 * text.  Returns the offset within INPUT at which iconv stopped, or 0 if
 * the conversion reported no error or the descriptor could not be opened
 * (the latter is reported via error()).
 */
static off_t locate_error (const char *try_from_code,
			   const char *input, size_t input_size,
			   char *utf8, size_t utf8_size)
{
	char *src = (char *) input;
	char *dst = utf8;
	size_t src_left = input_size;
	size_t dst_left = utf8_size;
	off_t offset = 0;
	iconv_t strict;

	strict = iconv_open ("UTF-8", try_from_code);
	if (strict == (iconv_t) -1) {
		error (0, errno, "iconv_open (\"UTF-8\", \"%s\")",
		       try_from_code);
		return 0;
	}

	if (iconv (strict, (ICONV_CONST char **) &src, &src_left,
		   &dst, &dst_left) == (size_t) -1)
		offset = src - input;

	iconv_close (strict);

	return offset;
}

/* Outcome of a single try_iconv attempt, as returned to manconv. */
typedef enum {
	TRIED_ICONV_OK = 0,
	TRIED_ICONV_ERROR = -1,  /* can continue with another encoding */
	TRIED_ICONV_FATAL = -2   /* must give up */
} tried_iconv;

/* Attempt to convert everything remaining in DECOMP from TRY_FROM_CODE to
 * TO, sending the result to OUTBUF via add_output.
 *
 * Conversion happens in two stages: input is first converted to UTF-8,
 * and then (unless TO is itself UTF-8, optionally with a "//" suffix)
 * from UTF-8 to TO.  Input is peeked in chunks of up to buf_size bytes
 * and only skipped once it has been converted.
 *
 * LAST says that there is no further source encoding to fall back on: the
 * first stage then targets "UTF-8//IGNORE", and invalid input is handled
 * here rather than by returning TRIED_ICONV_ERROR.
 *
 * Returns TRIED_ICONV_OK on success, TRIED_ICONV_ERROR if the caller may
 * try another encoding (a conversion descriptor could not be opened, or
 * the input did not decode and LAST is false), or TRIED_ICONV_FATAL if
 * output failed or an unrecoverable conversion error was reported.
 *
 * The two scratch buffers are static: they are allocated on first use and
 * reused by later calls.
 */
static tried_iconv try_iconv (decompress *decomp, const char *try_from_code,
			      const char *to, bool last,
			      struct manconv_outbuf *outbuf)
{
	char *try_to_code = xstrdup (to);
	static const size_t buf_size = 65536;
	size_t input_size = buf_size;
	off_t input_pos = 0;
	const char *input;
	static char *utf8 = NULL, *output = NULL;
	size_t utf8left = 0;
	iconv_t cd_utf8, cd = NULL;
	bool to_utf8 = STREQ (try_to_code, "UTF-8") ||
		       STRNEQ (try_to_code, "UTF-8//", 7);
	const char *utf8_target = last ? "UTF-8//IGNORE" : "UTF-8";
	bool ignore_errors = (strstr (try_to_code, "//IGNORE") != NULL);
	tried_iconv ret = TRIED_ICONV_OK;

	debug ("trying encoding %s -> %s\n", try_from_code, try_to_code);

	cd_utf8 = iconv_open (utf8_target, try_from_code);
	if (cd_utf8 == (iconv_t) -1) {
		error (0, errno, "iconv_open (\"%s\", \"%s\")",
		       utf8_target, try_from_code);
		free (try_to_code);
		return TRIED_ICONV_ERROR;
	}

	if (!to_utf8) {
		cd = iconv_open (try_to_code, "UTF-8");
		if (cd == (iconv_t) -1) {
			error (0, errno, "iconv_open (\"%s\", \"UTF-8\")",
			       try_to_code);
			/* The first descriptor is already open; release it
			 * so that this error path does not leak it.
			 */
			iconv_close (cd_utf8);
			free (try_to_code);
			return TRIED_ICONV_ERROR;
		}
	}

	input = decompress_peek (decomp, &input_size);
	if (input_size < buf_size) {
		/* End of file, error, or just a short read? Repeat until we
		 * have either a full buffer or EOF/error.
		 */
		while (input_size < buf_size) {
			size_t old_input_size = input_size;
			input_size = buf_size;
			input = decompress_peek (decomp, &input_size);
			if (input_size == old_input_size)
				break;
		}
	}

	if (!utf8)
		utf8 = xmalloc (buf_size);
	if (!output)
		output = xmalloc (buf_size);

	while (input_size || utf8left) {
		/* Holds the errno of a failed iconv call in this iteration,
		 * or 0 if there was none.
		 */
		int handle_iconv_errors = 0;
		char *inptr = (char *) input, *utf8ptr = utf8, *outptr;
		size_t inleft = input_size, outleft;
		/* n2 stays at (size_t) -1 unless the second-stage
		 * conversion ran and succeeded.
		 */
		size_t n, n2 = -1;

		if (!utf8left) {
			/* First, convert the text to UTF-8. By assumption,
			 * all validly-encoded text can be converted to
			 * UTF-8 assuming that we picked the correct
			 * encoding. Any errors at this stage are due to
			 * selecting an incorrect encoding, or due to
			 * misencoded source text.
			 */
			utf8left = buf_size;
			n = iconv (cd_utf8, (ICONV_CONST char **) &inptr,
				   &inleft, &utf8ptr, &utf8left);
			/* Turn "space remaining" into "bytes produced". */
			utf8left = buf_size - utf8left;

			/* If we need to try the next encoding, do that
			 * before writing anything.
			 */
			if (!last && n == (size_t) -1 &&
			    (errno == EILSEQ ||
			     (errno == EINVAL && input_size < buf_size))) {
				ret = TRIED_ICONV_ERROR;
				break;
			} else if (n == (size_t) -1)
				handle_iconv_errors = errno;
		}

		/* If the target encoding is UTF-8 (the common case), then
		 * we can just write out what we've got. Otherwise, we need
		 * to convert to the target encoding. Any errors at this
		 * stage are due to characters that are not representable in
		 * the target encoding.
		 */
		if (handle_iconv_errors)
			/* Fall back to error handling below.  If we have
			 * anything to write out, we'll do it next time
			 * round the loop.
			 */
			outptr = output;
		else if (to_utf8) {
			memcpy (output, utf8, utf8left);
			outptr = output + utf8left;
			outleft = utf8left;
			utf8left = 0;
		} else if (utf8left) {
			outptr = output;
			outleft = buf_size;
			utf8ptr = utf8;
			n2 = iconv (
				cd, (ICONV_CONST char **) &utf8ptr, &utf8left,
				&outptr, &outleft);
			/* Turn "space remaining" into "bytes produced". */
			outleft = buf_size - outleft;
			if (n2 == (size_t) -1)
				handle_iconv_errors = errno;

			if (n2 == (size_t) -1 &&
			    errno == EILSEQ && ignore_errors)
				errno = 0;
		} else
			/* We appear to have converted some input text, but
			 * not actually ended up with any UTF-8 text.  This
			 * is odd.  However, we can at least continue round
			 * the loop, skip the input text we converted, and
			 * then we should get a different result next time.
			 */
			outptr = output;

		if (outptr != output) {
			/* We have something to write out. */
			if (add_output (output, outleft, outbuf) != 0) {
				ret = TRIED_ICONV_FATAL;
				goto out;
			}
		}

		if (!to_utf8 && n2 != (size_t) -1) {
			/* All the UTF-8 text we have so far was processed.
			 * For state-dependent character sets we have to
			 * flush the state now.
			 */
			outptr = output;
			outleft = buf_size;
			iconv (cd, NULL, NULL, &outptr, &outleft);
			outleft = buf_size - outleft;

			if (outptr != output) {
				/* We have something to write out. */
				if (add_output (output, outleft,
						outbuf) != 0) {
					ret = TRIED_ICONV_FATAL;
					goto out;
				}
			}
		} else if (handle_iconv_errors) {
			intmax_t error_pos;

			if (handle_iconv_errors == EILSEQ && !ignore_errors) {
				if (!quiet) {
					error_pos = input_pos + locate_error (
						try_from_code,
						input, input_size,
						utf8, buf_size);
					error (0, handle_iconv_errors,
					       "byte %jd: iconv", error_pos);
				}
				ret = TRIED_ICONV_FATAL;
				goto out;
			} else if (handle_iconv_errors == EINVAL &&
				   input_size < buf_size) {
				if (!quiet) {
					error_pos = input_pos + locate_error (
						try_from_code,
						input, input_size,
						utf8, buf_size);
					error (0, 0, "byte %jd: %s", error_pos,
					       _("iconv: incomplete character "
						 "at end of buffer"));
				}
				ret = TRIED_ICONV_FATAL;
				goto out;
			}
		}

		/* Consume whatever part of the peeked input the first stage
		 * got through, and keep the running byte offset (used in
		 * error messages) in step.
		 */
		if (inptr != input) {
			decompress_peek_skip (decomp, input_size - inleft);
			input_pos += input_size - inleft;
		}

		/* Unless we have some UTF-8 text left (which will only
		 * happen if the output encoding is more verbose than UTF-8,
		 * so is unlikely for legacy encodings), we need to fetch
		 * more input text now.
		 */
		if (!utf8left) {
			input_size = buf_size;
			input = decompress_peek (decomp, &input_size);
			while (input_size < buf_size) {
				size_t old_input_size = input_size;
				input_size = buf_size;
				input = decompress_peek (decomp, &input_size);
				if (input_size == old_input_size)
					break;
			}
		}
	}

out:
	if (!to_utf8)
		iconv_close (cd);
	iconv_close (cd_utf8);
	free (try_to_code);

	return ret;
}

/* Convert the contents of DECOMP to encoding TO, sending the result to
 * OUTBUF via add_output.
 *
 * If the first line carries an encoding declaration (see
 * check_preprocessor_encoding), that encoding alone is used as the source,
 * and a declaration line rewritten to name TO (minus anything from its
 * first '/' onwards) is emitted in place of the original when the two
 * differ.  Otherwise each entry of FROM is tried in turn until one
 * succeeds or a fatal error occurs.
 *
 * Returns 0, or -1 if output failed or a conversion attempt was fatal.
 */
int manconv (decompress *decomp, gl_list_t from, const char *to,
	     struct manconv_outbuf *outbuf)
{
	const char *try_from_code;
	char *modified_pp_line = NULL;
	char *plain_to = xstrndup (to, strcspn (to, "/"));
	char *pp_encoding = check_preprocessor_encoding
		(decomp, plain_to, &modified_pp_line);
	int ret = 0;

	if (!pp_encoding) {
		/* No declaration: guess from the candidate list. */
		GL_LIST_FOREACH (from, try_from_code) {
			bool last = !gl_list_next_node (from, from_node);
			tried_iconv tried = try_iconv (decomp, try_from_code,
						       to, last, outbuf);

			if (tried == TRIED_ICONV_FATAL) {
				ret = -1;
				goto out;
			}
			if (tried == TRIED_ICONV_OK)
				break;
		}
		goto out;
	}

	if (modified_pp_line) {
		size_t len = strlen (modified_pp_line);

		/* Consume the original declaration line and emit the
		 * rewritten one in its place.
		 */
		decompress_readline (decomp);
		if (add_output (modified_pp_line, len, outbuf) != 0) {
			ret = -1;
			goto out;
		}
	}

	/* The declared encoding is the only candidate, so it is "last". */
	if (try_iconv (decomp, pp_encoding, to, true, outbuf) ==
	    TRIED_ICONV_FATAL)
		ret = -1;

out:
	free (modified_pp_line);
	free (pp_encoding);
	free (plain_to);
	return ret;
}

#else /* !HAVE_ICONV */

/* Fallback used when iconv is unavailable: there isn't much we can do, so
 * copy DECOMP's contents through unchanged, requesting up to 4096 bytes at
 * a time, until a read yields nothing.  FROM and TO are ignored.
 *
 * Returns 0, or -1 as soon as add_output reports a failure.
 */
int manconv (decompress *decomp, gl_list_t from MAYBE_UNUSED,
	     const char *to MAYBE_UNUSED, struct manconv_outbuf *outbuf)
{
	const char *chunk;
	size_t chunk_len;

	do {
		chunk_len = 4096;
		chunk = decompress_read (decomp, &chunk_len);
		if (chunk_len != 0 &&
		    add_output (chunk, chunk_len, outbuf) != 0)
			return -1;
	} while (chunk_len != 0);

	return 0;
}

#endif /* HAVE_ICONV */