man-db/lib/encodings.c

/*
 * encodings.c: locale and encoding handling for man
 *
 * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
 *               Colin Watson.
 *
 * This file is part of man-db.
 *
 * man-db is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * man-db is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with man-db; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif /* HAVE_CONFIG_H */

#include <ctype.h>
#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "attribute.h"
#include "gettext.h"
#include "localcharset.h"
#include "xalloc.h"
#include "xstrndup.h"

#include "manconfig.h"

#include "debug.h"
#include "encodings.h"
#include "pathsearch.h"

/* Due to historical limitations in groff (which may be removed in the
 * future), there is no mechanism for a man page to specify its own
 * encoding. This means that each national language directory needs to carry
 * with it information about its encoding, and each groff device needs to
 * have a default encoding associated with it. Out of the box, groff
 * formally allows only ISO-8859-1 on input; however, patches originating
 * with Debian and imported by many other GNU/Linux distributions change
 * this somewhat.
 *
 * Eventually, groff will support proper Unicode input, and much of this
 * horror can go away.
 *
 * Do *not* confuse source encoding with groff encoding. The encoding
 * specified in this table is the encoding in which the source man pages in
 * each language directory are expected to be written. The groff encoding is
 * determined by the selected groff device and sometimes also by the user's
 * locale.
 *
 * The standard output encoding is the encoding assumed for cat pages for
 * each language directory. It must *not* be used to discover the actual
 * output encoding displayed to the user; that is determined by the locale.
 * TODO: it would be useful to be able to change the standard output
 * encoding in the configuration file.
 *
 * This table is expected to change over time, particularly as man pages
 * begin to move towards UTF-8. Feel free to patch this for your
 * distribution; send me updates for languages I've missed.
 *
 * Explicit encodings in the directory name (e.g. de_DE.UTF-8) override this
 * table.
 */
struct directory_entry {
	const char *lang_dir;
	const char *source_encoding;
};

static struct directory_entry directory_table[] = {
        {"C",        "ISO-8859-1" }, /* English */
        {"POSIX",    "ISO-8859-1" }, /* English */
        {"be",       "CP1251"     }, /* Belarusian */
        {"bg",       "CP1251"     }, /* Bulgarian */
        {"cs",       "ISO-8859-2" }, /* Czech */
        {"da",       "ISO-8859-1" }, /* Danish */
        {"de",       "ISO-8859-1" }, /* German */
        {"el",       "ISO-8859-7" }, /* Greek */
        {"en",       "ISO-8859-1" }, /* English */
        {"es",       "ISO-8859-1" }, /* Spanish */
        {"et",       "ISO-8859-1" }, /* Estonian */
        {"fi",       "ISO-8859-1" }, /* Finnish */
        {"fr",       "ISO-8859-1" }, /* French */
        {"ga",       "ISO-8859-1" }, /* Irish */
        {"gl",       "ISO-8859-1" }, /* Galician */
        {"hr",       "ISO-8859-2" }, /* Croatian */
        {"hu",       "ISO-8859-2" }, /* Hungarian */
        {"id",       "ISO-8859-1" }, /* Indonesian */
        {"is",       "ISO-8859-1" }, /* Icelandic */
        {"it",       "ISO-8859-1" }, /* Italian */
        {"ja",       "EUC-JP"     }, /* Japanese */
        {"ko",       "EUC-KR"     }, /* Korean */
        {"lt",       "ISO-8859-13"}, /* Lithuanian */
        {"lv",       "ISO-8859-13"}, /* Latvian */
        {"mk",       "ISO-8859-5" }, /* Macedonian */
        {"nb",       "ISO-8859-1" }, /* Norwegian Bokmål */
        {"nl",       "ISO-8859-1" }, /* Dutch */
        {"nn",       "ISO-8859-1" }, /* Norwegian Nynorsk */
        {"no",       "ISO-8859-1" }, /* Norwegian */
        {"pl",       "ISO-8859-2" }, /* Polish */
        {"pt",       "ISO-8859-1" }, /* Portuguese */
        {"ro",       "ISO-8859-2" }, /* Romanian */
        {"ru",       "KOI8-R"     }, /* Russian */
        {"sk",       "ISO-8859-2" }, /* Slovak */
        {"sl",       "ISO-8859-2" }, /* Slovenian */
        /* sr@latin must precede sr, due to top-down left-substring matching
           later */
        {"sr@latin", "ISO-8859-2" }, /* Serbian Latin */
        {"sr",       "ISO-8859-5" }, /* Serbian */
        {"sv",       "ISO-8859-1" }, /* Swedish */
        {"tr",       "ISO-8859-9" }, /* Turkish */
        {"uk",       "KOI8-U"     }, /* Ukrainian */
        {"vi",       "TCVN5712-1" }, /* Vietnamese */
        {"zh_CN",    "GBK"        }, /* Simplified Chinese */
        {"zh_SG",    "GBK"        }, /* Simplified Chinese, Singapore */
        {"zh_HK",    "BIG5HKSCS"  }, /* Traditional Chinese, Hong Kong */
        {"zh_TW",    "BIG5"       }, /* Traditional Chinese */

        {NULL,       NULL         }
};

static const char fallback_source_encoding[] = "ISO-8859-1";

/* Unfortunately, there is no portable way to inspect iconv's internal table
 * of character set aliases. We copy the most interesting ones here so that
 * we can deal with them if they appear in directory names. Note that all
 * names will be converted to upper case before looking them up in this
 * table.
 */
struct charset_alias_entry {
	const char *alias;
	const char *canonical_name;
};

static struct charset_alias_entry charset_alias_table[] = {
        /* The FHS is silly and requires numeric-only aliases that iconv
         * does not support.
         */
        {"88591",      "ISO-8859-1"    },
        {"88592",      "ISO-8859-2"    },
        {"88593",      "ISO-8859-3"    },
        {"88594",      "ISO-8859-4"    },
        {"88595",      "ISO-8859-5"    },
        {"88596",      "ISO-8859-6"    },
        {"88597",      "ISO-8859-7"    },
        {"88598",      "ISO-8859-8"    },
        {"88599",      "ISO-8859-9"    },
        {"885910",     "ISO-8859-10"   },
        {"885911",     "ISO-8859-11"   },
        {"885913",     "ISO-8859-13"   },
        {"885914",     "ISO-8859-14"   },
        {"885915",     "ISO-8859-15"   },
        {"885916",     "ISO-8859-16"   },

        {"ASCII",      "ANSI_X3.4-1968"},
        {"BIG-5",      "BIG5"          },
        {"BIG5-HKSCS", "BIG5HKSCS"     },
        {"EUCCN",      "EUC-CN"        },
        {"EUCJP",      "EUC-JP"        },
        {"EUCKR",      "EUC-KR"        },
        {"EUCTW",      "EUC-TW"        },
        {"GB2312",     "EUC-CN"        },
        {"ISO8859-1",  "ISO-8859-1"    },
        {"ISO8859-2",  "ISO-8859-2"    },
        {"ISO8859-3",  "ISO-8859-3"    },
        {"ISO8859-4",  "ISO-8859-4"    },
        {"ISO8859-5",  "ISO-8859-5"    },
        {"ISO8859-6",  "ISO-8859-6"    },
        {"ISO8859-7",  "ISO-8859-7"    },
        {"ISO8859-8",  "ISO-8859-8"    },
        {"ISO8859-9",  "ISO-8859-9"    },
        {"ISO8859-10", "ISO-8859-10"   },
        {"ISO8859-11", "ISO-8859-11"   },
        {"ISO8859-13", "ISO-8859-13"   },
        {"ISO8859-14", "ISO-8859-14"   },
        {"ISO8859-15", "ISO-8859-15"   },
        {"ISO8859-16", "ISO-8859-16"   },
        {"KOI8R",      "KOI8-R"        },
        {"KOI8U",      "KOI8-U"        },
        {"UJIS",       "EUC-JP"        },
        {"US-ASCII",   "ANSI_X3.4-1968"},
        {"UTF8",       "UTF-8"         },

        {NULL,         NULL            }
};

/* The default groff terminal output device to be used is determined based
 * on locale_charset (), which returns the character set used by the current
 * locale.
 */
struct charset_entry {
	const char *charset_from_locale;
	const char *default_device;
};

static struct charset_entry charset_table[] = {
        {"ANSI_X3.4-1968", "ascii" },
#ifndef HEIRLOOM_NROFF
        {"ISO-8859-1",     "latin1"},
#endif  /* HEIRLOOM_NROFF */
        {"UTF-8",          "utf8"  },

        {NULL,             NULL    }
};

static const char *fallback_default_device = "ascii";

/* The encoding used for the text passed to groff is a function of the
 * selected groff device. Traditional devices expect ISO-8859-1 on input
 * (yes, even the utf8 device); devices added in the Debian multibyte patch
 * expect other encodings. The ascii8 device passes top-bit-set characters
 * straight through so is (probably ...) encoding-agnostic. If this encoding
 * does not match the source encoding, an iconv pipe is used (if available)
 * to perform recoding.
 */
struct device_entry {
	const char *roff_device;
	const char *roff_encoding;
	const char *output_encoding;
};

static struct device_entry device_table[] = {
        /* nroff devices */
        {"ascii",   "ANSI_X3.4-1968", "ANSI_X3.4-1968"},
        {"latin1",  "ISO-8859-1",     "ISO-8859-1"    },
        {"utf8",    "ISO-8859-1",     "UTF-8"         },

#ifdef HEIRLOOM_NROFF
        /* Not strictly accurate, but we only use this in UTF-8 locales. */
        {"locale",  "UTF-8",          "UTF-8"         },
#endif  /* HEIRLOOM_NROFF */

        /* troff devices */
        {"X75",     NULL,             NULL            },
        {"X75-12",  NULL,             NULL            },
        {"X100",    NULL,             NULL            },
        {"X100-12", NULL,             NULL            },
        {"dvi",     NULL,             NULL            },
        {"html",    NULL,             NULL            },
        {"lbp",     NULL,             NULL            },
        {"lj4",     NULL,             NULL            },
        {"ps",      NULL,             NULL            },

        {NULL,      NULL,             NULL            }
};

static const char fallback_roff_encoding[] = "ISO-8859-1";

/* Setting less_charset to iso8859 tells the less pager that characters
 * between 0xA0 and 0xFF are displayable, not that its input is encoded in
 * ISO-8859-*. TODO: Perhaps using LESSCHARDEF would be better.
 *
 * Character set names compatible only with jless go in jless_charset.
 */
struct less_charset_entry {
	const char *charset_from_locale;
	const char *less_charset;
	const char *jless_charset;
};

static struct less_charset_entry less_charset_table[] = {
        {"ANSI_X3.4-1968", "ascii",   NULL           },
        {"CP1251",         "windows", NULL           },
        {"EUC-JP",         "iso8859", "japanese-ujis"},
        {"ISO-8859-1",     "iso8859", NULL           },
        {"KOI8-R",         "koi8-r",  NULL           },
        /* close enough? */
        {"KOI8-U",         "koi8-r",  NULL           },
        {"UTF-8",          "utf-8",   NULL           },

        {NULL,             NULL,      NULL           }
};

static const char fallback_less_charset[] = "iso8859";

static const char *groff_preconv = NULL;

/* Is the groff "preconv" helper available? If so, return its name.
 * Otherwise, return NULL.
 */
const char *get_groff_preconv (void)
{
	if (groff_preconv) {
		if (*groff_preconv)
			return groff_preconv;
		else
			return NULL;
	}

	if (pathsearch_executable ("gpreconv"))
		groff_preconv = "gpreconv";
	else if (pathsearch_executable ("preconv"))
		groff_preconv = "preconv";
	else
		groff_preconv = "";

	if (*groff_preconv)
		return groff_preconv;
	else
		return NULL;
}

/* Return the assumed encoding of the source man page, based on the
 * directory in which it was found. The caller should attempt to recode from
 * this to whatever encoding is expected by groff.
 *
 * The caller should free the returned string when it is finished with it.
 */
char *ATTRIBUTE_MALLOC get_page_encoding (const char *lang)
{
	const struct directory_entry *entry;
	const char *dot;

	if (!lang || !*lang) {
		/* Guess based on the locale. */
		lang = setlocale (LC_MESSAGES, NULL);
		if (!lang)
			return xstrdup (fallback_source_encoding);
	}

	dot = strchr (lang, '.');
	if (dot) {
		/* The FHS has the worst specification of what's supposed to
		 * go after the dot here that I've ever seen. To quote from
		 * version 2.1:
		 *
		 * "It is recommended that this be a numeric representation
		 * if possible (ISO standards, especially), not include
		 * additional punctuation symbols, and that any letters be
		 * in lowercase."
		 *
		 * Any sane standard would use directory names like
		 * de_DE.ISO-8859-1; the examples in the FHS recommend
		 * de_DE.88591 instead. Considering that there is no other
		 * conceivable use for encodings in directory names other
		 * than to pass them to iconv or similar, this is quite
		 * startlingly useless.
		 *
		 * While we now support this thanks to
		 * get_canonical_charset_name, the FHS specification is
		 * obviously wrong and I plan to petition to have it
		 * changed. I recommend ignoring this part of the FHS.
		 */
		char *dir_encoding =
		        xstrndup (dot + 1, strcspn (dot + 1, ",@"));
		char *canonical_dir_encoding =
		        xstrdup (get_canonical_charset_name (dir_encoding));
		free (dir_encoding);
		return canonical_dir_encoding;
	}

	for (entry = directory_table; entry->lang_dir; ++entry)
		if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
			return xstrdup (entry->source_encoding);

	return xstrdup (fallback_source_encoding);
}

/* Return the canonical encoding for source man pages in the specified
 * language. This ignores any encoding specification in the language
 * directory name. The source encoding should be used as a basis for
 * determining the correct roff device to use: that is, the caller should
 * behave as if it is recoding from the page encoding to the source encoding
 * first, although in practice it should recode directly from the page
 * encoding to the roff encoding.
 *
 * You should normally only call this function if the page encoding is
 * UTF-8, in which case older versions of groff that lack preconv need to
 * have the page recoded to some legacy encoding). If the page is in a
 * legacy encoding, then attempting to recode from that to some other legacy
 * encoding will probably do more harm than good.
 *
 * Here are a few concrete examples of why these distinctions are important:
 *
 *   /usr/share/man/en_GB.UTF-8, locale C
 *     page encoding = UTF-8
 *     source encoding = ISO-8859-1
 *     roff encoding = ISO-8859-1
 *     output encoding = UTF-8
 *     UTF-8 -> iconv -> ISO-8859-1 -> groff -Tascii -> ANSI_X3.4-1968
 *
 *   /usr/share/man/pl_PL.UTF-8, locale pl_PL.UTF-8
 *     page encoding = UTF-8
 *     source encoding = ISO-8859-2
 *     roff encoding = ISO-8859-2
 *     output encoding = ISO-8859-2
 *     UTF-8 -> iconv -> ISO-8859-2 -> groff -Tascii8
 *                    -> ISO-8859-2 -> iconv -> UTF-8
 *
 *   /usr/share/man/ja_JP.EUC-JP, locale ja_JP.UTF-8
 *     page encoding = EUC-JP
 *     source encoding = EUC-JP
 *     roff encoding = UTF-8
 *     output encoding = UTF-8
 *     EUC-JP -> iconv -> UTF-8 -> groff -Tutf8 -> UTF-8
 *
 *   /usr/share/man/en_GB.ISO-8859-15, locale en_GB.UTF-8
 *     page encoding = ISO-8859-15
 *     source encoding = ISO-8859-15
 *     roff encoding = ISO-8859-15
 *     output encoding = ISO-8859-15
 *     ISO-8859-15 -> groff -Tascii8 -> ISO-8859-15 -> iconv -> UTF-8
 */
const char *get_source_encoding (const char *lang)
{
	const struct directory_entry *entry;

	if (!lang || !*lang) {
		/* Guess based on the locale. */
		lang = setlocale (LC_MESSAGES, NULL);
		if (!lang)
			return fallback_source_encoding;
	}

	for (entry = directory_table; entry->lang_dir; ++entry)
		if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
			return entry->source_encoding;

	return fallback_source_encoding;
}

const char *ATTRIBUTE_NONNULL ((1)) ATTRIBUTE_RETURNS_NONNULL
        get_canonical_charset_name (const char *charset)
{
	const struct charset_alias_entry *entry;
	char *charset_upper = xstrdup (charset);
	char *p;

	for (p = charset_upper; *p; ++p)
		*p = CTYPE (toupper, *p);

	for (entry = charset_alias_table; entry->alias; ++entry)
		if (STREQ (entry->alias, charset_upper)) {
			free (charset_upper);
			return entry->canonical_name;
		}

	free (charset_upper);
	return charset;
}

/* Return the current locale's character set. */
const char *ATTRIBUTE_RETURNS_NONNULL get_locale_charset (void)
{
	const char *charset;
	char *saved_locale;

	/* We need to modify LC_CTYPE temporarily in order to look at the
	 * codeset, so save it first.
	 */
	saved_locale = setlocale (LC_CTYPE, NULL);
	if (saved_locale)
		saved_locale = xstrdup (saved_locale);

	setlocale (LC_CTYPE, "");

	charset = locale_charset ();

	/* Restore LC_CTYPE to its value on entry to this function. */
	setlocale (LC_CTYPE, saved_locale);
	free (saved_locale);

	if (!charset || !*charset)
		charset = "ANSI_X3.4-1968";
	return get_canonical_charset_name (charset);
}

/* Find a locale with this character set. This is a non-portable operation,
 * but required to make col(1) work correctly with -E. If no locale can be
 * found, or if none needs to be set, return NULL.
 *
 * The caller should free the returned string when it is finished with it.
 */
char *find_charset_locale (const char *charset)
{
	const char *canonical_charset = get_canonical_charset_name (charset);
	char *saved_locale;
	const char supported_path[] = "/usr/share/i18n/SUPPORTED";
	FILE *supported = NULL;
	char *line = NULL;
	size_t n = 0;
	char *locale = NULL;

	if (STREQ (charset, get_locale_charset ()))
		return NULL;

	saved_locale = setlocale (LC_CTYPE, NULL);
	if (saved_locale)
		saved_locale = xstrdup (saved_locale);

	supported = fopen (supported_path, "r");
	while (supported && getline (&line, &n, supported) >= 0) {
		const char *space = strchr (line, ' ');
		if (space) {
			char *encoding = xstrdup (space + 1);
			char *newline = strchr (encoding, '\n');
			if (newline)
				*newline = 0;
			if (STREQ (canonical_charset,
			           get_canonical_charset_name (encoding))) {
				locale = xstrndup (line, space - line);
				/* Is this locale actually installed? */
				if (setlocale (LC_CTYPE, locale)) {
					free (encoding);
					goto out;
				} else {
					free (locale);
					locale = NULL;
				}
			}
			free (encoding);
		}
		free (line);
		line = NULL;
	}

	if (strlen (canonical_charset) >= 5 &&
	    STRNEQ (canonical_charset, "UTF-8", 5)) {
		locale = xstrdup ("C.UTF-8");
		if (setlocale (LC_CTYPE, locale))
			goto out;
		free (locale);
		locale = xstrdup ("en_US.UTF-8");
		if (setlocale (LC_CTYPE, locale))
			goto out;
		free (locale);
		locale = NULL;
	}

out:
	free (line);
	setlocale (LC_CTYPE, saved_locale);
	free (saved_locale);
	if (supported)
		fclose (supported);
	return locale;
}

/* Can we take this input encoding and produce this output encoding, perhaps
 * with the help of some iconv pipes? */
static bool ATTRIBUTE_PURE compatible_encodings (const char *input,
                                                 const char *output)
{
	if (STREQ (input, output))
		return true;

	/* If the input is ASCII, recoding should be easy. Try it. */
	if (STREQ (input, "ANSI_X3.4-1968"))
		return true;

	/* If the input is UTF-8, it's either a simple recoding of whatever
	 * we want or else it probably won't work at all no matter what we
	 * do. We might as well try it for now.
	 */
	if (STREQ (input, "UTF-8"))
		return true;

	/* If the output is ASCII, this is probably because the caller
	 * explicitly asked for it, so we have little choice but to try.
	 */
	if (STREQ (output, "ANSI_X3.4-1968"))
		return true;

	return false;
}

/* Return the default groff device for the given character set. This may be
 * overridden by the user. The page's source encoding is needed to ensure
 * that the device is compatible: consider ru_RU.UTF-8, which needs ascii8
 * and a trailing iconv pipe to recode to UTF-8.
 *
 * All this encoding compatibility stuff feels like a slightly nasty hack,
 * but I haven't yet come up with a cleaner way to do it.
 */
const char *get_default_device (const char *charset_from_locale,
                                const char *source_encoding)
{
	const struct charset_entry *entry;

	if (get_groff_preconv ()) {
		/* ASCII is a special case, and the only way we can get
		 * things like bullet marks to come out right is by using
		 * the ascii device. People using such a basic locale
		 * probably don't want anything fancy anyway.
		 */
		if (charset_from_locale &&
		    STREQ (charset_from_locale, "ANSI_X3.4-1968"))
			return "ascii";
		else
			return "utf8";
	}

	if (!charset_from_locale)
		return fallback_default_device;

	for (entry = charset_table; entry->charset_from_locale; ++entry) {
		if (STREQ (entry->charset_from_locale, charset_from_locale)) {
			const char *roff_encoding = get_roff_encoding (
			        entry->default_device, source_encoding);
			if (compatible_encodings (source_encoding,
			                          roff_encoding))
				return entry->default_device;
		}
	}

	return fallback_default_device;
}

/* Is this a known *roff device name? */
bool ATTRIBUTE_PURE is_roff_device (const char *device)
{
	const struct device_entry *entry;

	for (entry = device_table; entry->roff_device; ++entry) {
		if (STREQ (entry->roff_device, device))
			return true;
	}

	return false;
}

/* Find the input encoding expected by groff. */
const char *ATTRIBUTE_PURE get_roff_encoding (const char *device,
                                              const char *source_encoding)
{
	const struct device_entry *entry;
	bool found = false;
	const char *roff_encoding = NULL;

	if (device) {
		for (entry = device_table; entry->roff_device; ++entry) {
			if (STREQ (entry->roff_device, device)) {
				found = true;
				roff_encoding = entry->roff_encoding;
				break;
			}
		}
	}

	if (!found)
		roff_encoding = fallback_roff_encoding;

	return roff_encoding ? roff_encoding : source_encoding;
}

/* Find the output encoding that this device will produce, or NULL if it
 * will simply pass through the input encoding.
 */
const char *ATTRIBUTE_PURE get_output_encoding (const char *device)
{
	const struct device_entry *entry;

	for (entry = device_table; entry->roff_device; ++entry)
		if (STREQ (entry->roff_device, device))
			return entry->output_encoding;

	return NULL;
}

/* Return the value of LESSCHARSET appropriate for this locale. */
const char *ATTRIBUTE_PURE get_less_charset (const char *charset_from_locale)
{
	if (charset_from_locale) {
		const struct less_charset_entry *entry;

		for (entry = less_charset_table; entry->charset_from_locale;
		     ++entry)
			if (STREQ (entry->charset_from_locale,
			           charset_from_locale))
				return entry->less_charset;
	}

	return fallback_less_charset;
}

/* Return the value of JLESSCHARSET appropriate for this locale. May return
 * NULL.
 */
const char *ATTRIBUTE_PURE get_jless_charset (const char *charset_from_locale)
{
	if (charset_from_locale) {
		const struct less_charset_entry *entry;

		for (entry = less_charset_table; entry->charset_from_locale;
		     ++entry)
			if (STREQ (entry->charset_from_locale,
			           charset_from_locale))
				return entry->jless_charset;
	}

	return NULL;
}