/* * encodings.c: locale and encoding handling for man * * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 * Colin Watson. * * This file is part of man-db. * * man-db is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * man-db is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with man-db; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifdef HAVE_CONFIG_H # include "config.h" #endif /* HAVE_CONFIG_H */ #include #include #include #include #include #include #include #include "attribute.h" #include "gettext.h" #include "localcharset.h" #include "xalloc.h" #include "xstrndup.h" #include "manconfig.h" #include "debug.h" #include "encodings.h" #include "pathsearch.h" /* Due to historical limitations in groff (which may be removed in the * future), there is no mechanism for a man page to specify its own * encoding. This means that each national language directory needs to carry * with it information about its encoding, and each groff device needs to * have a default encoding associated with it. Out of the box, groff * formally allows only ISO-8859-1 on input; however, patches originating * with Debian and imported by many other GNU/Linux distributions change * this somewhat. * * Eventually, groff will support proper Unicode input, and much of this * horror can go away. * * Do *not* confuse source encoding with groff encoding. 
The encoding * specified in this table is the encoding in which the source man pages in * each language directory are expected to be written. The groff encoding is * determined by the selected groff device and sometimes also by the user's * locale. * * The standard output encoding is the encoding assumed for cat pages for * each language directory. It must *not* be used to discover the actual * output encoding displayed to the user; that is determined by the locale. * TODO: it would be useful to be able to change the standard output * encoding in the configuration file. * * This table is expected to change over time, particularly as man pages * begin to move towards UTF-8. Feel free to patch this for your * distribution; send me updates for languages I've missed. * * Explicit encodings in the directory name (e.g. de_DE.UTF-8) override this * table. */ struct directory_entry { const char *lang_dir; const char *source_encoding; }; static struct directory_entry directory_table[] = { { "C", "ISO-8859-1" }, /* English */ { "POSIX", "ISO-8859-1" }, /* English */ { "da", "ISO-8859-1" }, /* Danish */ { "de", "ISO-8859-1" }, /* German */ { "en", "ISO-8859-1" }, /* English */ { "es", "ISO-8859-1" }, /* Spanish */ { "et", "ISO-8859-1" }, /* Estonian */ { "fi", "ISO-8859-1" }, /* Finnish */ { "fr", "ISO-8859-1" }, /* French */ { "ga", "ISO-8859-1" }, /* Irish */ { "gl", "ISO-8859-1" }, /* Galician */ { "id", "ISO-8859-1" }, /* Indonesian */ { "is", "ISO-8859-1" }, /* Icelandic */ { "it", "ISO-8859-1" }, /* Italian */ { "nb", "ISO-8859-1" }, /* Norwegian Bokmål */ { "nl", "ISO-8859-1" }, /* Dutch */ { "nn", "ISO-8859-1" }, /* Norwegian Nynorsk */ { "no", "ISO-8859-1" }, /* Norwegian */ { "pt", "ISO-8859-1" }, /* Portuguese */ { "sv", "ISO-8859-1" }, /* Swedish */ #ifdef MULTIBYTE_GROFF /* These languages require a patched version of groff with the * ascii8 and nippon devices. 
*/ { "be", "CP1251" }, /* Belarusian */ { "bg", "CP1251" }, /* Bulgarian */ { "cs", "ISO-8859-2" }, /* Czech */ { "el", "ISO-8859-7" }, /* Greek */ { "hr", "ISO-8859-2" }, /* Croatian */ { "hu", "ISO-8859-2" }, /* Hungarian */ { "ja", "EUC-JP" }, /* Japanese */ { "ko", "EUC-KR" }, /* Korean */ { "lt", "ISO-8859-13" }, /* Lithuanian */ { "lv", "ISO-8859-13" }, /* Latvian */ { "mk", "ISO-8859-5" }, /* Macedonian */ { "pl", "ISO-8859-2" }, /* Polish */ { "ro", "ISO-8859-2" }, /* Romanian */ { "ru", "KOI8-R" }, /* Russian */ { "sk", "ISO-8859-2" }, /* Slovak */ { "sl", "ISO-8859-2" }, /* Slovenian */ /* sr@latin must precede sr, due to top-down left-substring matching later */ { "sr@latin", "ISO-8859-2" }, /* Serbian Latin */ { "sr", "ISO-8859-5" }, /* Serbian */ { "tr", "ISO-8859-9" }, /* Turkish */ { "uk", "KOI8-U" }, /* Ukrainian */ { "vi", "TCVN5712-1" }, /* Vietnamese */ { "zh_CN", "GBK" }, /* Simplified Chinese */ { "zh_SG", "GBK" }, /* Simplified Chinese, Singapore */ { "zh_HK", "BIG5HKSCS" }, /* Traditional Chinese, Hong Kong */ { "zh_TW", "BIG5" }, /* Traditional Chinese */ #endif /* MULTIBYTE_GROFF */ { NULL, NULL } }; static const char fallback_source_encoding[] = "ISO-8859-1"; /* Unfortunately, there is no portable way to inspect iconv's internal table * of character set aliases. We copy the most interesting ones here so that * we can deal with them if they appear in directory names. Note that all * names will be converted to upper case before looking them up in this * table. */ struct charset_alias_entry { const char *alias; const char *canonical_name; }; static struct charset_alias_entry charset_alias_table[] = { /* The FHS is silly and requires numeric-only aliases that iconv * does not support. 
*/ { "88591", "ISO-8859-1" }, { "88592", "ISO-8859-2" }, { "88593", "ISO-8859-3" }, { "88594", "ISO-8859-4" }, { "88595", "ISO-8859-5" }, { "88596", "ISO-8859-6" }, { "88597", "ISO-8859-7" }, { "88598", "ISO-8859-8" }, { "88599", "ISO-8859-9" }, { "885910", "ISO-8859-10" }, { "885911", "ISO-8859-11" }, { "885913", "ISO-8859-13" }, { "885914", "ISO-8859-14" }, { "885915", "ISO-8859-15" }, { "885916", "ISO-8859-16" }, { "ASCII", "ANSI_X3.4-1968" }, { "BIG-5", "BIG5" }, { "BIG5-HKSCS", "BIG5HKSCS" }, { "EUCCN", "EUC-CN" }, { "EUCJP", "EUC-JP" }, { "EUCKR", "EUC-KR" }, { "EUCTW", "EUC-TW" }, { "GB2312", "EUC-CN" }, { "ISO8859-1", "ISO-8859-1" }, { "ISO8859-2", "ISO-8859-2" }, { "ISO8859-3", "ISO-8859-3" }, { "ISO8859-4", "ISO-8859-4" }, { "ISO8859-5", "ISO-8859-5" }, { "ISO8859-6", "ISO-8859-6" }, { "ISO8859-7", "ISO-8859-7" }, { "ISO8859-8", "ISO-8859-8" }, { "ISO8859-9", "ISO-8859-9" }, { "ISO8859-10", "ISO-8859-10" }, { "ISO8859-11", "ISO-8859-11" }, { "ISO8859-13", "ISO-8859-13" }, { "ISO8859-14", "ISO-8859-14" }, { "ISO8859-15", "ISO-8859-15" }, { "ISO8859-16", "ISO-8859-16" }, { "KOI8R", "KOI8-R" }, { "KOI8U", "KOI8-U" }, { "UJIS", "EUC-JP" }, { "US-ASCII", "ANSI_X3.4-1968" }, { "UTF8", "UTF-8" }, { NULL, NULL } }; /* The default groff terminal output device to be used is determined based * on locale_charset (), which returns the character set used by the current * locale. 
*/ struct charset_entry { const char *charset_from_locale; const char *default_device; }; static struct charset_entry charset_table[] = { { "ANSI_X3.4-1968", "ascii" }, #ifndef HEIRLOOM_NROFF { "ISO-8859-1", "latin1" }, #endif /* HEIRLOOM_NROFF */ { "UTF-8", "utf8" }, #ifndef HEIRLOOM_NROFF # ifdef MULTIBYTE_GROFF { "BIG5", "nippon" }, { "BIG5HKSCS", "nippon" }, { "EUC-CN", "nippon" }, { "EUC-JP", "nippon" }, { "EUC-TW", "nippon" }, { "GBK", "nippon" }, # else /* !MULTIBYTE_GROFF */ /* If we have a smarter version of groff, this is better dealt with * using either ascii8 (Debian multibyte patch) or preconv (as of * groff 1.20). This is a not-quite-right stopgap in case we have * neither. */ { "ISO-8859-15", "latin1" }, # endif /* MULTIBYTE_GROFF */ #endif /* HEIRLOOM_NROFF */ { NULL, NULL } }; static const char *fallback_default_device = #ifdef MULTIBYTE_GROFF "ascii8" #else /* !MULTIBYTE_GROFF */ "ascii" #endif /* MULTIBYTE_GROFF */ ; /* The encoding used for the text passed to groff is a function of the * selected groff device. Traditional devices expect ISO-8859-1 on input * (yes, even the utf8 device); devices added in the Debian multibyte patch * expect other encodings. The ascii8 device passes top-bit-set characters * straight through so is (probably ...) encoding-agnostic. If this encoding * does not match the source encoding, an iconv pipe is used (if available) * to perform recoding. */ struct device_entry { const char *roff_device; const char *roff_encoding; const char *output_encoding; }; static struct device_entry device_table[] = { /* nroff devices */ { "ascii", "ANSI_X3.4-1968", "ANSI_X3.4-1968" }, { "latin1", "ISO-8859-1", "ISO-8859-1" }, { "utf8", "ISO-8859-1", "UTF-8" }, #ifdef MULTIBYTE_GROFF { "ascii8", NULL, NULL }, { "nippon", NULL, NULL }, #endif /* MULTIBYTE_GROFF */ #ifdef HEIRLOOM_NROFF /* Not strictly accurate, but we only use this in UTF-8 locales. 
*/ { "locale", "UTF-8", "UTF-8" }, #endif /* HEIRLOOM_NROFF */ /* troff devices */ { "X75", NULL, NULL }, { "X75-12", NULL, NULL }, { "X100", NULL, NULL }, { "X100-12", NULL, NULL }, { "dvi", NULL, NULL }, { "html", NULL, NULL }, { "lbp", NULL, NULL }, { "lj4", NULL, NULL }, { "ps", NULL, NULL }, { NULL, NULL, NULL } }; static const char fallback_roff_encoding[] = "ISO-8859-1"; /* Setting less_charset to iso8859 tells the less pager that characters * between 0xA0 and 0xFF are displayable, not that its input is encoded in * ISO-8859-*. TODO: Perhaps using LESSCHARDEF would be better. * * Character set names compatible only with jless go in jless_charset. */ struct less_charset_entry { const char *charset_from_locale; const char *less_charset; const char *jless_charset; }; static struct less_charset_entry less_charset_table[] = { { "ANSI_X3.4-1968", "ascii", NULL }, { "ISO-8859-1", "iso8859", NULL }, { "UTF-8", "utf-8", NULL }, #ifdef MULTIBYTE_GROFF { "CP1251", "windows", NULL }, { "EUC-JP", "iso8859", "japanese-ujis" }, { "KOI8-R", "koi8-r", NULL }, /* close enough? */ { "KOI8-U", "koi8-r", NULL }, #endif /* MULTIBYTE_GROFF */ { NULL, NULL, NULL } }; static const char fallback_less_charset[] = "iso8859"; const char *groff_preconv = NULL; /* Is the groff "preconv" helper available? If so, return its name. * Otherwise, return NULL. */ const char *get_groff_preconv (void) { if (groff_preconv) { if (*groff_preconv) return groff_preconv; else return NULL; } if (pathsearch_executable ("gpreconv")) groff_preconv = "gpreconv"; else if (pathsearch_executable ("preconv")) groff_preconv = "preconv"; else groff_preconv = ""; if (*groff_preconv) return groff_preconv; else return NULL; } /* Return the assumed encoding of the source man page, based on the * directory in which it was found. The caller should attempt to recode from * this to whatever encoding is expected by groff. * * The caller should free the returned string when it is finished with it. 
*/
char * ATTRIBUTE_MALLOC get_page_encoding (const char *lang)
{
	const char *dot;

	if (!lang || !*lang) {
		/* Guess based on the locale. */
		lang = setlocale (LC_MESSAGES, NULL);
		if (!lang)
			return xstrdup (fallback_source_encoding);
	}

	dot = strchr (lang, '.');
	if (dot) {
		/* The FHS has the worst specification of what's supposed to
		 * go after the dot here that I've ever seen. To quote from
		 * version 2.1:
		 *
		 *   "It is recommended that this be a numeric representation
		 *   if possible (ISO standards, especially), not include
		 *   additional punctuation symbols, and that any letters be
		 *   in lowercase."
		 *
		 * Any sane standard would use directory names like
		 * de_DE.ISO-8859-1; the examples in the FHS recommend
		 * de_DE.88591 instead. Considering that there is no other
		 * conceivable use for encodings in directory names other
		 * than to pass them to iconv or similar, this is quite
		 * startlingly useless.
		 *
		 * While we now support this thanks to
		 * get_canonical_charset_name, the FHS specification is
		 * obviously wrong and I plan to petition to have it
		 * changed. I recommend ignoring this part of the FHS.
		 */
		const char *suffix = dot + 1;
		/* The encoding ends at a ",version" or "@modifier" part. */
		size_t suffix_len = strcspn (suffix, ",@");

		if (suffix_len) {
			char *dir_encoding = xstrndup (suffix, suffix_len);
			/* get_canonical_charset_name may return its own
			 * argument, so take the copy before freeing it.
			 */
			char *canonical_dir_encoding =
				xstrdup (get_canonical_charset_name
					 (dir_encoding));
			free (dir_encoding);
			return canonical_dir_encoding;
		}

		/* A dot followed by no encoding at all (e.g. "de_DE." or
		 * "de_DE.@euro") tells us nothing, and an empty string is
		 * not a usable encoding name. Fall through and use the
		 * per-language default instead.
		 */
	}

	/* No explicit encoding: use the per-language table, falling back to
	 * ISO-8859-1. get_source_encoding performs exactly that lookup.
	 */
	return xstrdup (get_source_encoding (lang));
}

/* Return the canonical encoding for source man pages in the specified
 * language. This ignores any encoding specification in the language
 * directory name.
The source encoding should be used as a basis for * determining the correct roff device to use: that is, the caller should * behave as if it is recoding from the page encoding to the source encoding * first, although in practice it should recode directly from the page * encoding to the roff encoding. * * You should normally only call this function if the page encoding is * UTF-8, in which case older versions of groff that lack preconv need to * have the page recoded to some legacy encoding). If the page is in a * legacy encoding, then attempting to recode from that to some other legacy * encoding will probably do more harm than good. * * Here are a few concrete examples of why these distinctions are important: * * /usr/share/man/en_GB.UTF-8, locale C * page encoding = UTF-8 * source encoding = ISO-8859-1 * roff encoding = ISO-8859-1 * output encoding = UTF-8 * UTF-8 -> iconv -> ISO-8859-1 -> groff -Tascii -> ANSI_X3.4-1968 * * /usr/share/man/pl_PL.UTF-8, locale pl_PL.UTF-8 * page encoding = UTF-8 * source encoding = ISO-8859-2 * roff encoding = ISO-8859-2 * output encoding = ISO-8859-2 * UTF-8 -> iconv -> ISO-8859-2 -> groff -Tascii8 * -> ISO-8859-2 -> iconv -> UTF-8 * * /usr/share/man/ja_JP.EUC-JP, locale ja_JP.UTF-8 * page encoding = EUC-JP * source encoding = EUC-JP * roff encoding = UTF-8 * output encoding = UTF-8 * EUC-JP -> iconv -> UTF-8 -> groff -Tutf8 -> UTF-8 * * /usr/share/man/en_GB.ISO-8859-15, locale en_GB.UTF-8 * page encoding = ISO-8859-15 * source encoding = ISO-8859-15 * roff encoding = ISO-8859-15 * output encoding = ISO-8859-15 * ISO-8859-15 -> groff -Tascii8 -> ISO-8859-15 -> iconv -> UTF-8 */ const char *get_source_encoding (const char *lang) { const struct directory_entry *entry; if (!lang || !*lang) { /* Guess based on the locale. 
*/ lang = setlocale (LC_MESSAGES, NULL); if (!lang) return fallback_source_encoding; } for (entry = directory_table; entry->lang_dir; ++entry) if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir))) return entry->source_encoding; return fallback_source_encoding; } const char * ATTRIBUTE_NONNULL ((1)) ATTRIBUTE_RETURNS_NONNULL get_canonical_charset_name (const char *charset) { const struct charset_alias_entry *entry; char *charset_upper = xstrdup (charset); char *p; for (p = charset_upper; *p; ++p) *p = CTYPE (toupper, *p); for (entry = charset_alias_table; entry->alias; ++entry) if (STREQ (entry->alias, charset_upper)) { free (charset_upper); return entry->canonical_name; } free (charset_upper); return charset; } /* Return the current locale's character set. */ const char * ATTRIBUTE_RETURNS_NONNULL get_locale_charset (void) { const char *charset; char *saved_locale; /* We need to modify LC_CTYPE temporarily in order to look at the * codeset, so save it first. */ saved_locale = setlocale (LC_CTYPE, NULL); if (saved_locale) saved_locale = xstrdup (saved_locale); setlocale (LC_CTYPE, ""); charset = locale_charset (); /* Restore LC_CTYPE to its value on entry to this function. */ setlocale (LC_CTYPE, saved_locale); free (saved_locale); if (!charset || !*charset) charset = "ANSI_X3.4-1968"; return get_canonical_charset_name (charset); } /* Find a locale with this character set. This is a non-portable operation, * but required to make col(1) work correctly with -E. If no locale can be * found, or if none needs to be set, return NULL. * * The caller should free the returned string when it is finished with it. 
*/ char *find_charset_locale (const char *charset) { const char *canonical_charset = get_canonical_charset_name (charset); char *saved_locale; const char supported_path[] = "/usr/share/i18n/SUPPORTED"; FILE *supported = NULL; char *line = NULL; size_t n = 0; char *locale = NULL; if (STREQ (charset, get_locale_charset ())) return NULL; saved_locale = setlocale (LC_CTYPE, NULL); if (saved_locale) saved_locale = xstrdup (saved_locale); supported = fopen (supported_path, "r"); while (supported && getline (&line, &n, supported) >= 0) { const char *space = strchr (line, ' '); if (space) { char *encoding = xstrdup (space + 1); char *newline = strchr (encoding, '\n'); if (newline) *newline = 0; if (STREQ (canonical_charset, get_canonical_charset_name (encoding))) { locale = xstrndup (line, space - line); /* Is this locale actually installed? */ if (setlocale (LC_CTYPE, locale)) { free (encoding); goto out; } else { free (locale); locale = NULL; } } free (encoding); } free (line); line = NULL; } if (strlen (canonical_charset) >= 5 && STRNEQ (canonical_charset, "UTF-8", 5)) { locale = xstrdup ("C.UTF-8"); if (setlocale (LC_CTYPE, locale)) goto out; free (locale); locale = xstrdup ("en_US.UTF-8"); if (setlocale (LC_CTYPE, locale)) goto out; free (locale); locale = NULL; } out: free (line); setlocale (LC_CTYPE, saved_locale); free (saved_locale); if (supported) fclose (supported); return locale; } /* Can we take this input encoding and produce this output encoding, perhaps * with the help of some iconv pipes? */ static bool ATTRIBUTE_PURE compatible_encodings (const char *input, const char *output) { if (STREQ (input, output)) return true; /* If the input is ASCII, recoding should be easy. Try it. */ if (STREQ (input, "ANSI_X3.4-1968")) return true; /* If the input is UTF-8, it's either a simple recoding of whatever * we want or else it probably won't work at all no matter what we * do. We might as well try it for now. 
*/ if (STREQ (input, "UTF-8")) return true; /* If the output is ASCII, this is probably because the caller * explicitly asked for it, so we have little choice but to try. */ if (STREQ (output, "ANSI_X3.4-1968")) return true; #ifdef MULTIBYTE_GROFF /* Special case for some CJK UTF-8 locales, which take UTF-8 input * recoded from EUC-JP (etc.) and produce UTF-8 output. This is * rather filthy. */ if ((STREQ (input, "BIG5") || STREQ (input, "BIG5HKSCS") || STREQ (input, "EUC-JP") || STREQ (input, "EUC-CN") || STREQ (input, "GBK") || STREQ (input, "EUC-KR") || STREQ (input, "EUC-TW")) && STREQ (output, "UTF-8")) return true; #endif /* MULTIBYTE_GROFF */ return false; } /* Return the default groff device for the given character set. This may be * overridden by the user. The page's source encoding is needed to ensure * that the device is compatible: consider ru_RU.UTF-8, which needs ascii8 * and a trailing iconv pipe to recode to UTF-8. * * All this encoding compatibility stuff feels like a slightly nasty hack, * but I haven't yet come up with a cleaner way to do it. */ const char *get_default_device (const char *charset_from_locale, const char *source_encoding) { const struct charset_entry *entry; if (get_groff_preconv ()) { /* ASCII is a special case, and the only way we can get * things like bullet marks to come out right is by using * the ascii device. People using such a basic locale * probably don't want anything fancy anyway. 
*/ if (charset_from_locale && STREQ (charset_from_locale, "ANSI_X3.4-1968")) return "ascii"; else return "utf8"; } if (!charset_from_locale) return fallback_default_device; for (entry = charset_table; entry->charset_from_locale; ++entry) { if (STREQ (entry->charset_from_locale, charset_from_locale)) { const char *roff_encoding = get_roff_encoding (entry->default_device, source_encoding); if (compatible_encodings (source_encoding, roff_encoding)) return entry->default_device; } } return fallback_default_device; } /* Is this a known *roff device name? */ bool ATTRIBUTE_PURE is_roff_device (const char *device) { const struct device_entry *entry; for (entry = device_table; entry->roff_device; ++entry) { if (STREQ (entry->roff_device, device)) return true; } return false; } /* Find the input encoding expected by groff, and set the LESSCHARSET * environment variable appropriately. */ const char *get_roff_encoding (const char *device, const char *source_encoding) { const struct device_entry *entry; bool found = false; const char *roff_encoding = NULL; if (device) { for (entry = device_table; entry->roff_device; ++entry) { if (STREQ (entry->roff_device, device)) { found = true; roff_encoding = entry->roff_encoding; break; } } } if (!found) roff_encoding = fallback_roff_encoding; #ifdef MULTIBYTE_GROFF /* An ugly special case is needed here. The utf8 device normally * takes ISO-8859-1 input. However, with the multibyte patch, when * recoding from CJK character sets it takes UTF-8 input instead. * This is evil, but there's not much that can be done about it * apart from waiting for groff 2.0. 
*/ if (device && STREQ (device, "utf8") && !get_groff_preconv () && STREQ (get_locale_charset (), "UTF-8")) { const char *ctype = setlocale (LC_CTYPE, NULL); if (STRNEQ (ctype, "ja_JP", 5) || STRNEQ (ctype, "ko_KR", 5) || STRNEQ (ctype, "zh_CN", 5) || STRNEQ (ctype, "zh_HK", 5) || STRNEQ (ctype, "zh_SG", 5) || STRNEQ (ctype, "zh_TW", 5)) roff_encoding = "UTF-8"; } #endif /* MULTIBYTE_GROFF */ return roff_encoding ? roff_encoding : source_encoding; } /* Find the output encoding that this device will produce, or NULL if it * will simply pass through the input encoding. */ const char * ATTRIBUTE_PURE get_output_encoding (const char *device) { const struct device_entry *entry; for (entry = device_table; entry->roff_device; ++entry) if (STREQ (entry->roff_device, device)) return entry->output_encoding; return NULL; } /* Return the value of LESSCHARSET appropriate for this locale. */ const char * ATTRIBUTE_PURE get_less_charset (const char *charset_from_locale) { if (charset_from_locale) { const struct less_charset_entry *entry; for (entry = less_charset_table; entry->charset_from_locale; ++entry) if (STREQ (entry->charset_from_locale, charset_from_locale)) return entry->less_charset; } return fallback_less_charset; } /* Return the value of JLESSCHARSET appropriate for this locale. May return * NULL. */ const char * ATTRIBUTE_PURE get_jless_charset (const char *charset_from_locale) { if (charset_from_locale) { const struct less_charset_entry *entry; for (entry = less_charset_table; entry->charset_from_locale; ++entry) if (STREQ (entry->charset_from_locale, charset_from_locale)) return entry->jless_charset; } return NULL; }