1
0
Fork 0
man-db/lib/encodings.c
Daniel Baumann 1fa764a8d3
Adding upstream version 2.13.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
2025-06-21 08:13:55 +02:00

706 lines
23 KiB
C

/*
* encodings.c: locale and encoding handling for man
*
* Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
* Colin Watson.
*
* This file is part of man-db.
*
* man-db is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* man-db is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with man-db; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif /* HAVE_CONFIG_H */
#include <ctype.h>
#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "attribute.h"
#include "gettext.h"
#include "localcharset.h"
#include "xalloc.h"
#include "xstrndup.h"
#include "manconfig.h"
#include "debug.h"
#include "encodings.h"
#include "pathsearch.h"
/* Due to historical limitations in groff (which may be removed in the
* future), there is no mechanism for a man page to specify its own
* encoding. This means that each national language directory needs to carry
* with it information about its encoding, and each groff device needs to
* have a default encoding associated with it. Out of the box, groff
* formally allows only ISO-8859-1 on input; however, patches originating
* with Debian and imported by many other GNU/Linux distributions change
* this somewhat.
*
* Eventually, groff will support proper Unicode input, and much of this
* horror can go away.
*
* Do *not* confuse source encoding with groff encoding. The encoding
* specified in this table is the encoding in which the source man pages in
* each language directory are expected to be written. The groff encoding is
* determined by the selected groff device and sometimes also by the user's
* locale.
*
* The standard output encoding is the encoding assumed for cat pages for
* each language directory. It must *not* be used to discover the actual
* output encoding displayed to the user; that is determined by the locale.
* TODO: it would be useful to be able to change the standard output
* encoding in the configuration file.
*
* This table is expected to change over time, particularly as man pages
* begin to move towards UTF-8. Feel free to patch this for your
* distribution; send me updates for languages I've missed.
*
* Explicit encodings in the directory name (e.g. de_DE.UTF-8) override this
* table.
*/
/* One row of the language-directory table. */
struct directory_entry {
	/* Language directory name; lookups match it as a leading substring
	 * of the requested language.
	 */
	const char *lang_dir;
	/* Encoding that source pages under that directory are expected to
	 * be written in.
	 */
	const char *source_encoding;
};

/* Read-only table, terminated by a {NULL, NULL} row. Rows are tried from
 * the top and the first match wins, so more specific names must come
 * before their prefixes.
 */
static const struct directory_entry directory_table[] = {
	{"C",        "ISO-8859-1" }, /* English */
	{"POSIX",    "ISO-8859-1" }, /* English */
	{"be",       "CP1251"     }, /* Belarusian */
	{"bg",       "CP1251"     }, /* Bulgarian */
	{"cs",       "ISO-8859-2" }, /* Czech */
	{"da",       "ISO-8859-1" }, /* Danish */
	{"de",       "ISO-8859-1" }, /* German */
	{"el",       "ISO-8859-7" }, /* Greek */
	{"en",       "ISO-8859-1" }, /* English */
	{"es",       "ISO-8859-1" }, /* Spanish */
	{"et",       "ISO-8859-1" }, /* Estonian */
	{"fi",       "ISO-8859-1" }, /* Finnish */
	{"fr",       "ISO-8859-1" }, /* French */
	{"ga",       "ISO-8859-1" }, /* Irish */
	{"gl",       "ISO-8859-1" }, /* Galician */
	{"hr",       "ISO-8859-2" }, /* Croatian */
	{"hu",       "ISO-8859-2" }, /* Hungarian */
	{"id",       "ISO-8859-1" }, /* Indonesian */
	{"is",       "ISO-8859-1" }, /* Icelandic */
	{"it",       "ISO-8859-1" }, /* Italian */
	{"ja",       "EUC-JP"     }, /* Japanese */
	{"ko",       "EUC-KR"     }, /* Korean */
	{"lt",       "ISO-8859-13"}, /* Lithuanian */
	{"lv",       "ISO-8859-13"}, /* Latvian */
	{"mk",       "ISO-8859-5" }, /* Macedonian */
	{"nb",       "ISO-8859-1" }, /* Norwegian Bokmål */
	{"nl",       "ISO-8859-1" }, /* Dutch */
	{"nn",       "ISO-8859-1" }, /* Norwegian Nynorsk */
	{"no",       "ISO-8859-1" }, /* Norwegian */
	{"pl",       "ISO-8859-2" }, /* Polish */
	{"pt",       "ISO-8859-1" }, /* Portuguese */
	{"ro",       "ISO-8859-2" }, /* Romanian */
	{"ru",       "KOI8-R"     }, /* Russian */
	{"sk",       "ISO-8859-2" }, /* Slovak */
	{"sl",       "ISO-8859-2" }, /* Slovenian */
	/* sr@latin must precede sr, due to top-down left-substring matching
	 * later.
	 */
	{"sr@latin", "ISO-8859-2" }, /* Serbian Latin */
	{"sr",       "ISO-8859-5" }, /* Serbian */
	{"sv",       "ISO-8859-1" }, /* Swedish */
	{"tr",       "ISO-8859-9" }, /* Turkish */
	{"uk",       "KOI8-U"     }, /* Ukrainian */
	{"vi",       "TCVN5712-1" }, /* Vietnamese */
	{"zh_CN",    "GBK"        }, /* Simplified Chinese */
	{"zh_SG",    "GBK"        }, /* Simplified Chinese, Singapore */
	{"zh_HK",    "BIG5HKSCS"  }, /* Traditional Chinese, Hong Kong */
	{"zh_TW",    "BIG5"       }, /* Traditional Chinese */
	{NULL,       NULL         }
};

/* Source encoding assumed when no row of directory_table matches. */
static const char fallback_source_encoding[] = "ISO-8859-1";
/* Unfortunately, there is no portable way to inspect iconv's internal table
* of character set aliases. We copy the most interesting ones here so that
* we can deal with them if they appear in directory names. Note that all
* names will be converted to upper case before looking them up in this
* table.
*/
/* One row of the charset alias table. */
struct charset_alias_entry {
	/* Alternative spelling, stored in upper case because lookups
	 * upper-case the candidate name before comparing.
	 */
	const char *alias;
	/* Name that the alias is mapped to. */
	const char *canonical_name;
};

/* Read-only table, terminated by a {NULL, NULL} row. */
static const struct charset_alias_entry charset_alias_table[] = {
	/* The FHS is silly and requires numeric-only aliases that iconv
	 * does not support.
	 */
	{"88591",      "ISO-8859-1"    },
	{"88592",      "ISO-8859-2"    },
	{"88593",      "ISO-8859-3"    },
	{"88594",      "ISO-8859-4"    },
	{"88595",      "ISO-8859-5"    },
	{"88596",      "ISO-8859-6"    },
	{"88597",      "ISO-8859-7"    },
	{"88598",      "ISO-8859-8"    },
	{"88599",      "ISO-8859-9"    },
	{"885910",     "ISO-8859-10"   },
	{"885911",     "ISO-8859-11"   },
	{"885913",     "ISO-8859-13"   },
	{"885914",     "ISO-8859-14"   },
	{"885915",     "ISO-8859-15"   },
	{"885916",     "ISO-8859-16"   },

	{"ASCII",      "ANSI_X3.4-1968"},
	{"BIG-5",      "BIG5"          },
	{"BIG5-HKSCS", "BIG5HKSCS"     },
	{"EUCCN",      "EUC-CN"        },
	{"EUCJP",      "EUC-JP"        },
	{"EUCKR",      "EUC-KR"        },
	{"EUCTW",      "EUC-TW"        },
	{"GB2312",     "EUC-CN"        },
	{"ISO8859-1",  "ISO-8859-1"    },
	{"ISO8859-2",  "ISO-8859-2"    },
	{"ISO8859-3",  "ISO-8859-3"    },
	{"ISO8859-4",  "ISO-8859-4"    },
	{"ISO8859-5",  "ISO-8859-5"    },
	{"ISO8859-6",  "ISO-8859-6"    },
	{"ISO8859-7",  "ISO-8859-7"    },
	{"ISO8859-8",  "ISO-8859-8"    },
	{"ISO8859-9",  "ISO-8859-9"    },
	{"ISO8859-10", "ISO-8859-10"   },
	{"ISO8859-11", "ISO-8859-11"   },
	{"ISO8859-13", "ISO-8859-13"   },
	{"ISO8859-14", "ISO-8859-14"   },
	{"ISO8859-15", "ISO-8859-15"   },
	{"ISO8859-16", "ISO-8859-16"   },
	{"KOI8R",      "KOI8-R"        },
	{"KOI8U",      "KOI8-U"        },
	{"UJIS",       "EUC-JP"        },
	{"US-ASCII",   "ANSI_X3.4-1968"},
	{"UTF8",       "UTF-8"         },
	{NULL,         NULL            }
};
/* The default groff terminal output device to be used is determined based
* on locale_charset (), which returns the character set used by the current
* locale.
*/
/* One row of the locale-charset to groff-device table. */
struct charset_entry {
	/* Canonical charset name of the locale. */
	const char *charset_from_locale;
	/* groff terminal device to use by default for that charset. */
	const char *default_device;
};

/* Read-only table, terminated by a {NULL, NULL} row. */
static const struct charset_entry charset_table[] = {
	{"ANSI_X3.4-1968", "ascii" },
#ifndef HEIRLOOM_NROFF
	{"ISO-8859-1",     "latin1"},
#endif /* HEIRLOOM_NROFF */
	{"UTF-8",          "utf8"  },
	{NULL,             NULL    }
};

/* Device used when the locale charset is unknown or no table row yields a
 * compatible device. Declared as an array like the other fallbacks in this
 * file.
 */
static const char fallback_default_device[] = "ascii";
/* The encoding used for the text passed to groff is a function of the
* selected groff device. Traditional devices expect ISO-8859-1 on input
* (yes, even the utf8 device); devices added in the Debian multibyte patch
* expect other encodings. The ascii8 device passes top-bit-set characters
* straight through so is (probably ...) encoding-agnostic. If this encoding
* does not match the source encoding, an iconv pipe is used (if available)
* to perform recoding.
*/
/* One row of the groff device table. */
struct device_entry {
	/* Device name as passed to groff. */
	const char *roff_device;
	/* Encoding the device expects on input; NULL makes
	 * get_roff_encoding () fall back to the page's source encoding.
	 */
	const char *roff_encoding;
	/* Encoding the device produces; NULL is returned unchanged by
	 * get_output_encoding ().
	 */
	const char *output_encoding;
};

/* Read-only table, terminated by a {NULL, NULL, NULL} row. */
static const struct device_entry device_table[] = {
	/* nroff devices */
	{"ascii",   "ANSI_X3.4-1968", "ANSI_X3.4-1968"},
	{"latin1",  "ISO-8859-1",     "ISO-8859-1"    },
	{"utf8",    "ISO-8859-1",     "UTF-8"         },
#ifdef HEIRLOOM_NROFF
	/* Not strictly accurate, but we only use this in UTF-8 locales. */
	{"locale",  "UTF-8",          "UTF-8"         },
#endif /* HEIRLOOM_NROFF */

	/* troff devices */
	{"X75",     NULL,             NULL            },
	{"X75-12",  NULL,             NULL            },
	{"X100",    NULL,             NULL            },
	{"X100-12", NULL,             NULL            },
	{"dvi",     NULL,             NULL            },
	{"html",    NULL,             NULL            },
	{"lbp",     NULL,             NULL            },
	{"lj4",     NULL,             NULL            },
	{"ps",      NULL,             NULL            },
	{NULL,      NULL,             NULL            }
};

/* Input encoding assumed for devices that are not listed in device_table. */
static const char fallback_roff_encoding[] = "ISO-8859-1";
/* Setting less_charset to iso8859 tells the less pager that characters
* between 0xA0 and 0xFF are displayable, not that its input is encoded in
* ISO-8859-*. TODO: Perhaps using LESSCHARDEF would be better.
*
* Character set names compatible only with jless go in jless_charset.
*/
/* One row of the locale-charset to pager-charset table. */
struct less_charset_entry {
	/* Canonical charset name of the locale. */
	const char *charset_from_locale;
	/* Value handed back by get_less_charset (). */
	const char *less_charset;
	/* Value handed back by get_jless_charset (); NULL when there is no
	 * jless-specific name.
	 */
	const char *jless_charset;
};

/* Read-only table, terminated by a {NULL, NULL, NULL} row. */
static const struct less_charset_entry less_charset_table[] = {
	{"ANSI_X3.4-1968", "ascii",   NULL           },
	{"CP1251",         "windows", NULL           },
	{"EUC-JP",         "iso8859", "japanese-ujis"},
	{"ISO-8859-1",     "iso8859", NULL           },
	{"KOI8-R",         "koi8-r",  NULL           },
	/* close enough? */
	{"KOI8-U",         "koi8-r",  NULL           },
	{"UTF-8",          "utf-8",   NULL           },
	{NULL,             NULL,      NULL           }
};

/* less charset used when the locale charset is missing or not listed. */
static const char fallback_less_charset[] = "iso8859";
/* Cached probe result: NULL means the probe has not run yet, the empty
 * string means it ran and found nothing, anything else is the helper name.
 */
static const char *groff_preconv = NULL;

/* Return the name of the groff "preconv" helper, or NULL if there is none.
 * "gpreconv" is tried before "preconv" via pathsearch_executable (). The
 * search runs on the first call only; later calls reuse the cached answer.
 */
const char *get_groff_preconv (void)
{
	if (!groff_preconv) {
		if (pathsearch_executable ("gpreconv"))
			groff_preconv = "gpreconv";
		else if (pathsearch_executable ("preconv"))
			groff_preconv = "preconv";
		else
			groff_preconv = "";
	}

	/* The empty-string marker is internal; callers see NULL. */
	return *groff_preconv ? groff_preconv : NULL;
}
/* Work out which encoding a source man page is assumed to be in, given the
 * language directory it came from. An encoding spelled out after a dot in
 * the directory name wins; otherwise directory_table is consulted, and
 * failing that the fallback source encoding is used. A NULL or empty lang
 * is replaced by the current LC_MESSAGES locale name.
 *
 * The result is freshly allocated; the caller must free it.
 */
char *ATTRIBUTE_MALLOC get_page_encoding (const char *lang)
{
	const struct directory_entry *row;
	const char *encoding_start;

	if (!lang || !*lang) {
		/* Nothing given: guess from the locale instead. */
		lang = setlocale (LC_MESSAGES, NULL);
		if (!lang)
			return xstrdup (fallback_source_encoding);
	}

	encoding_start = strchr (lang, '.');
	if (encoding_start) {
		/* The FHS (version 2.1) recommends that the part after the
		 * dot be "a numeric representation if possible (ISO
		 * standards, especially), not include additional
		 * punctuation symbols, and that any letters be in
		 * lowercase", giving names such as de_DE.88591 rather than
		 * the far more useful de_DE.ISO-8859-1.
		 * get_canonical_charset_name maps those spellings to names
		 * that can actually be handed to iconv.
		 */
		size_t encoding_len;
		char *raw_name;
		char *result;

		++encoding_start;
		/* The encoding ends at a ',' or '@' or at end of string. */
		encoding_len = strcspn (encoding_start, ",@");
		raw_name = xstrndup (encoding_start, encoding_len);
		/* Copy before freeing raw_name: when no alias matches, the
		 * canonical name is raw_name itself.
		 */
		result = xstrdup (get_canonical_charset_name (raw_name));
		free (raw_name);
		return result;
	}

	row = directory_table;
	while (row->lang_dir) {
		if (STRNEQ (row->lang_dir, lang, strlen (row->lang_dir)))
			return xstrdup (row->source_encoding);
		++row;
	}

	return xstrdup (fallback_source_encoding);
}
/* Return the canonical encoding for source man pages in the specified
* language. This ignores any encoding specification in the language
* directory name. The source encoding should be used as a basis for
* determining the correct roff device to use: that is, the caller should
* behave as if it is recoding from the page encoding to the source encoding
* first, although in practice it should recode directly from the page
* encoding to the roff encoding.
*
* You should normally only call this function if the page encoding is
* UTF-8, in which case older versions of groff that lack preconv need to
* have the page recoded to some legacy encoding. If the page is in a
* legacy encoding, then attempting to recode from that to some other legacy
* encoding will probably do more harm than good.
*
* Here are a few concrete examples of why these distinctions are important:
*
* /usr/share/man/en_GB.UTF-8, locale C
* page encoding = UTF-8
* source encoding = ISO-8859-1
* roff encoding = ISO-8859-1
* output encoding = UTF-8
* UTF-8 -> iconv -> ISO-8859-1 -> groff -Tascii -> ANSI_X3.4-1968
*
* /usr/share/man/pl_PL.UTF-8, locale pl_PL.UTF-8
* page encoding = UTF-8
* source encoding = ISO-8859-2
* roff encoding = ISO-8859-2
* output encoding = ISO-8859-2
* UTF-8 -> iconv -> ISO-8859-2 -> groff -Tascii8
* -> ISO-8859-2 -> iconv -> UTF-8
*
* /usr/share/man/ja_JP.EUC-JP, locale ja_JP.UTF-8
* page encoding = EUC-JP
* source encoding = EUC-JP
* roff encoding = UTF-8
* output encoding = UTF-8
* EUC-JP -> iconv -> UTF-8 -> groff -Tutf8 -> UTF-8
*
* /usr/share/man/en_GB.ISO-8859-15, locale en_GB.UTF-8
* page encoding = ISO-8859-15
* source encoding = ISO-8859-15
* roff encoding = ISO-8859-15
* output encoding = ISO-8859-15
* ISO-8859-15 -> groff -Tascii8 -> ISO-8859-15 -> iconv -> UTF-8
*/
const char *get_source_encoding (const char *lang)
{
	const struct directory_entry *row;

	if (!lang || !*lang) {
		/* No language given: fall back to the LC_MESSAGES locale. */
		lang = setlocale (LC_MESSAGES, NULL);
		if (!lang)
			return fallback_source_encoding;
	}

	/* First row whose lang_dir is a leading substring of lang wins. */
	row = directory_table;
	while (row->lang_dir) {
		if (STRNEQ (row->lang_dir, lang, strlen (row->lang_dir)))
			return row->source_encoding;
		++row;
	}

	return fallback_source_encoding;
}
/* Map a charset name to its canonical spelling. The name is upper-cased
 * and looked up in charset_alias_table; on a hit the table's canonical name
 * is returned, otherwise the caller's own pointer is handed back unchanged.
 * The result must not be freed, and in the no-match case it is only valid
 * for as long as charset is.
 */
const char *ATTRIBUTE_NONNULL ((1)) ATTRIBUTE_RETURNS_NONNULL
get_canonical_charset_name (const char *charset)
{
	const struct charset_alias_entry *alias_row;
	const char *canonical = charset;
	char *upper = xstrdup (charset);
	char *cursor = upper;

	/* Work on an upper-cased private copy; the input is left alone. */
	while (*cursor) {
		*cursor = CTYPE (toupper, *cursor);
		++cursor;
	}

	for (alias_row = charset_alias_table; alias_row->alias; ++alias_row) {
		if (STREQ (alias_row->alias, upper)) {
			canonical = alias_row->canonical_name;
			break;
		}
	}

	free (upper);
	return canonical;
}
/* Return the canonical name of the character set of the user's locale.
 * LC_CTYPE is switched to the environment's locale for the duration of the
 * query and put back afterwards. If no charset can be determined,
 * "ANSI_X3.4-1968" is used.
 */
const char *ATTRIBUTE_RETURNS_NONNULL get_locale_charset (void)
{
	const char *codeset;
	/* Remember LC_CTYPE as it was on entry; setlocale's own buffer may
	 * be overwritten by the calls below, hence the copy.
	 */
	char *ctype_on_entry = setlocale (LC_CTYPE, NULL);

	if (ctype_on_entry)
		ctype_on_entry = xstrdup (ctype_on_entry);

	setlocale (LC_CTYPE, "");
	codeset = locale_charset ();

	/* Put LC_CTYPE back the way we found it. */
	setlocale (LC_CTYPE, ctype_on_entry);
	free (ctype_on_entry);

	return get_canonical_charset_name (
		(codeset && *codeset) ? codeset : "ANSI_X3.4-1968");
}
/* Find a locale with this character set. This is a non-portable operation,
 * but required to make col(1) work correctly with -E. If no locale can be
 * found, or if none needs to be set because the current locale already uses
 * this character set, return NULL.
 *
 * Candidates are read from /usr/share/i18n/SUPPORTED (lines of the form
 * "<locale> <charset>"); the first one that setlocale () accepts wins. For
 * UTF-8, C.UTF-8 and en_US.UTF-8 are tried as a last resort. LC_CTYPE is
 * restored before returning.
 *
 * The caller should free the returned string when it is finished with it.
 */
char *find_charset_locale (const char *charset)
{
	const char *canonical_charset = get_canonical_charset_name (charset);
	char *saved_locale;
	const char supported_path[] = "/usr/share/i18n/SUPPORTED";
	FILE *supported = NULL;
	char *line = NULL;
	size_t n = 0;
	char *locale = NULL;

	/* get_locale_charset () returns a canonical name, so compare it
	 * with the canonical form of the request; otherwise an alias such
	 * as "UTF8" would never be recognised as the current charset.
	 */
	if (STREQ (canonical_charset, get_locale_charset ()))
		return NULL;

	/* Probing below changes LC_CTYPE; remember what to restore. */
	saved_locale = setlocale (LC_CTYPE, NULL);
	if (saved_locale)
		saved_locale = xstrdup (saved_locale);

	supported = fopen (supported_path, "r");
	/* getline () grows and reuses the same buffer across iterations; it
	 * is released once at "out".
	 */
	while (supported && getline (&line, &n, supported) >= 0) {
		const char *space = strchr (line, ' ');
		char *encoding;
		char *newline;
		bool matches;

		if (!space)
			continue;

		encoding = xstrdup (space + 1);
		newline = strchr (encoding, '\n');
		if (newline)
			*newline = 0;
		/* Evaluate before freeing: the canonical name may point
		 * into encoding.
		 */
		matches = STREQ (canonical_charset,
				 get_canonical_charset_name (encoding));
		free (encoding);
		if (!matches)
			continue;

		locale = xstrndup (line, space - line);
		/* Is this locale actually installed? */
		if (setlocale (LC_CTYPE, locale))
			goto out;
		free (locale);
		locale = NULL;
	}

	if (strlen (canonical_charset) >= 5 &&
	    STRNEQ (canonical_charset, "UTF-8", 5)) {
		static const char *const utf8_candidates[] = {
			"C.UTF-8",
			"en_US.UTF-8"
		};
		size_t i;

		for (i = 0;
		     i < sizeof utf8_candidates / sizeof utf8_candidates[0];
		     ++i) {
			if (setlocale (LC_CTYPE, utf8_candidates[i])) {
				locale = xstrdup (utf8_candidates[i]);
				goto out;
			}
		}
	}

out:
	free (line);
	setlocale (LC_CTYPE, saved_locale);
	free (saved_locale);
	if (supported)
		fclose (supported);
	return locale;
}
/* Decide whether text in the input encoding can plausibly be turned into
 * the output encoding, possibly via iconv pipes. The answer is yes when:
 *   - the two encodings are identical;
 *   - the input is ASCII, which should be easy to recode;
 *   - the input is UTF-8: either it is a simple recoding of what we want or
 *     nothing will work anyway, so we may as well try;
 *   - the output is ASCII, presumably because the caller explicitly asked
 *     for it, leaving little choice but to try.
 */
static bool ATTRIBUTE_PURE compatible_encodings (const char *input,
						 const char *output)
{
	return STREQ (input, output) ||
	       STREQ (input, "ANSI_X3.4-1968") ||
	       STREQ (input, "UTF-8") ||
	       STREQ (output, "ANSI_X3.4-1968");
}
/* Choose the groff device to use by default for a locale character set;
 * the user may override the choice. The page's source encoding takes part
 * in the decision so that only a device whose input encoding is compatible
 * with the page is picked: ru_RU.UTF-8, for example, needs ascii8 plus a
 * trailing iconv pipe to recode to UTF-8.
 *
 * The encoding compatibility logic is admittedly a bit of a hack; no
 * cleaner approach has turned up yet.
 */
const char *get_default_device (const char *charset_from_locale,
				const char *source_encoding)
{
	const struct charset_entry *row;

	if (get_groff_preconv ()) {
		/* With preconv available everything goes through utf8,
		 * except ASCII locales: the ascii device is the only way to
		 * get things like bullet marks right there, and users of
		 * such a basic locale probably want nothing fancy anyway.
		 */
		bool ascii_locale =
			charset_from_locale &&
			STREQ (charset_from_locale, "ANSI_X3.4-1968");

		return ascii_locale ? "ascii" : "utf8";
	}

	if (!charset_from_locale)
		return fallback_default_device;

	for (row = charset_table; row->charset_from_locale; ++row) {
		const char *roff_encoding;

		if (!STREQ (row->charset_from_locale, charset_from_locale))
			continue;

		roff_encoding = get_roff_encoding (row->default_device,
						   source_encoding);
		if (compatible_encodings (source_encoding, roff_encoding))
			return row->default_device;
	}

	return fallback_default_device;
}
/* Report whether device names a row of device_table. */
bool ATTRIBUTE_PURE is_roff_device (const char *device)
{
	const struct device_entry *row = device_table;

	/* Stop at the first matching row or at the terminating row. */
	while (row->roff_device && !STREQ (row->roff_device, device))
		++row;

	return row->roff_device != NULL;
}
/* Determine the input encoding groff expects for a device.
 *
 * A device listed in device_table yields its roff_encoding, or
 * source_encoding when the table leaves that field NULL. A NULL or unlisted
 * device yields the fallback roff encoding.
 */
const char *ATTRIBUTE_PURE get_roff_encoding (const char *device,
					      const char *source_encoding)
{
	const struct device_entry *row;

	if (device) {
		for (row = device_table; row->roff_device; ++row) {
			if (!STREQ (row->roff_device, device))
				continue;
			if (row->roff_encoding)
				return row->roff_encoding;
			return source_encoding;
		}
	}

	return fallback_roff_encoding;
}
/* Look up the encoding a device emits. Returns NULL when the device simply
 * passes its input encoding through, and also when the device is not in
 * device_table.
 */
const char *ATTRIBUTE_PURE get_output_encoding (const char *device)
{
	const struct device_entry *row = device_table;

	while (row->roff_device) {
		if (STREQ (row->roff_device, device))
			return row->output_encoding;
		++row;
	}

	return NULL;
}
/* Pick the LESSCHARSET value for a locale character set. A NULL or unlisted
 * charset yields the fallback less charset.
 */
const char *ATTRIBUTE_PURE get_less_charset (const char *charset_from_locale)
{
	const struct less_charset_entry *row;

	if (!charset_from_locale)
		return fallback_less_charset;

	for (row = less_charset_table; row->charset_from_locale; ++row) {
		if (STREQ (row->charset_from_locale, charset_from_locale))
			return row->less_charset;
	}

	return fallback_less_charset;
}
/* Pick the JLESSCHARSET value for a locale character set. The result is
 * NULL when the charset is NULL, is not listed, or has no jless-specific
 * name in less_charset_table.
 */
const char *ATTRIBUTE_PURE get_jless_charset (const char *charset_from_locale)
{
	const struct less_charset_entry *row;

	if (!charset_from_locale)
		return NULL;

	for (row = less_charset_table; row->charset_from_locale; ++row) {
		if (STREQ (row->charset_from_locale, charset_from_locale))
			return row->jless_charset;
	}

	return NULL;
}