706 lines
23 KiB
C
706 lines
23 KiB
C
/*
|
|
* encodings.c: locale and encoding handling for man
|
|
*
|
|
* Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
|
|
* Colin Watson.
|
|
*
|
|
* This file is part of man-db.
|
|
*
|
|
* man-db is free software; you can redistribute it and/or modify it
|
|
* under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* man-db is distributed in the hope that it will be useful, but
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with man-db; if not, write to the Free Software Foundation,
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
# include "config.h"
|
|
#endif /* HAVE_CONFIG_H */
|
|
|
|
#include <ctype.h>
|
|
#include <locale.h>
|
|
#include <stdbool.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
|
|
#include "attribute.h"
|
|
#include "gettext.h"
|
|
#include "localcharset.h"
|
|
#include "xalloc.h"
|
|
#include "xstrndup.h"
|
|
|
|
#include "manconfig.h"
|
|
|
|
#include "debug.h"
|
|
#include "encodings.h"
|
|
#include "pathsearch.h"
|
|
|
|
/* Due to historical limitations in groff (which may be removed in the
|
|
* future), there is no mechanism for a man page to specify its own
|
|
* encoding. This means that each national language directory needs to carry
|
|
* with it information about its encoding, and each groff device needs to
|
|
* have a default encoding associated with it. Out of the box, groff
|
|
* formally allows only ISO-8859-1 on input; however, patches originating
|
|
* with Debian and imported by many other GNU/Linux distributions change
|
|
* this somewhat.
|
|
*
|
|
* Eventually, groff will support proper Unicode input, and much of this
|
|
* horror can go away.
|
|
*
|
|
* Do *not* confuse source encoding with groff encoding. The encoding
|
|
* specified in this table is the encoding in which the source man pages in
|
|
* each language directory are expected to be written. The groff encoding is
|
|
* determined by the selected groff device and sometimes also by the user's
|
|
* locale.
|
|
*
|
|
* The standard output encoding is the encoding assumed for cat pages for
|
|
* each language directory. It must *not* be used to discover the actual
|
|
* output encoding displayed to the user; that is determined by the locale.
|
|
* TODO: it would be useful to be able to change the standard output
|
|
* encoding in the configuration file.
|
|
*
|
|
* This table is expected to change over time, particularly as man pages
|
|
* begin to move towards UTF-8. Feel free to patch this for your
|
|
* distribution; send me updates for languages I've missed.
|
|
*
|
|
* Explicit encodings in the directory name (e.g. de_DE.UTF-8) override this
|
|
* table.
|
|
*/
|
|
/* One row of the language-directory table: the encoding in which source man
 * pages found under a given language directory are assumed to be written.
 */
struct directory_entry {
	const char *lang_dir;		/* leading part of the directory name */
	const char *source_encoding;	/* encoding assumed for pages there */
};

/* Rows are tried from the top and the first one whose lang_dir is a prefix
 * of the requested language wins, so longer names must come before any
 * shorter name they start with. The all-NULL row terminates the table.
 * Nothing in this file writes to the table, so it is declared const.
 */
static const struct directory_entry directory_table[] = {
	{"C",		"ISO-8859-1" },	/* English */
	{"POSIX",	"ISO-8859-1" },	/* English */
	{"be",		"CP1251"     },	/* Belarusian */
	{"bg",		"CP1251"     },	/* Bulgarian */
	{"cs",		"ISO-8859-2" },	/* Czech */
	{"da",		"ISO-8859-1" },	/* Danish */
	{"de",		"ISO-8859-1" },	/* German */
	{"el",		"ISO-8859-7" },	/* Greek */
	{"en",		"ISO-8859-1" },	/* English */
	{"es",		"ISO-8859-1" },	/* Spanish */
	{"et",		"ISO-8859-1" },	/* Estonian */
	{"fi",		"ISO-8859-1" },	/* Finnish */
	{"fr",		"ISO-8859-1" },	/* French */
	{"ga",		"ISO-8859-1" },	/* Irish */
	{"gl",		"ISO-8859-1" },	/* Galician */
	{"hr",		"ISO-8859-2" },	/* Croatian */
	{"hu",		"ISO-8859-2" },	/* Hungarian */
	{"id",		"ISO-8859-1" },	/* Indonesian */
	{"is",		"ISO-8859-1" },	/* Icelandic */
	{"it",		"ISO-8859-1" },	/* Italian */
	{"ja",		"EUC-JP"     },	/* Japanese */
	{"ko",		"EUC-KR"     },	/* Korean */
	{"lt",		"ISO-8859-13"},	/* Lithuanian */
	{"lv",		"ISO-8859-13"},	/* Latvian */
	{"mk",		"ISO-8859-5" },	/* Macedonian */
	{"nb",		"ISO-8859-1" },	/* Norwegian Bokmål */
	{"nl",		"ISO-8859-1" },	/* Dutch */
	{"nn",		"ISO-8859-1" },	/* Norwegian Nynorsk */
	{"no",		"ISO-8859-1" },	/* Norwegian */
	{"pl",		"ISO-8859-2" },	/* Polish */
	{"pt",		"ISO-8859-1" },	/* Portuguese */
	{"ro",		"ISO-8859-2" },	/* Romanian */
	{"ru",		"KOI8-R"     },	/* Russian */
	{"sk",		"ISO-8859-2" },	/* Slovak */
	{"sl",		"ISO-8859-2" },	/* Slovenian */
	/* sr@latin must precede sr, due to top-down left-substring matching
	   later */
	{"sr@latin",	"ISO-8859-2" },	/* Serbian Latin */
	{"sr",		"ISO-8859-5" },	/* Serbian */
	{"sv",		"ISO-8859-1" },	/* Swedish */
	{"tr",		"ISO-8859-9" },	/* Turkish */
	{"uk",		"KOI8-U"     },	/* Ukrainian */
	{"vi",		"TCVN5712-1" },	/* Vietnamese */
	{"zh_CN",	"GBK"        },	/* Simplified Chinese */
	{"zh_SG",	"GBK"        },	/* Simplified Chinese, Singapore */
	{"zh_HK",	"BIG5HKSCS"  },	/* Traditional Chinese, Hong Kong */
	{"zh_TW",	"BIG5"       },	/* Traditional Chinese */

	{NULL,		NULL         }
};

/* Used when no row of directory_table matches. */
static const char fallback_source_encoding[] = "ISO-8859-1";
|
|
|
|
/* Unfortunately, there is no portable way to inspect iconv's internal table
|
|
* of character set aliases. We copy the most interesting ones here so that
|
|
* we can deal with them if they appear in directory names. Note that all
|
|
* names will be converted to upper case before looking them up in this
|
|
* table.
|
|
*/
|
|
/* One row of the alias table: an alternative spelling of a character set
 * name and the canonical name it stands for.
 */
struct charset_alias_entry {
	const char *alias;		/* upper-case alias to recognise */
	const char *canonical_name;	/* name to use in its place */
};

/* Terminated by an all-NULL row. The table is only ever read, so it is
 * declared const.
 */
static const struct charset_alias_entry charset_alias_table[] = {
	/* The FHS is silly and requires numeric-only aliases that iconv
	 * does not support.
	 */
	{"88591",	"ISO-8859-1"    },
	{"88592",	"ISO-8859-2"    },
	{"88593",	"ISO-8859-3"    },
	{"88594",	"ISO-8859-4"    },
	{"88595",	"ISO-8859-5"    },
	{"88596",	"ISO-8859-6"    },
	{"88597",	"ISO-8859-7"    },
	{"88598",	"ISO-8859-8"    },
	{"88599",	"ISO-8859-9"    },
	{"885910",	"ISO-8859-10"   },
	{"885911",	"ISO-8859-11"   },
	{"885913",	"ISO-8859-13"   },
	{"885914",	"ISO-8859-14"   },
	{"885915",	"ISO-8859-15"   },
	{"885916",	"ISO-8859-16"   },

	{"ASCII",	"ANSI_X3.4-1968"},
	{"BIG-5",	"BIG5"          },
	{"BIG5-HKSCS",	"BIG5HKSCS"     },
	{"EUCCN",	"EUC-CN"        },
	{"EUCJP",	"EUC-JP"        },
	{"EUCKR",	"EUC-KR"        },
	{"EUCTW",	"EUC-TW"        },
	{"GB2312",	"EUC-CN"        },
	{"ISO8859-1",	"ISO-8859-1"    },
	{"ISO8859-2",	"ISO-8859-2"    },
	{"ISO8859-3",	"ISO-8859-3"    },
	{"ISO8859-4",	"ISO-8859-4"    },
	{"ISO8859-5",	"ISO-8859-5"    },
	{"ISO8859-6",	"ISO-8859-6"    },
	{"ISO8859-7",	"ISO-8859-7"    },
	{"ISO8859-8",	"ISO-8859-8"    },
	{"ISO8859-9",	"ISO-8859-9"    },
	{"ISO8859-10",	"ISO-8859-10"   },
	{"ISO8859-11",	"ISO-8859-11"   },
	{"ISO8859-13",	"ISO-8859-13"   },
	{"ISO8859-14",	"ISO-8859-14"   },
	{"ISO8859-15",	"ISO-8859-15"   },
	{"ISO8859-16",	"ISO-8859-16"   },
	{"KOI8R",	"KOI8-R"        },
	{"KOI8U",	"KOI8-U"        },
	{"UJIS",	"EUC-JP"        },
	{"US-ASCII",	"ANSI_X3.4-1968"},
	{"UTF8",	"UTF-8"         },

	{NULL,		NULL            }
};
|
|
|
|
/* The default groff terminal output device to be used is determined based
|
|
* on locale_charset (), which returns the character set used by the current
|
|
* locale.
|
|
*/
|
|
/* One row of the locale-charset table: a character set name as reported for
 * the locale and the groff terminal device used for it by default.
 */
struct charset_entry {
	const char *charset_from_locale;	/* canonical charset name */
	const char *default_device;		/* groff device to select */
};

/* Terminated by an all-NULL row. The table is only ever read, so it is
 * declared const.
 */
static const struct charset_entry charset_table[] = {
	{"ANSI_X3.4-1968",	"ascii" },
#ifndef HEIRLOOM_NROFF
	{"ISO-8859-1",		"latin1"},
#endif /* HEIRLOOM_NROFF */
	{"UTF-8",		"utf8"  },

	{NULL,			NULL    }
};

/* Device used when the locale's charset is unknown or has no usable row.
 * Declared as a const array, like the other fallback_* constants in this
 * file, rather than as a reassignable pointer.
 */
static const char fallback_default_device[] = "ascii";
|
|
|
|
/* The encoding used for the text passed to groff is a function of the
|
|
* selected groff device. Traditional devices expect ISO-8859-1 on input
|
|
* (yes, even the utf8 device); devices added in the Debian multibyte patch
|
|
* expect other encodings. The ascii8 device passes top-bit-set characters
|
|
* straight through so is (probably ...) encoding-agnostic. If this encoding
|
|
* does not match the source encoding, an iconv pipe is used (if available)
|
|
* to perform recoding.
|
|
*/
|
|
/* One row of the device table. A NULL roff_encoding means the device has no
 * fixed input encoding of its own (get_roff_encoding () then hands back the
 * source encoding); a NULL output_encoding is returned as-is by
 * get_output_encoding ().
 */
struct device_entry {
	const char *roff_device;	/* device name as given to groff -T */
	const char *roff_encoding;	/* encoding groff expects on input */
	const char *output_encoding;	/* encoding groff emits */
};

/* Terminated by an all-NULL row. The table is only ever read, so it is
 * declared const.
 */
static const struct device_entry device_table[] = {
	/* nroff devices */
	{"ascii",	"ANSI_X3.4-1968",	"ANSI_X3.4-1968"},
	{"latin1",	"ISO-8859-1",		"ISO-8859-1"    },
	{"utf8",	"ISO-8859-1",		"UTF-8"         },

#ifdef HEIRLOOM_NROFF
	/* Not strictly accurate, but we only use this in UTF-8 locales. */
	{"locale",	"UTF-8",		"UTF-8"         },
#endif /* HEIRLOOM_NROFF */

	/* troff devices */
	{"X75",		NULL,			NULL            },
	{"X75-12",	NULL,			NULL            },
	{"X100",	NULL,			NULL            },
	{"X100-12",	NULL,			NULL            },
	{"dvi",		NULL,			NULL            },
	{"html",	NULL,			NULL            },
	{"lbp",		NULL,			NULL            },
	{"lj4",		NULL,			NULL            },
	{"ps",		NULL,			NULL            },

	{NULL,		NULL,			NULL            }
};

/* Input encoding assumed for devices that are not listed in device_table. */
static const char fallback_roff_encoding[] = "ISO-8859-1";
|
|
|
|
/* Setting less_charset to iso8859 tells the less pager that characters
|
|
* between 0xA0 and 0xFF are displayable, not that its input is encoded in
|
|
* ISO-8859-*. TODO: Perhaps using LESSCHARDEF would be better.
|
|
*
|
|
* Character set names compatible only with jless go in jless_charset.
|
|
*/
|
|
/* One row of the pager table: for a locale character set, the value to give
 * less in LESSCHARSET and, where one exists, the value for JLESSCHARSET
 * (NULL when there is none).
 */
struct less_charset_entry {
	const char *charset_from_locale;	/* canonical charset name */
	const char *less_charset;		/* LESSCHARSET value */
	const char *jless_charset;		/* JLESSCHARSET value or NULL */
};

/* Terminated by an all-NULL row. The table is only ever read, so it is
 * declared const.
 */
static const struct less_charset_entry less_charset_table[] = {
	{"ANSI_X3.4-1968",	"ascii",	NULL           },
	{"CP1251",		"windows",	NULL           },
	{"EUC-JP",		"iso8859",	"japanese-ujis"},
	{"ISO-8859-1",		"iso8859",	NULL           },
	{"KOI8-R",		"koi8-r",	NULL           },
	/* close enough? */
	{"KOI8-U",		"koi8-r",	NULL           },
	{"UTF-8",		"utf-8",	NULL           },

	{NULL,			NULL,		NULL           }
};

/* LESSCHARSET value used for character sets missing from the table. */
static const char fallback_less_charset[] = "iso8859";
|
|
|
|
/* Cached result of the preconv lookup: NULL until the first lookup has been
 * made, "" once a lookup has found nothing, otherwise the helper's name.
 */
static const char *groff_preconv = NULL;

/* Report which groff "preconv" helper is available, if any.
 *
 * Returns "gpreconv" or "preconv" (tried in that order), or NULL when
 * neither is found. The search is done on the first call only; later calls
 * answer from the cached value.
 */
const char *get_groff_preconv (void)
{
	if (!groff_preconv) {
		if (pathsearch_executable ("gpreconv"))
			groff_preconv = "gpreconv";
		else if (pathsearch_executable ("preconv"))
			groff_preconv = "preconv";
		else
			/* Remember the failure so we do not search again. */
			groff_preconv = "";
	}

	return *groff_preconv ? groff_preconv : NULL;
}
|
|
|
|
/* Return the assumed encoding of the source man page, based on the
|
|
* directory in which it was found. The caller should attempt to recode from
|
|
* this to whatever encoding is expected by groff.
|
|
*
|
|
* The caller should free the returned string when it is finished with it.
|
|
*/
|
|
char *ATTRIBUTE_MALLOC get_page_encoding (const char *lang)
{
	const struct directory_entry *row;
	const char *codeset;
	char *raw_name;
	char *result;

	if (!lang || !*lang) {
		/* Nothing to go on, so guess based on the locale. */
		lang = setlocale (LC_MESSAGES, NULL);
		if (!lang)
			return xstrdup (fallback_source_encoding);
	}

	codeset = strchr (lang, '.');
	if (!codeset) {
		/* No explicit encoding in the name: take the first table
		 * row whose lang_dir is a prefix of lang.
		 */
		for (row = directory_table; row->lang_dir; ++row) {
			size_t prefix_len = strlen (row->lang_dir);

			if (STRNEQ (row->lang_dir, lang, prefix_len))
				return xstrdup (row->source_encoding);
		}

		return xstrdup (fallback_source_encoding);
	}

	/* The FHS has the worst specification of what's supposed to go
	 * after the dot here that I've ever seen. To quote from version
	 * 2.1:
	 *
	 *   "It is recommended that this be a numeric representation if
	 *   possible (ISO standards, especially), not include additional
	 *   punctuation symbols, and that any letters be in lowercase."
	 *
	 * Any sane standard would use directory names like
	 * de_DE.ISO-8859-1; the examples in the FHS recommend de_DE.88591
	 * instead. Considering that there is no other conceivable use for
	 * encodings in directory names other than to pass them to iconv or
	 * similar, this is quite startlingly useless.
	 *
	 * While we now support this thanks to get_canonical_charset_name,
	 * the FHS specification is obviously wrong and I plan to petition
	 * to have it changed. I recommend ignoring this part of the FHS.
	 */
	++codeset;	/* step over the dot */
	raw_name = xstrndup (codeset, strcspn (codeset, ",@"));

	/* get_canonical_charset_name may hand raw_name straight back, so
	 * take the copy before raw_name is released.
	 */
	result = xstrdup (get_canonical_charset_name (raw_name));
	free (raw_name);

	return result;
}
|
|
|
|
/* Return the canonical encoding for source man pages in the specified
|
|
* language. This ignores any encoding specification in the language
|
|
* directory name. The source encoding should be used as a basis for
|
|
* determining the correct roff device to use: that is, the caller should
|
|
* behave as if it is recoding from the page encoding to the source encoding
|
|
* first, although in practice it should recode directly from the page
|
|
* encoding to the roff encoding.
|
|
*
|
|
* You should normally only call this function if the page encoding is
* UTF-8 (in which case older versions of groff that lack preconv need to
* have the page recoded to some legacy encoding). If the page is in a
|
|
* legacy encoding, then attempting to recode from that to some other legacy
|
|
* encoding will probably do more harm than good.
|
|
*
|
|
* Here are a few concrete examples of why these distinctions are important:
|
|
*
|
|
* /usr/share/man/en_GB.UTF-8, locale C
|
|
* page encoding = UTF-8
|
|
* source encoding = ISO-8859-1
|
|
* roff encoding = ISO-8859-1
|
|
* output encoding = UTF-8
|
|
* UTF-8 -> iconv -> ISO-8859-1 -> groff -Tascii -> ANSI_X3.4-1968
|
|
*
|
|
* /usr/share/man/pl_PL.UTF-8, locale pl_PL.UTF-8
|
|
* page encoding = UTF-8
|
|
* source encoding = ISO-8859-2
|
|
* roff encoding = ISO-8859-2
|
|
* output encoding = ISO-8859-2
|
|
* UTF-8 -> iconv -> ISO-8859-2 -> groff -Tascii8
|
|
* -> ISO-8859-2 -> iconv -> UTF-8
|
|
*
|
|
* /usr/share/man/ja_JP.EUC-JP, locale ja_JP.UTF-8
|
|
* page encoding = EUC-JP
|
|
* source encoding = EUC-JP
|
|
* roff encoding = UTF-8
|
|
* output encoding = UTF-8
|
|
* EUC-JP -> iconv -> UTF-8 -> groff -Tutf8 -> UTF-8
|
|
*
|
|
* /usr/share/man/en_GB.ISO-8859-15, locale en_GB.UTF-8
|
|
* page encoding = ISO-8859-15
|
|
* source encoding = ISO-8859-15
|
|
* roff encoding = ISO-8859-15
|
|
* output encoding = ISO-8859-15
|
|
* ISO-8859-15 -> groff -Tascii8 -> ISO-8859-15 -> iconv -> UTF-8
|
|
*/
|
|
const char *get_source_encoding (const char *lang)
|
|
{
|
|
const struct directory_entry *entry;
|
|
|
|
if (!lang || !*lang) {
|
|
/* Guess based on the locale. */
|
|
lang = setlocale (LC_MESSAGES, NULL);
|
|
if (!lang)
|
|
return fallback_source_encoding;
|
|
}
|
|
|
|
for (entry = directory_table; entry->lang_dir; ++entry)
|
|
if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
|
|
return entry->source_encoding;
|
|
|
|
return fallback_source_encoding;
|
|
}
|
|
|
|
/* Upper-case C if it is an ASCII lower-case letter and return every other
 * byte unchanged. Unlike toupper (), the result never depends on LC_CTYPE.
 */
static char charset_ascii_toupper (char c)
{
	return (c >= 'a' && c <= 'z') ? (char) (c - 'a' + 'A') : c;
}

/* Does CHARSET, once its ASCII letters are upper-cased, spell exactly
 * ALIAS? ALIAS is expected to be upper case already.
 */
static bool charset_is_alias (const char *charset, const char *alias)
{
	for (; *alias; ++alias, ++charset)
		/* Also false when charset ends first: NUL never equals a
		 * character of alias.
		 */
		if (charset_ascii_toupper (*charset) != *alias)
			return false;

	return !*charset;
}

/* Map a character set name to its canonical spelling.
 *
 * CHARSET is compared, ignoring the case of ASCII letters, against the
 * aliases in charset_alias_table. On a match the table's canonical name is
 * returned; otherwise CHARSET itself is returned unchanged. In neither case
 * does the caller own the result.
 *
 * The comparison is done with ASCII-only case folding rather than
 * toupper (): the aliases are plain ASCII, and toupper () follows LC_CTYPE,
 * so under some locales (Turkish being the usual example) a lower-case
 * name such as "iso8859-1" would fail to match its alias.
 */
const char *ATTRIBUTE_NONNULL ((1)) ATTRIBUTE_RETURNS_NONNULL
get_canonical_charset_name (const char *charset)
{
	const struct charset_alias_entry *entry;

	for (entry = charset_alias_table; entry->alias; ++entry)
		if (charset_is_alias (charset, entry->alias))
			return entry->canonical_name;

	return charset;
}
|
|
|
|
/* Return the current locale's character set. */
|
|
const char *ATTRIBUTE_RETURNS_NONNULL get_locale_charset (void)
{
	const char *ctype_on_entry;
	const char *codeset;
	char *ctype_backup = NULL;

	/* LC_CTYPE has to be switched to the environment's setting for a
	 * moment in order to read its codeset. Keep a private copy of the
	 * current value first, since the string setlocale returns may be
	 * overwritten by the calls below.
	 */
	ctype_on_entry = setlocale (LC_CTYPE, NULL);
	if (ctype_on_entry)
		ctype_backup = xstrdup (ctype_on_entry);

	setlocale (LC_CTYPE, "");
	codeset = locale_charset ();

	/* Put LC_CTYPE back the way it was on entry. */
	setlocale (LC_CTYPE, ctype_backup);
	free (ctype_backup);

	/* No usable answer is treated as plain ASCII. */
	if (codeset && *codeset)
		return get_canonical_charset_name (codeset);
	return get_canonical_charset_name ("ANSI_X3.4-1968");
}
|
|
|
|
/* Find a locale with this character set. This is a non-portable operation,
|
|
* but required to make col(1) work correctly with -E. If no locale can be
|
|
* found, or if none needs to be set, return NULL.
|
|
*
|
|
* The caller should free the returned string when it is finished with it.
|
|
*/
|
|
char *find_charset_locale (const char *charset)
{
	static const char supported_path[] = "/usr/share/i18n/SUPPORTED";
	const char *canonical_charset = get_canonical_charset_name (charset);
	const char *ctype_on_entry;
	char *saved_locale = NULL;
	FILE *supported;
	char *line = NULL;
	size_t n = 0;
	char *locale = NULL;

	/* Nothing needs to be set if the current locale already uses this
	 * character set. get_locale_charset () returns a canonical name, so
	 * compare the canonical form of the request: an alias of the
	 * current charset (e.g. "UTF8" in a UTF-8 locale) must match too.
	 */
	if (STREQ (canonical_charset, get_locale_charset ()))
		return NULL;

	/* Candidates are probed by actually installing them as LC_CTYPE,
	 * so remember the current value in order to restore it afterwards.
	 */
	ctype_on_entry = setlocale (LC_CTYPE, NULL);
	if (ctype_on_entry)
		saved_locale = xstrdup (ctype_on_entry);

	/* Each line of interest has the form "<locale> <charset>". The
	 * file is optional; if it cannot be opened we go straight to the
	 * UTF-8 fallbacks below. getline () reuses and grows the one
	 * buffer, which is released once after the loop.
	 */
	supported = fopen (supported_path, "r");
	while (supported && !locale &&
	       getline (&line, &n, supported) >= 0) {
		char *space = strchr (line, ' ');
		char *encoding;

		if (!space)
			continue;

		/* Split the line in place: line becomes the locale name
		 * and encoding the rest, minus the trailing newline.
		 */
		*space = '\0';
		encoding = space + 1;
		encoding[strcspn (encoding, "\n")] = '\0';

		/* Right charset, and is this locale actually installed? */
		if (STREQ (canonical_charset,
			   get_canonical_charset_name (encoding)) &&
		    setlocale (LC_CTYPE, line))
			locale = xstrdup (line);
	}

	/* For UTF-8, try a couple of well-known locale names even if the
	 * list was unavailable or had nothing installed. strncmp stops at
	 * the terminator, so no separate length check is needed.
	 */
	if (!locale && STRNEQ (canonical_charset, "UTF-8", 5)) {
		static const char *const utf8_fallbacks[] = {
			"C.UTF-8",
			"en_US.UTF-8"
		};
		size_t i;

		for (i = 0; i < 2 && !locale; ++i)
			if (setlocale (LC_CTYPE, utf8_fallbacks[i]))
				locale = xstrdup (utf8_fallbacks[i]);
	}

	free (line);
	/* Undo whatever the probing above did to LC_CTYPE. */
	if (saved_locale)
		setlocale (LC_CTYPE, saved_locale);
	free (saved_locale);
	if (supported)
		fclose (supported);

	return locale;
}
|
|
|
|
/* Can we take this input encoding and produce this output encoding, perhaps
|
|
* with the help of some iconv pipes? */
|
|
static bool ATTRIBUTE_PURE compatible_encodings (const char *input,
						 const char *output)
{
	static const char ascii[] = "ANSI_X3.4-1968";

	return
		/* Identical encodings need no recoding at all. */
		STREQ (input, output) ||
		/* If the input is ASCII, recoding should be easy. */
		STREQ (input, ascii) ||
		/* If the input is UTF-8, it's either a simple recoding of
		 * whatever we want or else it probably won't work at all
		 * no matter what we do. We might as well try it for now.
		 */
		STREQ (input, "UTF-8") ||
		/* If the output is ASCII, this is probably because the
		 * caller explicitly asked for it, so we have little choice
		 * but to try.
		 */
		STREQ (output, ascii);
}
|
|
|
|
/* Return the default groff device for the given character set. This may be
|
|
* overridden by the user. The page's source encoding is needed to ensure
|
|
* that the device is compatible: consider ru_RU.UTF-8, which needs ascii8
|
|
* and a trailing iconv pipe to recode to UTF-8.
|
|
*
|
|
* All this encoding compatibility stuff feels like a slightly nasty hack,
|
|
* but I haven't yet come up with a cleaner way to do it.
|
|
*/
|
|
const char *get_default_device (const char *charset_from_locale,
|
|
const char *source_encoding)
|
|
{
|
|
const struct charset_entry *entry;
|
|
|
|
if (get_groff_preconv ()) {
|
|
/* ASCII is a special case, and the only way we can get
|
|
* things like bullet marks to come out right is by using
|
|
* the ascii device. People using such a basic locale
|
|
* probably don't want anything fancy anyway.
|
|
*/
|
|
if (charset_from_locale &&
|
|
STREQ (charset_from_locale, "ANSI_X3.4-1968"))
|
|
return "ascii";
|
|
else
|
|
return "utf8";
|
|
}
|
|
|
|
if (!charset_from_locale)
|
|
return fallback_default_device;
|
|
|
|
for (entry = charset_table; entry->charset_from_locale; ++entry) {
|
|
if (STREQ (entry->charset_from_locale, charset_from_locale)) {
|
|
const char *roff_encoding = get_roff_encoding (
|
|
entry->default_device, source_encoding);
|
|
if (compatible_encodings (source_encoding,
|
|
roff_encoding))
|
|
return entry->default_device;
|
|
}
|
|
}
|
|
|
|
return fallback_default_device;
|
|
}
|
|
|
|
/* Is this a known *roff device name? */
|
|
bool ATTRIBUTE_PURE is_roff_device (const char *device)
{
	const struct device_entry *row = device_table;

	/* Walk the table up to its NULL terminator looking for the name. */
	while (row->roff_device) {
		if (STREQ (row->roff_device, device))
			return true;
		++row;
	}

	return false;
}
|
|
|
|
/* Find the input encoding expected by groff. */
|
|
const char *ATTRIBUTE_PURE get_roff_encoding (const char *device,
					      const char *source_encoding)
{
	const struct device_entry *row;

	if (device) {
		for (row = device_table; row->roff_device; ++row) {
			if (!STREQ (row->roff_device, device))
				continue;

			/* A listed device with no fixed input encoding
			 * takes the page in its source encoding.
			 */
			return row->roff_encoding ? row->roff_encoding
						  : source_encoding;
		}
	}

	/* No device given, or one we do not know about. */
	return fallback_roff_encoding;
}
|
|
|
|
/* Find the output encoding that this device will produce, or NULL if it
|
|
* will simply pass through the input encoding.
|
|
*/
|
|
const char *ATTRIBUTE_PURE get_output_encoding (const char *device)
{
	const struct device_entry *row = device_table;

	while (row->roff_device) {
		/* May itself be NULL for devices with no fixed output. */
		if (STREQ (row->roff_device, device))
			return row->output_encoding;
		++row;
	}

	/* Unknown device. */
	return NULL;
}
|
|
|
|
/* Return the value of LESSCHARSET appropriate for this locale. */
|
|
const char *ATTRIBUTE_PURE get_less_charset (const char *charset_from_locale)
{
	const struct less_charset_entry *row;

	/* No charset to look up: use the default straight away. */
	if (!charset_from_locale)
		return fallback_less_charset;

	for (row = less_charset_table; row->charset_from_locale; ++row) {
		if (STREQ (row->charset_from_locale, charset_from_locale))
			return row->less_charset;
	}

	return fallback_less_charset;
}
|
|
|
|
/* Return the value of JLESSCHARSET appropriate for this locale. May return
|
|
* NULL.
|
|
*/
|
|
const char *ATTRIBUTE_PURE get_jless_charset (const char *charset_from_locale)
{
	const struct less_charset_entry *row;

	/* No charset to look up, so there is nothing to suggest. */
	if (!charset_from_locale)
		return NULL;

	for (row = less_charset_table; row->charset_from_locale; ++row) {
		/* The matching row's value may itself be NULL. */
		if (STREQ (row->charset_from_locale, charset_from_locale))
			return row->jless_charset;
	}

	return NULL;
}
|