author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-15 19:37:10 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-15 19:37:10 +0000
commit    c9addba5cc770d2d231b34f6739f32c6be8690f1 (patch)
tree      c643da154a95a1d163137135050bb47858a1654e /lib/encodings.c
parent    Initial commit. (diff)
Adding upstream version 2.12.0.
upstream/2.12.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/encodings.c')
-rw-r--r--  lib/encodings.c  780
1 files changed, 780 insertions, 0 deletions
diff --git a/lib/encodings.c b/lib/encodings.c
new file mode 100644
index 0000000..68c06c6
--- /dev/null
+++ b/lib/encodings.c
@@ -0,0 +1,780 @@
+/*
+ * encodings.c: locale and encoding handling for man
+ *
+ * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+ * Colin Watson.
+ *
+ * This file is part of man-db.
+ *
+ * man-db is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * man-db is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with man-db; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <locale.h>
+#include <ctype.h>
+
+#include "attribute.h"
+#include "gettext.h"
+#include "localcharset.h"
+#include "xalloc.h"
+#include "xstrndup.h"
+
+#include "manconfig.h"
+
+#include "debug.h"
+#include "encodings.h"
+#include "pathsearch.h"
+
+
+/* Due to historical limitations in groff (which may be removed in the
+ * future), there is no mechanism for a man page to specify its own
+ * encoding. This means that each national language directory needs to carry
+ * with it information about its encoding, and each groff device needs to
+ * have a default encoding associated with it. Out of the box, groff
+ * formally allows only ISO-8859-1 on input; however, patches originating
+ * with Debian and imported by many other GNU/Linux distributions change
+ * this somewhat.
+ *
+ * Eventually, groff will support proper Unicode input, and much of this
+ * horror can go away.
+ *
+ * Do *not* confuse source encoding with groff encoding. The encoding
+ * specified in this table is the encoding in which the source man pages in
+ * each language directory are expected to be written. The groff encoding is
+ * determined by the selected groff device and sometimes also by the user's
+ * locale.
+ *
+ * The standard output encoding is the encoding assumed for cat pages for
+ * each language directory. It must *not* be used to discover the actual
+ * output encoding displayed to the user; that is determined by the locale.
+ * TODO: it would be useful to be able to change the standard output
+ * encoding in the configuration file.
+ *
+ * This table is expected to change over time, particularly as man pages
+ * begin to move towards UTF-8. Feel free to patch this for your
+ * distribution; send me updates for languages I've missed.
+ *
+ * Explicit encodings in the directory name (e.g. de_DE.UTF-8) override this
+ * table.
+ */
+struct directory_entry {
+ const char *lang_dir;
+ const char *source_encoding;
+};
+
+static struct directory_entry directory_table[] = {
+ { "C", "ISO-8859-1" }, /* English */
+ { "POSIX", "ISO-8859-1" }, /* English */
+ { "da", "ISO-8859-1" }, /* Danish */
+ { "de", "ISO-8859-1" }, /* German */
+ { "en", "ISO-8859-1" }, /* English */
+ { "es", "ISO-8859-1" }, /* Spanish */
+ { "et", "ISO-8859-1" }, /* Estonian */
+ { "fi", "ISO-8859-1" }, /* Finnish */
+ { "fr", "ISO-8859-1" }, /* French */
+ { "ga", "ISO-8859-1" }, /* Irish */
+ { "gl", "ISO-8859-1" }, /* Galician */
+ { "id", "ISO-8859-1" }, /* Indonesian */
+ { "is", "ISO-8859-1" }, /* Icelandic */
+ { "it", "ISO-8859-1" }, /* Italian */
+ { "nb", "ISO-8859-1" }, /* Norwegian Bokmål */
+ { "nl", "ISO-8859-1" }, /* Dutch */
+ { "nn", "ISO-8859-1" }, /* Norwegian Nynorsk */
+ { "no", "ISO-8859-1" }, /* Norwegian */
+ { "pt", "ISO-8859-1" }, /* Portuguese */
+ { "sv", "ISO-8859-1" }, /* Swedish */
+
+#ifdef MULTIBYTE_GROFF
+ /* These languages require a patched version of groff with the
+ * ascii8 and nippon devices.
+ */
+ { "be", "CP1251" }, /* Belarusian */
+ { "bg", "CP1251" }, /* Bulgarian */
+ { "cs", "ISO-8859-2" }, /* Czech */
+ { "el", "ISO-8859-7" }, /* Greek */
+ { "hr", "ISO-8859-2" }, /* Croatian */
+ { "hu", "ISO-8859-2" }, /* Hungarian */
+ { "ja", "EUC-JP" }, /* Japanese */
+ { "ko", "EUC-KR" }, /* Korean */
+ { "lt", "ISO-8859-13" }, /* Lithuanian */
+ { "lv", "ISO-8859-13" }, /* Latvian */
+ { "mk", "ISO-8859-5" }, /* Macedonian */
+ { "pl", "ISO-8859-2" }, /* Polish */
+ { "ro", "ISO-8859-2" }, /* Romanian */
+ { "ru", "KOI8-R" }, /* Russian */
+ { "sk", "ISO-8859-2" }, /* Slovak */
+ { "sl", "ISO-8859-2" }, /* Slovenian */
+ /* sr@latin must precede sr, due to top-down left-substring matching later */
+ { "sr@latin", "ISO-8859-2" }, /* Serbian Latin */
+ { "sr", "ISO-8859-5" }, /* Serbian */
+ { "tr", "ISO-8859-9" }, /* Turkish */
+ { "uk", "KOI8-U" }, /* Ukrainian */
+ { "vi", "TCVN5712-1" }, /* Vietnamese */
+ { "zh_CN", "GBK" }, /* Simplified Chinese */
+ { "zh_SG", "GBK" }, /* Simplified Chinese, Singapore */
+ { "zh_HK", "BIG5HKSCS" }, /* Traditional Chinese, Hong Kong */
+ { "zh_TW", "BIG5" }, /* Traditional Chinese */
+#endif /* MULTIBYTE_GROFF */
+
+ { NULL, NULL }
+};
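+
+/* The table is consulted with a top-down left-substring match (see
+ * get_page_encoding and get_source_encoding below), so the first entry
+ * whose lang_dir is a prefix of the directory name wins.  For example,
+ * with MULTIBYTE_GROFF:
+ *
+ *	get_source_encoding ("sr@latin") returns "ISO-8859-2" (matches "sr@latin")
+ *	get_source_encoding ("sr_RS")    returns "ISO-8859-5" (matches "sr")
+ *	get_source_encoding ("xx")       returns "ISO-8859-1" (the fallback)
+ */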
+
+static const char fallback_source_encoding[] = "ISO-8859-1";
+
+/* Unfortunately, there is no portable way to inspect iconv's internal table
+ * of character set aliases. We copy the most interesting ones here so that
+ * we can deal with them if they appear in directory names. Note that all
+ * names will be converted to upper case before looking them up in this
+ * table.
+ */
+struct charset_alias_entry {
+ const char *alias;
+ const char *canonical_name;
+};
+
+static struct charset_alias_entry charset_alias_table[] = {
+ /* The FHS is silly and requires numeric-only aliases that iconv
+ * does not support.
+ */
+ { "88591", "ISO-8859-1" },
+ { "88592", "ISO-8859-2" },
+ { "88593", "ISO-8859-3" },
+ { "88594", "ISO-8859-4" },
+ { "88595", "ISO-8859-5" },
+ { "88596", "ISO-8859-6" },
+ { "88597", "ISO-8859-7" },
+ { "88598", "ISO-8859-8" },
+ { "88599", "ISO-8859-9" },
+ { "885910", "ISO-8859-10" },
+ { "885911", "ISO-8859-11" },
+ { "885913", "ISO-8859-13" },
+ { "885914", "ISO-8859-14" },
+ { "885915", "ISO-8859-15" },
+ { "885916", "ISO-8859-16" },
+
+ { "ASCII", "ANSI_X3.4-1968" },
+ { "BIG-5", "BIG5" },
+ { "BIG5-HKSCS", "BIG5HKSCS" },
+ { "EUCCN", "EUC-CN" },
+ { "EUCJP", "EUC-JP" },
+ { "EUCKR", "EUC-KR" },
+ { "EUCTW", "EUC-TW" },
+ { "GB2312", "EUC-CN" },
+ { "ISO8859-1", "ISO-8859-1" },
+ { "ISO8859-2", "ISO-8859-2" },
+ { "ISO8859-3", "ISO-8859-3" },
+ { "ISO8859-4", "ISO-8859-4" },
+ { "ISO8859-5", "ISO-8859-5" },
+ { "ISO8859-6", "ISO-8859-6" },
+ { "ISO8859-7", "ISO-8859-7" },
+ { "ISO8859-8", "ISO-8859-8" },
+ { "ISO8859-9", "ISO-8859-9" },
+ { "ISO8859-10", "ISO-8859-10" },
+ { "ISO8859-11", "ISO-8859-11" },
+ { "ISO8859-13", "ISO-8859-13" },
+ { "ISO8859-14", "ISO-8859-14" },
+ { "ISO8859-15", "ISO-8859-15" },
+ { "ISO8859-16", "ISO-8859-16" },
+ { "KOI8R", "KOI8-R" },
+ { "KOI8U", "KOI8-U" },
+ { "UJIS", "EUC-JP" },
+ { "US-ASCII", "ANSI_X3.4-1968" },
+ { "UTF8", "UTF-8" },
+
+ { NULL, NULL }
+};
+
+/* The default groff terminal output device to be used is determined based
+ * on locale_charset (), which returns the character set used by the current
+ * locale.
+ */
+struct charset_entry {
+ const char *charset_from_locale;
+ const char *default_device;
+};
+
+static struct charset_entry charset_table[] = {
+ { "ANSI_X3.4-1968", "ascii" },
+#ifndef HEIRLOOM_NROFF
+ { "ISO-8859-1", "latin1" },
+#endif /* HEIRLOOM_NROFF */
+ { "UTF-8", "utf8" },
+
+#ifndef HEIRLOOM_NROFF
+# ifdef MULTIBYTE_GROFF
+ { "BIG5", "nippon" },
+ { "BIG5HKSCS", "nippon" },
+ { "EUC-CN", "nippon" },
+ { "EUC-JP", "nippon" },
+ { "EUC-TW", "nippon" },
+ { "GBK", "nippon" },
+# else /* !MULTIBYTE_GROFF */
+ /* If we have a smarter version of groff, this is better dealt with
+ * using either ascii8 (Debian multibyte patch) or preconv (as of
+ * groff 1.20). This is a not-quite-right stopgap in case we have
+ * neither.
+ */
+ { "ISO-8859-15", "latin1" },
+# endif /* MULTIBYTE_GROFF */
+#endif /* HEIRLOOM_NROFF */
+
+ { NULL, NULL }
+};
+
+static const char *fallback_default_device =
+#ifdef MULTIBYTE_GROFF
+ "ascii8"
+#else /* !MULTIBYTE_GROFF */
+ "ascii"
+#endif /* MULTIBYTE_GROFF */
+ ;
+
+/* The encoding used for the text passed to groff is a function of the
+ * selected groff device. Traditional devices expect ISO-8859-1 on input
+ * (yes, even the utf8 device); devices added in the Debian multibyte patch
+ * expect other encodings. The ascii8 device passes top-bit-set characters
+ * straight through so is (probably ...) encoding-agnostic. If this encoding
+ * does not match the source encoding, an iconv pipe is used (if available)
+ * to perform recoding.
+ */
+struct device_entry {
+ const char *roff_device;
+ const char *roff_encoding;
+ const char *output_encoding;
+};
+
+static struct device_entry device_table[] = {
+ /* nroff devices */
+ { "ascii", "ANSI_X3.4-1968", "ANSI_X3.4-1968" },
+ { "latin1", "ISO-8859-1", "ISO-8859-1" },
+ { "utf8", "ISO-8859-1", "UTF-8" },
+
+#ifdef MULTIBYTE_GROFF
+ { "ascii8", NULL, NULL },
+ { "nippon", NULL, NULL },
+#endif /* MULTIBYTE_GROFF */
+
+#ifdef HEIRLOOM_NROFF
+ /* Not strictly accurate, but we only use this in UTF-8 locales. */
+ { "locale", "UTF-8", "UTF-8" },
+#endif /* HEIRLOOM_NROFF */
+
+ /* troff devices */
+ { "X75", NULL, NULL },
+ { "X75-12", NULL, NULL },
+ { "X100", NULL, NULL },
+ { "X100-12", NULL, NULL },
+ { "dvi", NULL, NULL },
+ { "html", NULL, NULL },
+ { "lbp", NULL, NULL },
+ { "lj4", NULL, NULL },
+ { "ps", NULL, NULL },
+
+ { NULL, NULL, NULL }
+};
+
+static const char fallback_roff_encoding[] = "ISO-8859-1";
+
+/* Setting less_charset to iso8859 tells the less pager that characters
+ * between 0xA0 and 0xFF are displayable, not that its input is encoded in
+ * ISO-8859-*. TODO: Perhaps using LESSCHARDEF would be better.
+ *
+ * Character set names compatible only with jless go in jless_charset.
+ */
+struct less_charset_entry {
+ const char *charset_from_locale;
+ const char *less_charset;
+ const char *jless_charset;
+};
+
+static struct less_charset_entry less_charset_table[] = {
+ { "ANSI_X3.4-1968", "ascii", NULL },
+ { "ISO-8859-1", "iso8859", NULL },
+ { "UTF-8", "utf-8", NULL },
+
+#ifdef MULTIBYTE_GROFF
+ { "CP1251", "windows", NULL },
+ { "EUC-JP", "iso8859", "japanese-ujis" },
+ { "KOI8-R", "koi8-r", NULL },
+ /* close enough? */
+ { "KOI8-U", "koi8-r", NULL },
+#endif /* MULTIBYTE_GROFF */
+
+ { NULL, NULL, NULL }
+};
+
+static const char fallback_less_charset[] = "iso8859";
+
+const char *groff_preconv = NULL;
+
+/* Is the groff "preconv" helper available? If so, return its name.
+ * Otherwise, return NULL.
+ */
+const char *get_groff_preconv (void)
+{
+ if (groff_preconv) {
+ if (*groff_preconv)
+ return groff_preconv;
+ else
+ return NULL;
+ }
+
+ if (pathsearch_executable ("gpreconv"))
+ groff_preconv = "gpreconv";
+ else if (pathsearch_executable ("preconv"))
+ groff_preconv = "preconv";
+ else
+ groff_preconv = "";
+
+ if (*groff_preconv)
+ return groff_preconv;
+ else
+ return NULL;
+}
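+
+/* A minimal usage sketch, assuming a hypothetical argv-style spawn ()
+ * helper and a page_encoding variable (man itself assembles a command
+ * pipeline instead):
+ *
+ *	const char *preconv = get_groff_preconv ();
+ *	if (preconv)
+ *		spawn (preconv, "-e", page_encoding, (char *) NULL);
+ */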
+
+/* Return the assumed encoding of the source man page, based on the
+ * directory in which it was found. The caller should attempt to recode from
+ * this to whatever encoding is expected by groff.
+ *
+ * The caller should free the returned string when it is finished with it.
+ */
+char * ATTRIBUTE_MALLOC get_page_encoding (const char *lang)
+{
+ const struct directory_entry *entry;
+ const char *dot;
+
+ if (!lang || !*lang) {
+ /* Guess based on the locale. */
+ lang = setlocale (LC_MESSAGES, NULL);
+ if (!lang)
+ return xstrdup (fallback_source_encoding);
+ }
+
+ dot = strchr (lang, '.');
+ if (dot) {
+ /* The FHS has the worst specification of what's supposed to
+ * go after the dot here that I've ever seen. To quote from
+ * version 2.1:
+ *
+ * "It is recommended that this be a numeric representation
+ * if possible (ISO standards, especially), not include
+ * additional punctuation symbols, and that any letters be
+ * in lowercase."
+ *
+ * Any sane standard would use directory names like
+ * de_DE.ISO-8859-1; the examples in the FHS recommend
+ * de_DE.88591 instead. Considering that there is no other
+ * conceivable use for encodings in directory names other
+ * than to pass them to iconv or similar, this is quite
+ * startlingly useless.
+ *
+ * While we now support this thanks to
+ * get_canonical_charset_name, the FHS specification is
+ * obviously wrong and I plan to petition to have it
+ * changed. I recommend ignoring this part of the FHS.
+ */
+ char *dir_encoding =
+ xstrndup (dot + 1, strcspn (dot + 1, ",@"));
+ char *canonical_dir_encoding =
+ xstrdup (get_canonical_charset_name (dir_encoding));
+ free (dir_encoding);
+ return canonical_dir_encoding;
+ }
+
+ for (entry = directory_table; entry->lang_dir; ++entry)
+ if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
+ return xstrdup (entry->source_encoding);
+
+ return xstrdup (fallback_source_encoding);
+}
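+
+/* A minimal usage sketch; the results follow from the explicit-encoding
+ * rule and the directory table above, and the caller frees each result:
+ *
+ *	char *enc = get_page_encoding ("de_DE.UTF-8");
+ *		enc is "UTF-8", taken from the directory name
+ *	free (enc);
+ *	enc = get_page_encoding ("fr");
+ *		enc is "ISO-8859-1", from the "fr" table entry
+ *	free (enc);
+ */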
+
+/* Return the canonical encoding for source man pages in the specified
+ * language. This ignores any encoding specification in the language
+ * directory name. The source encoding should be used as a basis for
+ * determining the correct roff device to use: that is, the caller should
+ * behave as if it is recoding from the page encoding to the source encoding
+ * first, although in practice it should recode directly from the page
+ * encoding to the roff encoding.
+ *
+ * You should normally only call this function if the page encoding is
+ * UTF-8 (in which case older versions of groff that lack preconv need to
+ * have the page recoded to some legacy encoding). If the page is in a
+ * legacy encoding, then attempting to recode from that to some other legacy
+ * encoding will probably do more harm than good.
+ *
+ * Here are a few concrete examples of why these distinctions are important:
+ *
+ * /usr/share/man/en_GB.UTF-8, locale C
+ * page encoding = UTF-8
+ * source encoding = ISO-8859-1
+ * roff encoding = ISO-8859-1
+ * output encoding = UTF-8
+ * UTF-8 -> iconv -> ISO-8859-1 -> groff -Tascii -> ANSI_X3.4-1968
+ *
+ * /usr/share/man/pl_PL.UTF-8, locale pl_PL.UTF-8
+ * page encoding = UTF-8
+ * source encoding = ISO-8859-2
+ * roff encoding = ISO-8859-2
+ * output encoding = ISO-8859-2
+ * UTF-8 -> iconv -> ISO-8859-2 -> groff -Tascii8
+ * -> ISO-8859-2 -> iconv -> UTF-8
+ *
+ * /usr/share/man/ja_JP.EUC-JP, locale ja_JP.UTF-8
+ * page encoding = EUC-JP
+ * source encoding = EUC-JP
+ * roff encoding = UTF-8
+ * output encoding = UTF-8
+ * EUC-JP -> iconv -> UTF-8 -> groff -Tutf8 -> UTF-8
+ *
+ * /usr/share/man/en_GB.ISO-8859-15, locale en_GB.UTF-8
+ * page encoding = ISO-8859-15
+ * source encoding = ISO-8859-15
+ * roff encoding = ISO-8859-15
+ * output encoding = ISO-8859-15
+ * ISO-8859-15 -> groff -Tascii8 -> ISO-8859-15 -> iconv -> UTF-8
+ */
+const char *get_source_encoding (const char *lang)
+{
+ const struct directory_entry *entry;
+
+ if (!lang || !*lang) {
+ /* Guess based on the locale. */
+ lang = setlocale (LC_MESSAGES, NULL);
+ if (!lang)
+ return fallback_source_encoding;
+ }
+
+ for (entry = directory_table; entry->lang_dir; ++entry)
+ if (STRNEQ (entry->lang_dir, lang, strlen (entry->lang_dir)))
+ return entry->source_encoding;
+
+ return fallback_source_encoding;
+}
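+
+/* A sketch of the distinction for the first worked example above
+ * (/usr/share/man/en_GB.UTF-8 in a C locale):
+ *
+ *	char *page_enc = get_page_encoding ("en_GB.UTF-8");
+ *		page_enc is "UTF-8", taken from the directory name
+ *	const char *src_enc = get_source_encoding ("en_GB.UTF-8");
+ *		src_enc is "ISO-8859-1", the "en" table entry (the ".UTF-8"
+ *		suffix is ignored here)
+ *	free (page_enc);
+ */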
+
+const char * ATTRIBUTE_NONNULL ((1)) ATTRIBUTE_RETURNS_NONNULL
+ get_canonical_charset_name (const char *charset)
+{
+ const struct charset_alias_entry *entry;
+ char *charset_upper = xstrdup (charset);
+ char *p;
+
+ for (p = charset_upper; *p; ++p)
+ *p = CTYPE (toupper, *p);
+
+ for (entry = charset_alias_table; entry->alias; ++entry)
+ if (STREQ (entry->alias, charset_upper)) {
+ free (charset_upper);
+ return entry->canonical_name;
+ }
+
+ free (charset_upper);
+ return charset;
+}
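+
+/* A few illustrative results ("FOO-42" is a made-up name):
+ *
+ *	get_canonical_charset_name ("88591")  returns "ISO-8859-1"
+ *	get_canonical_charset_name ("utf8")   returns "UTF-8"
+ *	get_canonical_charset_name ("FOO-42") returns "FOO-42"; names not in
+ *		the alias table are passed through unchanged
+ */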
+
+/* Return the current locale's character set. */
+const char * ATTRIBUTE_RETURNS_NONNULL get_locale_charset (void)
+{
+ const char *charset;
+ char *saved_locale;
+
+ /* We need to modify LC_CTYPE temporarily in order to look at the
+ * codeset, so save it first.
+ */
+ saved_locale = setlocale (LC_CTYPE, NULL);
+ if (saved_locale)
+ saved_locale = xstrdup (saved_locale);
+
+ setlocale (LC_CTYPE, "");
+
+ charset = locale_charset ();
+
+ /* Restore LC_CTYPE to its value on entry to this function. */
+ setlocale (LC_CTYPE, saved_locale);
+ free (saved_locale);
+
+ if (!charset || !*charset)
+ charset = "ANSI_X3.4-1968";
+ return get_canonical_charset_name (charset);
+}
+
+/* Find a locale with this character set. This is a non-portable operation,
+ * but required to make col(1) work correctly with -E. If no locale can be
+ * found, or if none needs to be set, return NULL.
+ *
+ * The caller should free the returned string when it is finished with it.
+ */
+char *find_charset_locale (const char *charset)
+{
+ const char *canonical_charset = get_canonical_charset_name (charset);
+ char *saved_locale;
+ const char supported_path[] = "/usr/share/i18n/SUPPORTED";
+ FILE *supported = NULL;
+ char *line = NULL;
+ size_t n = 0;
+ char *locale = NULL;
+
+ if (STREQ (charset, get_locale_charset ()))
+ return NULL;
+
+ saved_locale = setlocale (LC_CTYPE, NULL);
+ if (saved_locale)
+ saved_locale = xstrdup (saved_locale);
+
+ supported = fopen (supported_path, "r");
+ while (supported && getline (&line, &n, supported) >= 0) {
+ const char *space = strchr (line, ' ');
+ if (space) {
+ char *encoding = xstrdup (space + 1);
+ char *newline = strchr (encoding, '\n');
+ if (newline)
+ *newline = 0;
+ if (STREQ (canonical_charset,
+ get_canonical_charset_name (encoding))) {
+ locale = xstrndup (line, space - line);
+ /* Is this locale actually installed? */
+ if (setlocale (LC_CTYPE, locale)) {
+ free (encoding);
+ goto out;
+ } else {
+ free (locale);
+ locale = NULL;
+ }
+ }
+ free (encoding);
+ }
+ free (line);
+ line = NULL;
+ }
+
+ if (strlen (canonical_charset) >= 5 &&
+ STRNEQ (canonical_charset, "UTF-8", 5)) {
+ locale = xstrdup ("C.UTF-8");
+ if (setlocale (LC_CTYPE, locale))
+ goto out;
+ free (locale);
+ locale = xstrdup ("en_US.UTF-8");
+ if (setlocale (LC_CTYPE, locale))
+ goto out;
+ free (locale);
+ locale = NULL;
+ }
+
+out:
+ free (line);
+ setlocale (LC_CTYPE, saved_locale);
+ free (saved_locale);
+ if (supported)
+ fclose (supported);
+ return locale;
+}
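+
+/* A minimal usage sketch for the col(1) -E case mentioned above; the
+ * setenv () call is illustrative, since the caller decides how to apply
+ * the locale to the col process:
+ *
+ *	char *col_locale = find_charset_locale ("UTF-8");
+ *	if (col_locale)
+ *		setenv ("LC_CTYPE", col_locale, 1);
+ *	... run col -E ...
+ *	free (col_locale);
+ */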
+
+/* Can we take this input encoding and produce this output encoding, perhaps
+ * with the help of some iconv pipes? */
+static bool ATTRIBUTE_PURE compatible_encodings (const char *input,
+ const char *output)
+{
+ if (STREQ (input, output))
+ return true;
+
+ /* If the input is ASCII, recoding should be easy. Try it. */
+ if (STREQ (input, "ANSI_X3.4-1968"))
+ return true;
+
+ /* If the input is UTF-8, it's either a simple recoding of whatever
+ * we want or else it probably won't work at all no matter what we
+ * do. We might as well try it for now.
+ */
+ if (STREQ (input, "UTF-8"))
+ return true;
+
+ /* If the output is ASCII, this is probably because the caller
+ * explicitly asked for it, so we have little choice but to try.
+ */
+ if (STREQ (output, "ANSI_X3.4-1968"))
+ return true;
+
+#ifdef MULTIBYTE_GROFF
+ /* Special case for some CJK UTF-8 locales, which take UTF-8 input
+ * recoded from EUC-JP (etc.) and produce UTF-8 output. This is
+ * rather filthy.
+ */
+ if ((STREQ (input, "BIG5") || STREQ (input, "BIG5HKSCS") ||
+ STREQ (input, "EUC-JP") ||
+ STREQ (input, "EUC-CN") || STREQ (input, "GBK") ||
+ STREQ (input, "EUC-KR") ||
+ STREQ (input, "EUC-TW")) &&
+ STREQ (output, "UTF-8"))
+ return true;
+#endif /* MULTIBYTE_GROFF */
+
+ return false;
+}
+
+/* Return the default groff device for the given character set. This may be
+ * overridden by the user. The page's source encoding is needed to ensure
+ * that the device is compatible: consider ru_RU.UTF-8, which needs ascii8
+ * and a trailing iconv pipe to recode to UTF-8.
+ *
+ * All this encoding compatibility stuff feels like a slightly nasty hack,
+ * but I haven't yet come up with a cleaner way to do it.
+ */
+const char *get_default_device (const char *charset_from_locale,
+ const char *source_encoding)
+{
+ const struct charset_entry *entry;
+
+ if (get_groff_preconv ()) {
+ /* ASCII is a special case, and the only way we can get
+ * things like bullet marks to come out right is by using
+ * the ascii device. People using such a basic locale
+ * probably don't want anything fancy anyway.
+ */
+ if (charset_from_locale &&
+ STREQ (charset_from_locale, "ANSI_X3.4-1968"))
+ return "ascii";
+ else
+ return "utf8";
+ }
+
+ if (!charset_from_locale)
+ return fallback_default_device;
+
+ for (entry = charset_table; entry->charset_from_locale; ++entry) {
+ if (STREQ (entry->charset_from_locale, charset_from_locale)) {
+ const char *roff_encoding =
+ get_roff_encoding (entry->default_device,
+ source_encoding);
+ if (compatible_encodings (source_encoding,
+ roff_encoding))
+ return entry->default_device;
+ }
+ }
+
+ return fallback_default_device;
+}
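+
+/* A worked example of the ru_RU.UTF-8 case from the comment above,
+ * assuming MULTIBYTE_GROFF and no preconv:
+ *
+ *	get_default_device ("UTF-8", "KOI8-R")
+ *
+ * finds the "UTF-8" -> "utf8" entry in charset_table, but the utf8
+ * device expects ISO-8859-1 input, which is not compatible with KOI8-R,
+ * so the fallback "ascii8" is returned; a trailing iconv pipe then
+ * recodes the output to UTF-8 for display.
+ */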
+
+/* Is this a known *roff device name? */
+bool ATTRIBUTE_PURE is_roff_device (const char *device)
+{
+ const struct device_entry *entry;
+
+ for (entry = device_table; entry->roff_device; ++entry) {
+ if (STREQ (entry->roff_device, device))
+ return true;
+ }
+
+ return false;
+}
+
+/* Find the input encoding expected by groff for the given device.  (The
+ * LESSCHARSET environment variable is handled separately; see
+ * get_less_charset below.)
+ */
+const char *get_roff_encoding (const char *device, const char *source_encoding)
+{
+ const struct device_entry *entry;
+ bool found = false;
+ const char *roff_encoding = NULL;
+
+ if (device) {
+ for (entry = device_table; entry->roff_device; ++entry) {
+ if (STREQ (entry->roff_device, device)) {
+ found = true;
+ roff_encoding = entry->roff_encoding;
+ break;
+ }
+ }
+ }
+
+ if (!found)
+ roff_encoding = fallback_roff_encoding;
+
+#ifdef MULTIBYTE_GROFF
+ /* An ugly special case is needed here. The utf8 device normally
+ * takes ISO-8859-1 input. However, with the multibyte patch, when
+ * recoding from CJK character sets it takes UTF-8 input instead.
+ * This is evil, but there's not much that can be done about it
+ * apart from waiting for groff 2.0.
+ */
+ if (device && STREQ (device, "utf8") && !get_groff_preconv () &&
+ STREQ (get_locale_charset (), "UTF-8")) {
+ const char *ctype = setlocale (LC_CTYPE, NULL);
+ if (STRNEQ (ctype, "ja_JP", 5) ||
+ STRNEQ (ctype, "ko_KR", 5) ||
+ STRNEQ (ctype, "zh_CN", 5) ||
+ STRNEQ (ctype, "zh_HK", 5) ||
+ STRNEQ (ctype, "zh_SG", 5) ||
+ STRNEQ (ctype, "zh_TW", 5))
+ roff_encoding = "UTF-8";
+ }
+#endif /* MULTIBYTE_GROFF */
+
+ return roff_encoding ? roff_encoding : source_encoding;
+}
+
+/* Find the output encoding that this device will produce, or NULL if it
+ * will simply pass through the input encoding.
+ */
+const char * ATTRIBUTE_PURE get_output_encoding (const char *device)
+{
+ const struct device_entry *entry;
+
+ for (entry = device_table; entry->roff_device; ++entry)
+ if (STREQ (entry->roff_device, device))
+ return entry->output_encoding;
+
+ return NULL;
+}
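+
+/* A sketch of how these functions fit together for one page, mirroring
+ * the worked examples before get_source_encoding (); the variable names
+ * and the recoding steps are illustrative rather than the exact man(1)
+ * code path:
+ *
+ *	const char *locale_cs = get_locale_charset ();
+ *	char *page_enc = get_page_encoding (lang_dir);
+ *	const char *src_enc = get_source_encoding (lang_dir);
+ *	const char *device = get_default_device (locale_cs, src_enc);
+ *	const char *roff_enc = get_roff_encoding (device, src_enc);
+ *	const char *out_enc = get_output_encoding (device);
+ *
+ *	recode from page_enc to roff_enc (iconv) if they differ, run
+ *	groff -T<device>, then recode from out_enc (or roff_enc if out_enc
+ *	is NULL) to locale_cs; finally free (page_enc).
+ */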
+
+/* Return the value of LESSCHARSET appropriate for this locale. */
+const char * ATTRIBUTE_PURE get_less_charset (const char *charset_from_locale)
+{
+ const struct less_charset_entry *entry;
+
+ if (charset_from_locale) {
+ for (entry = less_charset_table; entry->charset_from_locale;
+ ++entry)
+ if (STREQ (entry->charset_from_locale,
+ charset_from_locale))
+ return entry->less_charset;
+ }
+
+ return fallback_less_charset;
+}
+
+/* Return the value of JLESSCHARSET appropriate for this locale. May return
+ * NULL.
+ */
+const char * ATTRIBUTE_PURE get_jless_charset (const char *charset_from_locale)
+{
+ const struct less_charset_entry *entry;
+
+ if (charset_from_locale) {
+ for (entry = less_charset_table; entry->charset_from_locale;
+ ++entry)
+ if (STREQ (entry->charset_from_locale,
+ charset_from_locale))
+ return entry->jless_charset;
+ }
+
+ return NULL;
+}
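+
+/* A minimal sketch of exporting these before starting the pager,
+ * assuming plain setenv () (man sets them in the pager's environment):
+ *
+ *	const char *locale_cs = get_locale_charset ();
+ *	const char *jless = get_jless_charset (locale_cs);
+ *
+ *	setenv ("LESSCHARSET", get_less_charset (locale_cs), 1);
+ *	if (jless)
+ *		setenv ("JLESSCHARSET", jless, 1);
+ */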