diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 19:44:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 19:44:05 +0000 |
commit | d318611dd6f23fcfedd50e9b9e24620b102ba96a (patch) | |
tree | 8b9eef82ca40fdd5a8deeabf07572074c236095d /src/preproc/preconv | |
parent | Initial commit. (diff) | |
download | groff-f22bf21391d2b916c7303c565592ae6e99efbb58.tar.xz groff-f22bf21391d2b916c7303c565592ae6e99efbb58.zip |
Adding upstream version 1.23.0. (refs: upstream/1.23.0, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/preproc/preconv')
-rw-r--r-- | src/preproc/preconv/preconv.1.man | 559 | ||||
-rw-r--r-- | src/preproc/preconv/preconv.am | 37 | ||||
-rw-r--r-- | src/preproc/preconv/preconv.cpp | 1318 | ||||
-rwxr-xr-x | src/preproc/preconv/tests/do-not-seek-the-unseekable.sh | 59 | ||||
-rwxr-xr-x | src/preproc/preconv/tests/smoke-test.sh | 88 |
5 files changed, 2061 insertions, 0 deletions
diff --git a/src/preproc/preconv/preconv.1.man b/src/preproc/preconv/preconv.1.man new file mode 100644 index 0000000..1535bae --- /dev/null +++ b/src/preproc/preconv/preconv.1.man @@ -0,0 +1,559 @@ +.TH preconv @MAN1EXT@ "@MDATE@" "groff @VERSION@" +.SH Name +preconv \- prepare files for typesetting with +.I groff +. +. +.\" ==================================================================== +.\" Legal Terms +.\" ==================================================================== +.\" +.\" Copyright (C) 2006-2020 Free Software Foundation, Inc. +.\" +.\" Permission is granted to make and distribute verbatim copies of this +.\" manual provided the copyright notice and this permission notice are +.\" preserved on all copies. +.\" +.\" Permission is granted to copy and distribute modified versions of +.\" this manual under the conditions for verbatim copying, provided that +.\" the entire resulting derived work is distributed under the terms of +.\" a permission notice identical to this one. +.\" +.\" Permission is granted to copy and distribute translations of this +.\" manual into another language, under the above conditions for +.\" modified versions, except that this permission notice may be +.\" included in translations approved by the Free Software Foundation +.\" instead of in the original English. +. +. +.\" Save and disable compatibility mode (for, e.g., Solaris 10/11). +.do nr *groff_preconv_1_man_C \n[.cp] +.cp 0 +. +.\" Define fallback for groff 1.23's MR macro if the system lacks it. +.nr do-fallback 0 +.if !\n(.f .nr do-fallback 1 \" mandoc +.if \n(.g .if !d MR .nr do-fallback 1 \" older groff +.if !\n(.g .nr do-fallback 1 \" non-groff *roff +.if \n[do-fallback] \{\ +. de MR +. ie \\n(.$=1 \ +. I \%\\$1 +. el \ +. IR \%\\$1 (\\$2)\\$3 +. . +.\} +.rr do-fallback +. +. +.\" ==================================================================== +.SH Synopsis +.\" ==================================================================== +. 
+.SY preconv +.RB [ \-dr ] +.RB [ \-D\~\c +.IR fallback-encoding ] +.RB [ \-e\~\c +.IR encoding ] +.RI [ file\~ .\|.\|.] +.YS +. +. +.SY preconv +.B \-h +. +.SY preconv +.B \-\-help +.YS +. +. +.SY preconv +.B \-v +. +.SY preconv +.B \-\-version +.YS +. +. +.\" ==================================================================== +.SH Description +.\" ==================================================================== +. +.I preconv +reads each +.IR file , +converts its encoded characters to a form +.MR @g@troff @MAN1EXT@ +can interpret, +and sends the result to the standard output stream. +. +Currently, +this means that code points in the range 0\[en]127 +(in US-ASCII, +ISO\~8859, +or Unicode) +remain as-is and the remainder are converted to the +.I groff +special character form +.RB \[lq] \[rs][\c +.BI u XXXX ]\c +\[rq], +where +.I XXXX +is a hexadecimal number of four to six digits corresponding to a Unicode +code point. +. +By default, +.I preconv +also inserts a +.I roff +.B .lf +request at the beginning of each +.IR file , +identifying it for the benefit of later processing +(including diagnostic messages); +the +.B \-r +option suppresses this behavior. +. +. +.PP +In typical usage scenarios, +.I preconv +need not be run directly; +instead it should be invoked with the +.B \-k +or +.B \-K +options of +.IR groff . +. +If no +.I file +operands are given on the command line, +or if +.I file +is +.RB \[lq] \- \[rq], +the standard input stream is read. +. +. +.PP +.I preconv +tries to find the input encoding with the following algorithm, +stopping at the first success. +. +. +.IP 1. 4n +If the input encoding has been explicitly specified with option +.BR \-e , +use it. +. +. +.IP 2. +If the input starts with a Unicode Byte Order Mark, +determine the encoding as UTF-8, +UTF-16, +or UTF-32 accordingly. +. +. +.IP 3. 
+If the input stream is seekable, +check the first and second input lines for a recognized GNU\~Emacs +file-local variable identifying the character encoding, +here referred to as the \[lq]coding tag\[rq] for brevity. +. +If found, +use it. +. +. +.IP 4. +If the input stream is seekable, +and if the +.I uchardet +library is available on the system, +use it to try to infer the encoding of the file. +. +. +.IP 5. +If the +.B \-D +option specifies an encoding, +use it. +. +. +.IP 6. +Use the encoding specified by the current locale +.RI ( LC_CTYPE ), +unless the locale is +\[lq]C\[rq], +\[lq]POSIX\[rq], +or empty, +in which case assume Latin-1 +(ISO\~8859-1). +. +. +.PP +The coding tag and +.I uchardet +methods in the above procedure rely upon a seekable input stream; +when +.I preconv +reads from a pipe, +the stream is not seekable, +and these detection methods are skipped. +. +If character encoding detection of your input files is unreliable, +arrange for one of the other methods to succeed by using +.IR preconv 's +.B \-D +or +.B \-e +options, +or by configuring your locale appropriately. +. +.I groff +also supports a +.I \%GROFF_ENCODING +environment variable, +which can be overridden by its +.B \-K +option. +. +Valid values for +(or parameters to) +all of these are enumerated in the lists of recognized coding tags in +the next subsection, +and are further influenced by +.I iconv +library support. +. +. +.\" ==================================================================== +.SS "Coding tags" +.\" ==================================================================== +. +Text editors that support more than a single character encoding need +tags within the input files to mark the file's encoding. +. +While it is possible to guess the right input encoding with the help of +heuristics that are reliable for a preponderance of natural language +texts, +they are not absolutely reliable. +. 
+Heuristics can fail on inputs that are too short or don't represent a +natural language. +. +. +.PP +Consequently, +.I preconv +supports the coding tag convention used by GNU\~Emacs +(with some restrictions). +. +This notation appears in specially marked regions of an input file +designated for \[lq]file-local variables\[rq]. +. +. +.PP +.I preconv +interprets the following syntax if it occurs in a +.I roff +comment +in the first or second line of the input file. +. +Both \[lq]\[rs]"\[rq] and \[lq]\[rs]#\[rq] comment forms are recognized, +but the control +(or no-break control) +character must be the default and must begin the line. +. +Similarly, +the escape character must be the default. +. +. +.RS +.EX +.B \-*\- \c +.RB [.\|.\|. ; ]\~\c +.B coding: \c +.I encoding\c +.RB [ ;\~ .\|.\|.\&]\~\c +.B \-*\- +.EE +.RE +. +. +.PP +The only variable +.I preconv +interprets is \[lq]coding\[rq], +which can take the values listed below. +. +. +.PP +The following list comprises all MIME \[lq]charset\[rq] parameter values +recognized, +case-insensitively, +by +.IR preconv . +. +.RS +\%big5, +\%cp1047, +\%euc\-jp, +\%euc\-kr, +\%gb2312, +\%iso\-8859\-1, +\%iso\-8859\-2, +\%iso\-8859\-5, +\%iso\-8859\-7, +\%iso\-8859\-9, +\%iso\-8859\-13, +\%iso\-8859\-15, +\%koi8\-r, +\%us\-ascii, +\%utf\-8, +\%utf\-16, +\%utf\-16be, +\%utf\-16le +.RE +. +. +.PP +In addition, +the following list of other coding tags is recognized, +each of which is mapped to an appropriate value from the list above. +. 
+.RS +\%ascii, +\%chinese\-big5, +\%chinese\-euc, +\%chinese\-iso\-8bit, +\%cn\-big5, +\%cn\-gb, +\%cn\-gb\-2312, +\%cp878, +\%csascii, +\%csisolatin1, +\%cyrillic\-iso\-8bit, +\%cyrillic\-koi8, +\%euc\-china, +\%euc\-cn, +\%euc\-japan, +\%euc\-japan\-1990, +\%euc\-korea, +\%greek\-iso\-8bit, +\%iso\-10646/utf8, +\%iso\-10646/utf\-8, +\%iso\-latin\-1, +\%iso\-latin\-2, +\%iso\-latin\-5, +\%iso\-latin\-7, +\%iso\-latin\-9, +\%japanese\-euc, +\%japanese\-iso\-8bit, +\%jis8, +\%koi8, +\%korean\-euc, +\%korean\-iso\-8bit, +\%latin\-0, +\%latin1, +\%latin\-1, +\%latin\-2, +\%latin\-5, +\%latin\-7, +\%latin\-9, +\%mule\-utf\-8, +\%mule\-utf\-16, +\%mule\-utf\-16be, +\%mule\-utf\-16\-be, +\%mule\-utf\-16be\-with\-signature, +\%mule\-utf\-16le, +\%mule\-utf\-16\-le, +\%mule\-utf\-16le\-with\-signature, +\%utf8, +\%utf\-16\-be, +\%utf\-16\-be\-with\-signature, +\%utf\-16be\-with\-signature, +\%utf\-16\-le, +\%utf\-16\-le\-with\-signature, +\%utf\-16le\-with\-signature +.RE +. +. +.PP +Trailing +\[lq]\-dos\[rq], +\[lq]\-unix\[rq], +and +\[lq]\-mac\[rq] +suffixes on coding tags +(which indicate the end-of-line convention used in the file) +are disregarded for the purpose of comparison with the above tags. +. +. +.\" ==================================================================== +.SS "\f[I]iconv\f[] support" +.\" ==================================================================== +. +While +.I preconv +recognizes all of the coding tags listed above, +it is capable on its own of interpreting only three encodings: +Latin-1, +code page 1047, +and UTF-8. +. +If +.I iconv +support is configured at compile time and available at run time, +all others are passed to +.I iconv +library functions, +which may recognize many additional encoding strings. +. +The command +.RB \[lq] preconv\~\-v \[rq] +discloses whether +.I iconv +support is configured. +. +. 
+.PP +The use of +.I iconv +means that characters in the input that encode invalid code points for +that encoding may be dropped from the output stream or mapped to the +Unicode replacement character +(U+FFFD). +. +Compare the following examples using the input \[lq]caf\['e]\[rq] +(note the \[lq]e\[rq] with an acute accent), +which due to its short length challenges inference of the encoding used. +. +.RS +.EX +printf \[aq]caf\[rs]351\[rs]n\[aq] | LC_ALL=en_US.UTF\-8 preconv +printf \[aq]caf\[rs]351\[rs]n\[aq] | preconv \-e us\-ascii +printf \[aq]caf\[rs]351\[rs]n\[aq] | preconv \-e latin\-1 +.EE +.RE +. +The fate of the accented \[lq]e\[rq] differs in each case. +. +In the first, +.I uchardet +fails to detect an encoding +(though the library on your system may behave differently) +and +.I preconv +falls back to the locale settings, +where octal 351 starts an incomplete UTF-8 sequence and results in the +Unicode replacement character. +. +In the second, +it is not a representable character in the declared input encoding of +US-ASCII and is discarded by +.IR iconv . +. +In the last, +it is correctly detected and mapped. +. +. +.\" ==================================================================== +.SS Limitations +.\" ==================================================================== +. +.I preconv +cannot perform any transformation on input that it cannot see. +. +Examples include files that are interpolated by preprocessors that run +subsequently, +including +.MR @g@soelim @MAN1EXT@ ; +files included by +.I @g@troff +itself through +.RB \[lq] so \[rq] +and similar requests; +and string definitions passed to +.I @g@troff +through its +.B \-d +command-line option. +. +. +.P +.I preconv +assumes that its input uses the default escape character, +a backslash +.BR \[rs] , +and writes special character escape sequences accordingly. +. +. 
+.\" ==================================================================== +.SH Options +.\" ==================================================================== +. +.B \-h +and +.B \-\-help +display a usage message, +while +.B \-v +and +.B \-\-version +show version information; +all exit afterward. +. +. +.TP +.B \-d +Emit debugging messages to the standard error stream. +. +. +.TP +.BI \-D\~ fallback-encoding +Report +.I fallback-encoding +if all detection methods fail. +. +. +.TP +.BI \-e\~ encoding +Skip detection and assume +.IR encoding ; +see +.IR groff 's +.B \-K +option. +. +. +.TP +.B \-r +Write files \[lq]raw\[rq]; +do not add +.B .lf +requests. +. +. +.\" ==================================================================== +.SH "See also" +.\" ==================================================================== +. +.MR groff @MAN1EXT@ , +.MR iconv 3 , +.MR locale 7 +. +. +.\" Restore compatibility mode (for, e.g., Solaris 10/11). +.cp \n[*groff_preconv_1_man_C] +.do rr *groff_preconv_1_man_C +. +. +.\" Local Variables: +.\" fill-column: 72 +.\" mode: nroff +.\" End: +.\" vim: set filetype=groff textwidth=72: diff --git a/src/preproc/preconv/preconv.am b/src/preproc/preconv/preconv.am new file mode 100644 index 0000000..199ff66 --- /dev/null +++ b/src/preproc/preconv/preconv.am @@ -0,0 +1,37 @@ +# Copyright (C) 2014-2020 Free Software Foundation, Inc. +# +# This file is part of groff. +# +# groff is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# groff is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +bin_PROGRAMS += preconv +preconv_LDADD = libgroff.a $(LIBM) $(LIBICONV) $(UCHARDET_LIBS) \ + lib/libgnu.a +preconv_SOURCES = src/preproc/preconv/preconv.cpp +preconv_CPPFLAGS = $(AM_CPPFLAGS) $(UCHARDET_CFLAGS) +man1_MANS += src/preproc/preconv/preconv.1 +EXTRA_DIST += src/preproc/preconv/preconv.1.man + +preconv_TESTS = \ + src/preproc/preconv/tests/do-not-seek-the-unseekable.sh \ + src/preproc/preconv/tests/smoke-test.sh +TESTS += $(preconv_TESTS) +EXTRA_DIST += $(preconv_TESTS) + + +# Local Variables: +# fill-column: 72 +# mode: makefile-automake +# End: +# vim: set autoindent filetype=automake textwidth=72: diff --git a/src/preproc/preconv/preconv.cpp b/src/preproc/preconv/preconv.cpp new file mode 100644 index 0000000..d403425 --- /dev/null +++ b/src/preproc/preconv/preconv.cpp @@ -0,0 +1,1318 @@ +/* Copyright (C) 2005-2020 Free Software Foundation, Inc. + Written by Werner Lemberg (wl@gnu.org) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include "lib.h" + +#include <assert.h> +#include <stdlib.h> +#include <errno.h> +#include <sys/stat.h> +#ifdef HAVE_UCHARDET +#include <uchardet/uchardet.h> +#endif + +#include "errarg.h" +#include "error.h" +#include "localcharset.h" +#include "nonposix.h" +#include "stringclass.h" +#include "lf.h" + +#include <locale.h> + +#if HAVE_ICONV +# include <iconv.h> +# ifdef WORDS_BIGENDIAN +# define UNICODE "UTF-32BE" +# else +# define UNICODE "UTF-32LE" +# endif +#endif + +#define MAX_VAR_LEN 100 + +extern "C" const char *Version_string; + +char fallback_encoding[MAX_VAR_LEN]; +char user_encoding[MAX_VAR_LEN]; +char encoding_string[MAX_VAR_LEN]; +bool is_debugging = false; +int raw_flag = 0; + +struct conversion { + const char *from; + const char *to; +}; + +// The official list of MIME tags can be found at +// +// http://www.iana.org/assignments/character-sets +// +// For encodings which don't have a MIME tag we use GNU iconv's encoding +// names (which also work with the portable GNU libiconv package). They +// are marked with '*'. +// +// Encodings specific to XEmacs and Emacs are marked as such; no mark means +// that they are used by both Emacs and XEmacs. +// +// Encodings marked with '--' are special to Emacs, XEmacs, or other +// applications and shouldn't be used for data exchange. +// +// 'Not covered' means that the encoding can be handled neither by GNU iconv +// nor by libiconv, or just one of them has support for it. +// +// A special case is VIQR encoding: Despite of having a MIME tag it is +// missing in both libiconv 1.10 and iconv (coming with GNU libc 2.3.6). +// +// Finally, we add all aliases of GNU iconv for 'ascii', 'latin1', and +// 'utf8' to catch those encoding names before iconv is called. +// +// Note that most entries are commented out -- only a small, (rather) +// reliable and stable subset of encodings is recognized (for coding tags) +// which are still in greater use today (January 2006). 
Most notably, all +// Windows-specific encodings are not selected because they lack stability: +// Microsoft has changed the mappings instead of creating new versions. +// +// Please contact the groff list if you find the selection inadequate. + +static const conversion +emacs_to_mime[] = { + {"ascii", "US-ASCII"}, // Emacs + {"big5", "Big5"}, + {"chinese-big5", "Big5"}, // Emacs + {"chinese-euc", "GB2312"}, // XEmacs + {"chinese-iso-8bit", "GB2312"}, // Emacs + {"cn-big5", "Big5"}, + {"cn-gb", "GB2312"}, // Emacs + {"cn-gb-2312", "GB2312"}, + {"cp878", "KOI8-R"}, // Emacs + {"cp1047", "CP1047"}, // EBCDIC + {"csascii", "US-ASCII"}, // alias + {"csisolatin1", "ISO-8859-1"}, // alias + {"cyrillic-iso-8bit", "ISO-8859-5"}, // Emacs + {"cyrillic-koi8", "KOI8-R"}, // not KOI8!, Emacs + {"euc-china", "GB2312"}, // Emacs + {"euc-cn", "GB2312"}, // Emacs + {"euc-japan", "EUC-JP"}, + {"euc-japan-1990", "EUC-JP"}, // Emacs + {"euc-jp", "EUC-JP"}, + {"euc-korea", "EUC-KR"}, + {"euc-kr", "EUC-KR"}, + {"gb2312", "GB2312"}, + {"greek-iso-8bit", "ISO-8859-7"}, + {"iso-10646/utf8", "UTF-8"}, // alias + {"iso-10646/utf-8", "UTF-8"}, // alias + {"iso-8859-1", "ISO-8859-1"}, + {"iso-8859-13", "ISO-8859-13"}, // Emacs + {"iso-8859-15", "ISO-8859-15"}, + {"iso-8859-2", "ISO-8859-2"}, + {"iso-8859-5", "ISO-8859-5"}, + {"iso-8859-7", "ISO-8859-7"}, + {"iso-8859-9", "ISO-8859-9"}, + {"iso-latin-1", "ISO-8859-1"}, + {"iso-latin-2", "ISO-8859-2"}, // Emacs + {"iso-latin-5", "ISO-8859-9"}, // Emacs + {"iso-latin-7", "ISO-8859-13"}, // Emacs + {"iso-latin-9", "ISO-8859-15"}, // Emacs + {"japanese-iso-8bit", "EUC-JP"}, // Emacs + {"japanese-euc", "EUC-JP"}, // XEmacs + {"jis8", "EUC-JP"}, // XEmacs + {"koi8", "KOI8-R"}, // not KOI8!, Emacs + {"koi8-r", "KOI8-R"}, + {"korean-euc", "EUC-KR"}, // XEmacs + {"korean-iso-8bit", "EUC-KR"}, // Emacs + {"latin1", "ISO-8859-1"}, // alias + {"latin-0", "ISO-8859-15"}, // Emacs + {"latin-1", "ISO-8859-1"}, // Emacs + {"latin-2", "ISO-8859-2"}, // Emacs + 
{"latin-5", "ISO-8859-9"}, // Emacs + {"latin-7", "ISO-8859-13"}, // Emacs + {"latin-9", "ISO-8859-15"}, // Emacs + {"mule-utf-16", "UTF-16"}, // Emacs + {"mule-utf-16be", "UTF-16BE"}, // Emacs + {"mule-utf-16-be", "UTF-16BE"}, // Emacs + {"mule-utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE + {"mule-utf-16le", "UTF-16LE"}, // Emacs + {"mule-utf-16-le", "UTF-16LE"}, // Emacs + {"mule-utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE + {"mule-utf-8", "UTF-8"}, // Emacs + {"us-ascii", "US-ASCII"}, // Emacs + {"utf8", "UTF-8"}, // alias + {"utf-16", "UTF-16"}, // Emacs + {"utf-16be", "UTF-16BE"}, // Emacs + {"utf-16-be", "UTF-16BE"}, // Emacs + {"utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE + {"utf-16-be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE + {"utf-16le", "UTF-16LE"}, // Emacs + {"utf-16-le", "UTF-16LE"}, // Emacs + {"utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE + {"utf-16-le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE + {"utf-8", "UTF-8"}, // Emacs + +// {"alternativnyj", ""}, // ? 
+// {"arabic-iso-8bit", "ISO-8859-6"}, // Emacs +// {"binary", ""}, // -- +// {"chinese-hz", "HZ-GB-2312"}, // Emacs +// {"chinese-iso-7bit", "ISO-2022-CN"}, // Emacs +// {"chinese-iso-8bit-with-esc", ""}, // -- +// {"compound-text", ""}, // -- +// {"compound-text-with-extension", ""}, // -- +// {"cp1125", "cp1125"}, // * +// {"cp1250", "windows-1250"},// Emacs +// {"cp1251", "windows-1251"},// Emacs +// {"cp1252", "windows-1252"},// Emacs +// {"cp1253", "windows-1253"},// Emacs +// {"cp1254", "windows-1254"},// Emacs +// {"cp1255", "windows-1255"},// Emacs +// {"cp1256", "windows-1256"},// Emacs +// {"cp1257", "windows-1257"},// Emacs +// {"cp1258", "windows-1258"},// Emacs +// {"cp437", "cp437"}, // Emacs +// {"cp720", ""}, // not covered +// {"cp737", "cp737"}, // *, Emacs +// {"cp775", "cp775"}, // Emacs +// {"cp850", "cp850"}, // Emacs +// {"cp851", "cp851"}, // Emacs +// {"cp852", "cp852"}, // Emacs +// {"cp855", "cp855"}, // Emacs +// {"cp857", "cp857"}, // Emacs +// {"cp860", "cp860"}, // Emacs +// {"cp861", "cp861"}, // Emacs +// {"cp862", "cp862"}, // Emacs +// {"cp863", "cp863"}, // Emacs +// {"cp864", "cp864"}, // Emacs +// {"cp865", "cp865"}, // Emacs +// {"cp866", "cp866"}, // Emacs +// {"cp866u", "cp1125"}, // *, Emacs +// {"cp869", "cp869"}, // Emacs +// {"cp874", "cp874"}, // *, Emacs +// {"cp932", "cp932"}, // *, Emacs +// {"cp936", "cp936"}, // Emacs +// {"cp949", "cp949"}, // *, Emacs +// {"cp950", "cp950"}, // *, Emacs +// {"ctext", ""}, // -- +// {"ctext-no-compositions", ""}, // -- +// {"ctext-with-extensions", ""}, // -- +// {"cyrillic-alternativnyj", ""}, // ?, Emacs +// {"cyrillic-iso-8bit-with-esc", ""}, // -- +// {"cyrillic-koi8-t", "KOI8-T"}, // *, Emacs +// {"devanagari", ""}, // not covered +// {"dos", ""}, // -- +// {"emacs-mule", ""}, // -- +// {"euc-jisx0213", "EUC-JISX0213"},// *, XEmacs? +// {"euc-jisx0213-with-esc", ""}, // XEmacs? 
+// {"euc-taiwan", "EUC-TW"}, // *, Emacs +// {"euc-tw", "EUC-TW"}, // *, Emacs +// {"georgian-ps", "GEORGIAN-PS"}, // *, Emacs +// {"greek-iso-8bit-with-esc", ""}, // -- +// {"hebrew-iso-8bit", "ISO-8859-8"}, // Emacs +// {"hebrew-iso-8bit-with-esc", ""}, // -- +// {"hz", "HZ-GB-2312"}, +// {"hz-gb-2312", "HZ-GB-2312"}, +// {"in-is13194", ""}, // not covered +// {"in-is13194-devanagari", ""}, // not covered +// {"in-is13194-with-esc", ""}, // -- +// {"iso-2022-7", ""}, // XEmacs? +// {"iso-2022-7bit", ""}, // -- +// {"iso-2022-7bit-lock", ""}, // -- +// {"iso-2022-7bit-lock-ss2", ""}, // -- +// {"iso-2022-7bit-ss2", ""}, // -- +// {"iso-2022-8", ""}, // XEmacs? +// {"iso-2022-8bit", ""}, // XEmacs? +// {"iso-2022-8bit-lock", ""}, // XEmacs? +// {"iso-2022-8bit-lock-ss2", ""}, // XEmacs? +// {"iso-2022-8bit-ss2", ""}, // -- +// {"iso-2022-cjk", ""}, // -- +// {"iso-2022-cn", "ISO-2022-CN"}, // Emacs +// {"iso-2022-cn-ext", "ISO-2022-CN-EXT"},// Emacs +// {"iso-2022-int-1", ""}, // -- +// {"iso-2022-jp", "ISO-2022-JP"}, +// {"iso-2022-jp-1978-irv", "ISO-2022-JP"}, +// {"iso-2022-jp-2", "ISO-2022-JP-2"}, +// {"iso-2022-jp-3", "ISO-2022-JP-3"},// *, XEmacs? +// {"iso-2022-jp-3-compatible", ""}, // XEmacs? +// {"iso-2022-jp-3-strict", "ISO-2022-JP-3"},// *, XEmacs? +// {"iso-2022-kr", "ISO-2022-KR"}, +// {"iso-2022-lock", ""}, // XEmacs? 
+// {"iso-8859-10", "ISO-8859-10"}, // Emacs +// {"iso-8859-11", "ISO-8859-11"}, // *, Emacs +// {"iso-8859-14", "ISO-8859-14"}, // Emacs +// {"iso-8859-16", "ISO-8859-16"}, +// {"iso-8859-3", "ISO-8859-3"}, +// {"iso-8859-4", "ISO-8859-4"}, +// {"iso-8859-6", "ISO-8859-6"}, +// {"iso-8859-8", "ISO-8859-8"}, +// {"iso-8859-8-e", "ISO-8859-8"}, +// {"iso-8859-8-i", "ISO-8859-8"}, // Emacs +// {"iso-latin-10", "ISO-8859-16"}, // Emacs +// {"iso-latin-1-with-esc", ""}, // -- +// {"iso-latin-2-with-esc", ""}, // -- +// {"iso-latin-3", "ISO-8859-3"}, // Emacs +// {"iso-latin-3-with-esc", ""}, // -- +// {"iso-latin-4", "ISO-8859-4"}, // Emacs +// {"iso-latin-4-with-esc", ""}, // -- +// {"iso-latin-5-with-esc", ""}, // -- +// {"iso-latin-6", "ISO-8859-10"}, // Emacs +// {"iso-latin-8", "ISO-8859-14"}, // Emacs +// {"iso-safe", ""}, // -- +// {"japanese-iso-7bit-1978-irv", "ISO-2022-JP"}, // Emacs +// {"japanese-iso-8bit-with-esc", ""}, // -- +// {"japanese-shift-jis", "Shift_JIS"}, // Emacs +// {"japanese-shift-jisx0213", ""}, // XEmacs? +// {"jis7", "ISO-2022-JP"}, // Xemacs +// {"junet", "ISO-2022-JP"}, +// {"koi8-t", "KOI8-T"}, // *, Emacs +// {"koi8-u", "KOI8-U"}, // Emacs +// {"korean-iso-7bit-lock", "ISO-2022-KR"}, +// {"korean-iso-8bit-with-esc", ""}, // -- +// {"lao", ""}, // not covered +// {"lao-with-esc", ""}, // -- +// {"latin-10", "ISO-8859-16"}, // Emacs +// {"latin-3", "ISO-8859-3"}, // Emacs +// {"latin-4", "ISO-8859-4"}, // Emacs +// {"latin-6", "ISO-8859-10"}, // Emacs +// {"latin-8", "ISO-8859-14"}, // Emacs +// {"mac", ""}, // -- +// {"mac-roman", "MACINTOSH"}, // Emacs +// {"mik", ""}, // not covered +// {"next", "NEXTSTEP"}, // *, Emacs +// {"no-conversion", ""}, // -- +// {"old-jis", "ISO-2022-JP"}, +// {"pt154", "PT154"}, // Emacs +// {"raw-text", ""}, // -- +// {"ruscii", "cp1125"}, // *, Emacs +// {"shift-jis", "Shift_JIS"}, // XEmacs +// {"shift_jis", "Shift_JIS"}, +// {"shift_jisx0213", "Shift_JISX0213"},// *, XEmacs? 
+// {"sjis", "Shift_JIS"}, // Emacs +// {"tcvn", "TCVN"}, // *, Emacs +// {"tcvn-5712", "TCVN"}, // *, Emacs +// {"thai-tis620", "TIS-620"}, +// {"thai-tis620-with-esc", ""}, // -- +// {"th-tis620", "TIS-620"}, +// {"tibetan", ""}, // not covered +// {"tibetan-iso-8bit", ""}, // not covered +// {"tibetan-iso-8bit-with-esc", ""}, // -- +// {"tis-620", "TIS-620"}, +// {"tis620", "TIS-620"}, +// {"undecided", ""}, // -- +// {"unix", ""}, // -- +// {"utf-7", "UTF-7"}, // Emacs +// {"utf-7-safe", ""}, // XEmacs? +// {"utf-8-ws", "UTF-8"}, // XEmacs? +// {"vietnamese-tcvn", "TCVN"}, // *, Emacs +// {"vietnamese-viqr", "VIQR"}, // not covered +// {"vietnamese-viscii", "VISCII"}, +// {"vietnamese-vscii", ""}, // not covered +// {"viqr", "VIQR"}, // not covered +// {"viscii", "VISCII"}, +// {"vscii", ""}, // not covered +// {"windows-037", ""}, // not covered +// {"windows-10000", ""}, // not covered +// {"windows-10001", ""}, // not covered +// {"windows-10006", ""}, // not covered +// {"windows-10007", ""}, // not covered +// {"windows-10029", ""}, // not covered +// {"windows-10079", ""}, // not covered +// {"windows-10081", ""}, // not covered +// {"windows-1026", ""}, // not covered +// {"windows-1200", ""}, // not covered +// {"windows-1250", "windows-1250"}, +// {"windows-1251", "windows-1251"}, +// {"windows-1252", "windows-1252"}, +// {"windows-1253", "windows-1253"}, +// {"windows-1254", "windows-1254"}, +// {"windows-1255", "windows-1255"}, +// {"windows-1256", "windows-1256"}, +// {"windows-1257", "windows-1257"}, +// {"windows-1258", "windows-1258"}, +// {"windows-1361", "cp1361"}, // *, XEmacs +// {"windows-437", "cp437"}, // XEmacs +// {"windows-500", ""}, // not covered +// {"windows-708", ""}, // not covered +// {"windows-709", ""}, // not covered +// {"windows-710", ""}, // not covered +// {"windows-720", ""}, // not covered +// {"windows-737", "cp737"}, // *, XEmacs +// {"windows-775", "cp775"}, // XEmacs +// {"windows-850", "cp850"}, // XEmacs +// 
{"windows-852", "cp852"}, // XEmacs +// {"windows-855", "cp855"}, // XEmacs +// {"windows-857", "cp857"}, // XEmacs +// {"windows-860", "cp860"}, // XEmacs +// {"windows-861", "cp861"}, // XEmacs +// {"windows-862", "cp862"}, // XEmacs +// {"windows-863", "cp863"}, // XEmacs +// {"windows-864", "cp864"}, // XEmacs +// {"windows-865", "cp865"}, // XEmacs +// {"windows-866", "cp866"}, // XEmacs +// {"windows-869", "cp869"}, // XEmacs +// {"windows-874", "cp874"}, // XEmacs +// {"windows-875", ""}, // not covered +// {"windows-932", "cp932"}, // *, XEmacs +// {"windows-936", "cp936"}, // XEmacs +// {"windows-949", "cp949"}, // *, XEmacs +// {"windows-950", "cp950"}, // *, XEmacs +// {"x-ctext", ""}, // -- +// {"x-ctext-with-extensions", ""}, // -- + + {NULL, NULL}, +}; + +// --------------------------------------------------------- +// Convert encoding name from emacs to mime. +// --------------------------------------------------------- +char * +emacs2mime(char *emacs_enc) +{ + int emacs_enc_len = strlen(emacs_enc); + if (emacs_enc_len > 4 + && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-dos")) + emacs_enc[emacs_enc_len - 4] = 0; + if (emacs_enc_len > 4 + && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-mac")) + emacs_enc[emacs_enc_len - 4] = 0; + if (emacs_enc_len > 5 + && !strcasecmp(emacs_enc + emacs_enc_len - 5, "-unix")) + emacs_enc[emacs_enc_len - 5] = 0; + for (const conversion *table = emacs_to_mime; table->from; table++) + if (!strcasecmp(emacs_enc, table->from)) + return (char *)table->to; + return emacs_enc; +} + +// --------------------------------------------------------- +// Print out Unicode entity if value is greater than 0x7F. +// --------------------------------------------------------- +inline void +unicode_entity(int u) +{ + if (u < 0x80) + putchar(u); + else { + // Handle no-break space and soft hyphen specially--they are input + // characters only, not glyphs. See groff_char(7). 
+ if (u == 0xA0) { + putchar('\\'); + putchar('~'); + } + else if (u == 0xAD) { + putchar('\\'); + putchar('%'); + } + else + printf("\\[u%04X]", u); + } +} + +// --------------------------------------------------------- +// Conversion functions. All functions take 'data', which +// normally holds the first two lines, and a file pointer. +// --------------------------------------------------------- + +// Conversion from ISO-8859-1 (aka Latin-1) to Unicode. +void +conversion_latin1(FILE *fp, const string &data) +{ + int len = data.length(); + const unsigned char *ptr = (const unsigned char *)data.contents(); + for (int i = 0; i < len; i++) + unicode_entity(ptr[i]); + int c = -1; + while ((c = getc(fp)) != EOF) + unicode_entity(c); +} + +// A future version of groff shall support UTF-8 natively. +// In this case, the UTF-8 stuff here in this file will be +// moved to the troff program. + +struct utf8 { + FILE *fp; + unsigned char s[6]; + enum { + FIRST = 0, + SECOND, + THIRD, + FOURTH, + FIFTH, + SIXTH + } byte; + int expected_byte_count; + bool emit_invalid_utf8_warning; + bool emit_incomplete_utf8_warning; + utf8(FILE *); + ~utf8(); + void add(unsigned char); + void invalid(); + void incomplete(); +}; + +utf8::utf8(FILE *f) : fp(f), byte(FIRST), expected_byte_count(1), + emit_invalid_utf8_warning(true), + emit_incomplete_utf8_warning(true) +{ + // empty +} + +utf8::~utf8() +{ + if (byte != FIRST) + incomplete(); +} + +inline void +utf8::add(unsigned char c) +{ + s[byte] = c; + if (byte == FIRST) { + if (c < 0x80) + unicode_entity(c); + else if (c < 0xC0) + invalid(); + else if (c < 0xE0) { + expected_byte_count = 2; + byte = SECOND; + } + else if (c < 0xF0) { + expected_byte_count = 3; + byte = SECOND; + } + else if (c < 0xF8) { + expected_byte_count = 4; + byte = SECOND; + } + else if (c < 0xFC) { + expected_byte_count = 5; + byte = SECOND; + } + else if (c < 0xFE) { + expected_byte_count = 6; + byte = SECOND; + } + else + invalid(); + return; + } + if (c < 0x80 
|| c > 0xBF) { + incomplete(); + add(c); + return; + } + switch (byte) { + case FIRST: + // can't happen + break; + case SECOND: + if (expected_byte_count == 2) { + if (s[0] < 0xC2) + invalid(); + else + unicode_entity(((s[0] & 0x1F) << 6) + | (s[1] ^ 0x80)); + byte = FIRST; + } + else + byte = THIRD; + break; + case THIRD: + if (expected_byte_count == 3) { + if (!(s[0] >= 0xE1 || s[1] >= 0xA0)) + invalid(); + else + unicode_entity(((s[0] & 0x1F) << 12) + | ((s[1] ^ 0x80) << 6) + | (s[2] ^ 0x80)); + byte = FIRST; + } + else + byte = FOURTH; + break; + case FOURTH: + // We reject everything greater than 0x10FFFF. + if (expected_byte_count == 4) { + if (!((s[0] >= 0xF1 || s[1] >= 0x90) + && (s[0] < 0xF4 || (s[0] == 0xF4 && s[1] < 0x90)))) + invalid(); + else + unicode_entity(((s[0] & 0x07) << 18) + | ((s[1] ^ 0x80) << 12) + | ((s[2] ^ 0x80) << 6) + | (s[3] ^ 0x80)); + byte = FIRST; + } + else + byte = FIFTH; + break; + case FIFTH: + if (expected_byte_count == 5) { + invalid(); + byte = FIRST; + } + else + byte = SIXTH; + break; + case SIXTH: + invalid(); + byte = FIRST; + break; + } +} + +// We use fprintf(stderr) instead of libgroff's debug() because we need +// to output longs, and libgroff's errprint() doesn't support that. + +void +utf8::invalid() +{ + if (is_debugging && emit_invalid_utf8_warning) { + fprintf(stderr, " invalid UTF-8 sequence(s) in input stream:" + " replacing each such sequence with 0xFFFD\n"); + emit_invalid_utf8_warning = false; + } + unicode_entity(0xFFFD); + byte = FIRST; +} + +void +utf8::incomplete() +{ + if (is_debugging && emit_incomplete_utf8_warning) { + fprintf(stderr, " incomplete UTF-8 sequence(s) in input stream:" + " replacing each such sequence with 0xFFFD\n"); + emit_incomplete_utf8_warning = false; + } + unicode_entity(0xFFFD); + byte = FIRST; +} + +// Conversion from UTF-8 to Unicode. 
+void +conversion_utf8(FILE *fp, const string &data) +{ + utf8 u(fp); + int len = data.length(); + const unsigned char *ptr = (const unsigned char *)data.contents(); + for (int i = 0; i < len; i++) + u.add(ptr[i]); + int c = -1; + while ((c = getc(fp)) != EOF) + u.add(c); + return; +} + +// Conversion from cp1047 (EBCDIC) to UTF-8. +void +conversion_cp1047(FILE *fp, const string &data) +{ + static unsigned char cp1047[] = { + 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, // 0x00 + 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, // 0x10 + 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B, // 0x20 + 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07, + 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, // 0x30 + 0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A, + 0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5, // 0x40 + 0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C, + 0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF, // 0x50 + 0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E, + 0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5, // 0x60 + 0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F, + 0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF, // 0x70 + 0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22, + 0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, // 0x80 + 0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1, + 0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, // 0x90 + 0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4, + 0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, // 0xA0 + 0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE, + 0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC, // 0xB0 + 0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7, + 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, // 0xC0 + 0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5, + 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, // 0xD0 + 0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF, + 0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, // 0xE0 + 
0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, // 0xF0 + 0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F, + }; + int len = data.length(); + const unsigned char *ptr = (const unsigned char *)data.contents(); + for (int i = 0; i < len; i++) + unicode_entity(cp1047[ptr[i]]); + int c = -1; + while ((c = getc(fp)) != EOF) + unicode_entity(cp1047[c]); +} + +// Locale-sensible conversion. +#if HAVE_ICONV +void +conversion_iconv(FILE *fp, const string &data, char *enc) +{ + iconv_t handle = iconv_open(UNICODE, enc); + if (handle == (iconv_t)-1) { + if (errno == EINVAL) { + error("encoding system '%1' not supported by iconv()", enc); + return; + } + fatal("iconv_open failed"); + } + char inbuf[BUFSIZ]; + int outbuf[BUFSIZ]; + char *outptr = (char *)outbuf; + size_t outbytes_left = BUFSIZ * sizeof (int); + // Handle 'data'. + char *inptr = (char *)data.contents(); + size_t inbytes_left = data.length(); + char *limit; + while (inbytes_left > 0) { + size_t status = iconv(handle, + (ICONV_CONST char **)&inptr, &inbytes_left, + &outptr, &outbytes_left); + if (status == (size_t)-1) { + if (errno == EILSEQ) { + // Invalid byte sequence. XXX + inptr++; + inbytes_left--; + } + else if (errno == E2BIG) { + // Output buffer is full. + limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left; + for (int *ptr = outbuf; (char *)ptr < limit; ptr++) + unicode_entity(*ptr); + memmove(outbuf, outptr, outbytes_left); + outptr = (char *)outbuf + outbytes_left; + outbytes_left = BUFSIZ * sizeof (int) - outbytes_left; + } + else if (errno == EINVAL) { + // 'data' ends with partial input sequence. + memcpy(inbuf, inptr, inbytes_left); + break; + } + } + } + // Handle 'fp' and switch to 'inbuf'. 
+ size_t read_bytes; + char *read_start = inbuf + inbytes_left; + while ((read_bytes = fread(read_start, 1, BUFSIZ - inbytes_left, fp)) > 0) { + inptr = inbuf; + inbytes_left += read_bytes; + while (inbytes_left > 0) { + size_t status = iconv(handle, + (ICONV_CONST char **)&inptr, &inbytes_left, + &outptr, &outbytes_left); + if (status == (size_t)-1) { + if (errno == EILSEQ) { + // Invalid byte sequence. XXX + inptr++; + inbytes_left--; + } + else if (errno == E2BIG) { + // Output buffer is full. + limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left; + for (int *ptr = outbuf; (char *)ptr < limit; ptr++) + unicode_entity(*ptr); + memmove(outbuf, outptr, outbytes_left); + outptr = (char *)outbuf + outbytes_left; + outbytes_left = BUFSIZ * sizeof (int) - outbytes_left; + } + else if (errno == EINVAL) { + // 'inbuf' ends with partial input sequence. + memmove(inbuf, inptr, inbytes_left); + break; + } + } + } + read_start = inbuf + inbytes_left; + } + iconv_close(handle); + // XXX use ferror? + limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left; + for (int *ptr = outbuf; (char *)ptr < limit; ptr++) + unicode_entity(*ptr); +} +#endif /* HAVE_ICONV */ + +// --------------------------------------------------------- +// Handle Byte Order Mark. +// +// Since we have a chicken-and-egg problem it's necessary +// to handle the BOM manually if it is in the data stream. +// As documented in the Unicode book it is very unlikely +// that any normal text file (regardless of the encoding) +// starts with the bytes which represent a BOM. +// +// Return the BOM in string 'BOM'; 'data' then starts with +// the byte after the BOM. This function reads (at most) +// four bytes from the data stream. +// +// Return encoding if a BOM is found, NULL otherwise. +// --------------------------------------------------------- +const char * +get_BOM(FILE *fp, string &BOM, string &data) +{ + // The BOM is U+FEFF. We have thus the following possible + // representations. 
+ // + // UTF-8: 0xEFBBBF + // UTF-16: 0xFEFF or 0xFFFE + // UTF-32: 0x0000FEFF or 0xFFFE0000 + static struct { + int len; + const char *str; + const char *name; + } BOM_table[] = { + {4, "\x00\x00\xFE\xFF", "UTF-32"}, + {4, "\xFF\xFE\x00\x00", "UTF-32"}, + {3, "\xEF\xBB\xBF", "UTF-8"}, + {2, "\xFE\xFF", "UTF-16"}, + {2, "\xFF\xFE", "UTF-16"}, + }; + const int BOM_table_len = sizeof (BOM_table) / sizeof (BOM_table[0]); + char BOM_string[4]; + const char *retval = NULL; + int len; + for (len = 0; len < 4; len++) { + int c = getc(fp); + if (c == EOF) + break; + BOM_string[len] = char(c); + } + int i; + for (i = 0; i < BOM_table_len; i++) { + if (BOM_table[i].len <= len + && memcmp(BOM_string, BOM_table[i].str, BOM_table[i].len) == 0) + break; + } + int j = 0; + if (i < BOM_table_len) { + for (; j < BOM_table[i].len; j++) + BOM += BOM_string[j]; + retval = BOM_table[i].name; + } + for (; j < len; j++) + data += BOM_string[j]; + return retval; +} + +// --------------------------------------------------------- +// Get first two lines from input stream. +// +// Return string (allocated with 'new') without zero bytes +// or NULL in case no coding tag can occur in the data +// (which is stored unmodified in 'data'). +// --------------------------------------------------------- +char * +get_tag_lines(FILE *fp, string &data) +{ + int newline_count = 0; + int c, prev = -1; + // Handle CR, LF, and CRLF as line separators. 
+ for (int i = 0; i < data.length(); i++) { + c = data[i]; + if (c == '\n' || c == '\r') + newline_count++; + if (c == '\n' && prev == '\r') + newline_count--; + prev = c; + } + if (newline_count > 1) + return NULL; + bool emit_warning = true; + for (int lines = newline_count; lines < 2; lines++) { + while ((c = getc(fp)) != EOF) { + if (c == '\0' && is_debugging && emit_warning) { + warning("null byte(s) found in input stream:" + " search for coding tag might return false result"); + emit_warning = false; + } + data += char(c); + if (c == '\n' || c == '\r') + break; + } + // Handle CR, LF, and CRLF as line separators. + if (c == '\r') { + c = getc(fp); + if (c != EOF && c != '\n') + ungetc(c, fp); + else + data += char(c); + } + } + return data.extract(); +} + +// --------------------------------------------------------- +// Check whether C string starts with a comment. +// +// Return 1 if true, 0 otherwise. +// --------------------------------------------------------- +int +is_comment_line(char *s) +{ + if (!s || !*s) + return 0; + if (*s == '.' || *s == '\'') + { + s++; + while (*s == ' ' || *s == '\t') + s++; + if (*s && *s == '\\') + { + s++; + if (*s == '"' || *s == '#') + return 1; + } + } + else if (*s == '\\') + { + s++; + if (*s == '#') + return 1; + } + return 0; +} + +// --------------------------------------------------------- +// Get a value/variable pair from a local variables list +// in a C string which look like this: +// +// <variable1>: <value1>; <variable2>: <value2>; ... +// +// Leading and trailing blanks are ignored. There might be +// more than one blank after ':' and ';'. +// +// Return position of next value/variable pair or NULL if +// at end of data. +// --------------------------------------------------------- +char * +get_variable_value_pair(char *d1, char **variable, char **value) +{ + static char var[MAX_VAR_LEN], val[MAX_VAR_LEN]; + *variable = var; + *value = val; + while (*d1 == ' ' || *d1 == '\t') + d1++; + // Get variable. 
+ int l = 0; + while (l < MAX_VAR_LEN - 1 && *d1 && !strchr(";: \t", *d1)) + var[l++] = *(d1++); + var[l] = 0; + // Skip everything until ':', ';', or end of data. + while (*d1 && *d1 != ':' && *d1 != ';') + d1++; + val[0] = 0; + if (!*d1) + return NULL; + if (*d1 == ';') + return d1 + 1; + d1++; + while (*d1 == ' ' || *d1 == '\t') + d1++; + // Get value. + l = 0; + while (l < MAX_VAR_LEN - 1 && *d1 && !strchr("; \t", *d1)) + val[l++] = *(d1++); + val[l] = 0; + // Skip everything until ';' or end of data. + while (*d1 && *d1 != ';') + d1++; + if (*d1 == ';') + return d1 + 1; + return NULL; +} + +// --------------------------------------------------------- +// Check coding tag in the read buffer. +// +// We search for the following line: +// +// <comment> ... -*-<local variables list>-*- +// +// ('...' might be anything). +// +// <comment> can be one of the following syntax forms at the +// beginning of the line: +// +// .\" .\# '\" '\# \# +// +// There can be whitespace after the leading '.' or "'". +// +// The local variables list must occur within the first +// comment block at the very beginning of the data stream. +// +// Within the <local variables list>, we search for +// +// coding: <value> +// +// which specifies the coding system used for the data +// stream. +// +// Return <value> if found, NULL otherwise. +// +// Note that null bytes in the data are skipped before applying +// the algorithm. This should work even with files encoded as +// UTF-16 or UTF-32 (or its siblings) in most cases. 
+// --------------------------------------------------------- +char * +check_coding_tag(FILE *fp, string &data) +{ + char *inbuf = get_tag_lines(fp, data); + char *lineend; + for (char *p = inbuf; is_comment_line(p); p = lineend + 1) { + if ((lineend = strchr(p, '\n')) == NULL) + break; + *lineend = 0; // switch temporarily to '\0' + char *d1 = strstr(p, "-*-"); + char *d2 = 0; + if (d1) + d2 = strstr(d1 + 3, "-*-"); + *lineend = '\n'; // restore newline + if (!d1 || !d2) + continue; + *d2 = 0; // switch temporarily to '\0' + d1 += 3; + while (d1) { + char *variable, *value; + d1 = get_variable_value_pair(d1, &variable, &value); + if (!strcasecmp(variable, "coding")) { + *d2 = '-'; // restore '-' + free(inbuf); + return value; + } + } + *d2 = '-'; // restore '-' + } + free(inbuf); + return NULL; +} + +char * +detect_file_encoding(FILE *fp) +{ +#ifdef HAVE_UCHARDET + uchardet_t ud = NULL; + struct stat stat_buf; + size_t len, read_bytes; + char *data = NULL; + int res, current_position; + const char *charset; + char *ret = NULL; + + current_position = ftell(fp); + /* Due to BOM and tag detection, we are not at the beginning of the + file. 
*/ + rewind(fp); + if (fstat(fileno(fp), &stat_buf) != 0) { + error("fstat: %1", strerror(errno)); + goto end; + } + len = stat_buf.st_size; + if (is_debugging) + fprintf(stderr, " len: %lu\n", (unsigned long)len); + if (len == 0) + goto end; + data = (char *)calloc(len, 1); + read_bytes = fread(data, 1, len, fp); + if (read_bytes == 0) { + error("fread: %1", strerror(errno)); + goto end; + } + /* We rewind back to the original position */ + if (fseek(fp, current_position, SEEK_SET) != 0) { + fatal("fseek: %1", strerror(errno)); + goto end; + } + ud = uchardet_new(); + res = uchardet_handle_data(ud, data, len); + if (res != 0) { + debug(" uchardet_handle_data: error %1\n", res); + goto end; + } + if (is_debugging) + fprintf(stderr, " uchardet read: %lu bytes\n", + (unsigned long)read_bytes); + uchardet_data_end(ud); + charset = uchardet_get_charset(ud); + if (is_debugging) { + if (charset) + fprintf(stderr, " charset: %s\n", charset); + else + fprintf(stderr, " charset is NULL\n"); + } + /* uchardet 0.0.1 could return an empty string instead of NULL */ + if (charset && *charset) { + ret = (char *)malloc(strlen(charset) + 1); + strcpy(ret, charset); + } + +end: + if (ud) + uchardet_delete(ud); + if (data) + free(data); + + return ret; +#else /* not HAVE_UCHARDET */ + return NULL; +#endif /* not HAVE_UCHARDET */ +} + +// --------------------------------------------------------- +// Handle an input file. If `filename` is "-", read the +// standard input stream. +// +// Return 1 on success, 0 otherwise. +// --------------------------------------------------------- +int +do_file(const char *filename) +{ + FILE *fp; + string BOM, data; + bool is_seekable = false; + string reported_filename; + + // TODO: Consider moving some of this into a `quoted_file_name` + // function in libgroff. 
+ if (strcmp(filename, "-") == 0) { + fp = stdin; + reported_filename = string("<standard input>"); + } + else { + fp = fopen(filename, FOPEN_RB); + reported_filename = "'" + string(filename) + "'"; + } + if (!fp) { + error("can't open %1: %2", reported_filename.contents(), + strerror(errno)); + return 0; + } + if (is_debugging) + fprintf(stderr, "processing %s\n", reported_filename.contents()); + if (fseek(fp, 0L, SEEK_SET) == 0) + is_seekable = true; + else { + SET_BINARY(fileno(fp)); + if (is_debugging) + fprintf(stderr, " stream is not seekable: %s\n", + strerror(errno)); + } + const char *BOM_encoding = get_BOM(fp, BOM, data); + // Determine the encoding. + char *encoding; + int must_free_encoding = 0; + if (user_encoding[0]) { + if (is_debugging) { + fprintf(stderr, " user-specified encoding '%s', " + "no search for coding tag\n", + user_encoding); + if (BOM_encoding && strcmp(BOM_encoding, user_encoding)) + fprintf(stderr, " but BOM in data stream implies encoding '%s'!\n", + BOM_encoding); + } + encoding = (char *)user_encoding; + } + else if (BOM_encoding) { + if (is_debugging) + fprintf(stderr, " found BOM, no search for coding tag\n"); + encoding = (char *)BOM_encoding; + } + else { + // 'check_coding_tag' returns a pointer to a static array (or NULL). 
+ char *file_encoding = check_coding_tag(fp, data); + if (!file_encoding) { + if (is_debugging) + fprintf(stderr, " no coding tag\n"); + if (is_seekable) + file_encoding = detect_file_encoding(fp); + if (!file_encoding) { + if (is_debugging) + fprintf(stderr, " could not detect encoding with uchardet\n"); + file_encoding = fallback_encoding; + } + else + must_free_encoding = 1; + } + else + if (is_debugging) + fprintf(stderr, " coding tag: '%s'\n", file_encoding); + encoding = file_encoding; + } + strncpy(encoding_string, encoding, MAX_VAR_LEN - 1); + encoding_string[MAX_VAR_LEN - 1] = 0; + if (must_free_encoding) + free(encoding); + encoding = encoding_string; + // Translate from MIME & Emacs encoding names to locale encoding names. + encoding = emacs2mime(encoding_string); + if (encoding[0] == '\0') { + error("encoding '%1' not supported, not a portable encoding", + encoding_string); + return 0; + } + if (is_debugging) + fprintf(stderr, " encoding used: '%s'\n", encoding); + if (!raw_flag) { + string fn(filename); + fn += '\0'; + normalize_for_lf(fn); + printf(".lf 1 %s\n", fn.contents()); + } + int success = 1; + // Call converter (converters write to stdout). + if (!strcasecmp(encoding, "ISO-8859-1")) + conversion_latin1(fp, BOM + data); + else if (!strcasecmp(encoding, "UTF-8")) + conversion_utf8(fp, data); + else if (!strcasecmp(encoding, "cp1047")) + conversion_cp1047(fp, BOM + data); + else { +#if HAVE_ICONV + conversion_iconv(fp, BOM + data, encoding); +#else + error("encoding system '%1' not supported", encoding); + success = 0; +#endif /* HAVE_ICONV */ + } + if (fp != stdin) + fclose(fp); + return success; +} + +// --------------------------------------------------------- +// Print usage. 
+// --------------------------------------------------------- +void +usage(FILE *stream) +{ + fprintf(stream, +"usage: %s [-dr] [-D fallback-encoding] [-e encoding] [file ...]\n" +"usage: %s {-v | --version}\n" +"usage: %s {-h | --help}\n", + program_name, program_name, program_name); + if (stdout == stream) { + fprintf(stream, +"\n" +"Read each file, convert its encoded characters to a form GNU" +" troff(1)\n" +"can interpret, and send the result to the standard output stream.\n" +"The default fallback encoding is '%s'. See the preconv(1) manual" +" page.\n", + fallback_encoding); + exit(EXIT_SUCCESS); + } +} + +// --------------------------------------------------------- +// Main routine. +// --------------------------------------------------------- +int +main(int argc, char **argv) +{ + program_name = argv[0]; + // Determine the fallback encoding. This must be done before + // getopt() is called since the usage message shows the fallback + // encoding. + setlocale(LC_ALL, ""); + char *locale = getlocale(LC_CTYPE); + if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX")) + strcpy(fallback_encoding, "latin1"); + else { + strncpy(fallback_encoding, locale_charset(), MAX_VAR_LEN - 1); + fallback_encoding[MAX_VAR_LEN - 1] = 0; + } + + program_name = argv[0]; + int opt; + static const struct option long_options[] = { + { "help", no_argument, 0, 'h' }, + { "version", no_argument, 0, 'v' }, + { NULL, 0, 0, 0 } + }; + // Parse the command-line options. 
+ while ((opt = getopt_long(argc, argv, + "dD:e:hrv", long_options, NULL)) != EOF) + switch (opt) { + case 'v': + printf("GNU preconv (groff) version %s %s iconv support and %s uchardet support\n", + Version_string, +#ifdef HAVE_ICONV + "with", +#else + "without", +#endif /* HAVE_ICONV */ +#ifdef HAVE_UCHARDET + "with" +#else + "without" +#endif /* HAVE_UCHARDET */ + ); + exit(0); + break; + case 'd': + is_debugging = true; + break; + case 'e': + if (optarg) { + strncpy(user_encoding, optarg, MAX_VAR_LEN - 1); + user_encoding[MAX_VAR_LEN - 1] = 0; + } + else + user_encoding[0] = 0; + break; + case 'D': + if (optarg) { + strncpy(fallback_encoding, optarg, MAX_VAR_LEN - 1); + fallback_encoding[MAX_VAR_LEN - 1] = 0; + } + break; + case 'r': + raw_flag = 1; + break; + case 'h': + usage(stdout); + break; + case '?': + usage(stderr); + exit(1); + break; + default: + assert(0); + } + int nbad = 0; + if (is_debugging) + fprintf(stderr, "fallback encoding: '%s'\n", fallback_encoding); + if (optind >= argc) + nbad += !do_file("-"); + else + for (int i = optind; i < argc; i++) + nbad += !do_file(argv[i]); + if (ferror(stdout) || fflush(stdout) < 0) + fatal("output error"); + return nbad != 0; +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: diff --git a/src/preproc/preconv/tests/do-not-seek-the-unseekable.sh b/src/preproc/preconv/tests/do-not-seek-the-unseekable.sh new file mode 100755 index 0000000..2b1142d --- /dev/null +++ b/src/preproc/preconv/tests/do-not-seek-the-unseekable.sh @@ -0,0 +1,59 @@ +#!/bin/sh +# +# Copyright (C) 2022 Free Software Foundation, Inc. +# +# This file is part of groff. +# +# groff is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your +# option) any later version. 
#
# groff is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

set -e

preconv="${abs_top_builddir:-.}/preconv"

fail=

# Record a failure without aborting, so that the reason is reported
# before the script exits.
wail () {
    echo FAILED >&2
    fail=YES
}

# Scrape debugging output to see if we're skipping unseekable streams.
# This is fragile, but we don't want to lock the language of diagnostic
# messages (especially debugging ones).  If this test fails, check the
# text of the command's debugging output for a mismatch before
# investigating deeper problems.

echo "testing seekability of file operand '-'" >&2
output=$(printf '' | "$preconv" -d - 2>&1)
printf '%s\n' "$output" | grep -q "stream is not seekable" || wail

# /dev/stdin might not exist in a chroot.  Or, if it's not (a symbolic
# link to) a character special device, the next test will not be valid,
# as when using GNU Make's `-j` option.
#
# Similarly, we must have a controlling terminal.
test -z "$fail"
echo "skipping if /dev/stdin is not a character device" >&2
test -c /dev/stdin || exit 77 # skip
echo "skipping if there is no controlling terminal" >&2
# Ask whether standard input is a terminal instead of comparing the
# output of tty(1) with an English message, which a localized tty may
# translate.
test -t 0 || exit 77 # skip

echo "testing seekability of standard input stream" >&2
output=$(printf '' | "$preconv" -d /dev/stdin 2>&1)
printf '%s\n' "$output" | grep -q "stream is not seekable" || wail

test -z "$fail"

# vim:set ai et sw=4 ts=4 tw=72:
diff --git a/src/preproc/preconv/tests/smoke-test.sh b/src/preproc/preconv/tests/smoke-test.sh
new file mode 100755
index 0000000..4131416
--- /dev/null
+++ b/src/preproc/preconv/tests/smoke-test.sh
@@ -0,0 +1,88 @@
#!/bin/sh
#
# Copyright (C) 2020 Free Software Foundation, Inc.
#
# This file is part of groff.
#
# groff is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# groff is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

# Ensure a predictable character encoding.
export LC_ALL=C

set -e

preconv="${abs_top_builddir:-.}/preconv"

# Each test below feeds a few bytes to preconv in debugging mode,
# discards the converted output, and greps the diagnostics on the
# standard error stream.

echo "testing -e flag override of BOM detection" >&2
printf '\376\377\0\100\0\n' \
    | "$preconv" -d -e euc-kr 2>&1 > /dev/null \
    | grep -q "no search for coding tag"

echo "testing detection of UTF-32BE BOM" >&2
printf '\0\0\376\377\0\0\0\100\0\0\0\n' \
    | "$preconv" -d 2>&1 > /dev/null \
    | grep -q "found BOM"

echo "testing detection of UTF-32LE BOM" >&2
printf '\377\376\0\0\100\0\0\0\n\0\0\0' \
    | "$preconv" -d 2>&1 > /dev/null \
    | grep -q "found BOM"

echo "testing detection of UTF-16BE BOM" >&2
printf '\376\377\0\100\0\n' \
    | "$preconv" -d 2>&1 > /dev/null \
    | grep -q "found BOM"

echo "testing detection of UTF-16LE BOM" >&2
printf '\377\376\100\0\n\0' \
    | "$preconv" -d 2>&1 > /dev/null \
    | grep -q "found BOM"

echo "testing detection of UTF-8 BOM" >&2
printf '\357\273\277@\n' \
    | "$preconv" -d 2>&1 > /dev/null \
    | grep -q "found BOM"

# The tag lines are read sequentially, so a coding tag is found on
# piped input as well; only uchardet detection needs a seekable stream.
# The comment line must end in a real newline ('\n' in the format, not
# '\\n') for the tag search to consider it.
echo "testing detection of Emacs coding tag in piped input" >&2
printf '.\\" -*- coding: euc-kr; -*-\n' \
    | "$preconv" -d 2>&1 >/dev/null \
    | grep -q "coding tag: 'euc-kr'"

# We need uchardet to work to get past this point.
echo "testing uchardet detection of encoding" >&2
"$preconv" -v | grep -q 'with uchardet support' || exit 77

# Run preconv in debugging mode with the given arguments, discard the
# converted output, and write only its diagnostics to standard output.
preconv_diagnostics () {
    "$preconv" -d "$@" 2>&1 >/dev/null
}

# Instead of using temporary files, which in all fastidiousness means
# cleaning them up even if we're interrupted, which in turn means
# setting up signal handlers, we use files in the build tree.

doc=contrib/mm/groff_mmse.7
echo "testing uchardet detection on Latin-1 document $doc" >&2
preconv_diagnostics -D us-ascii "$doc" | grep -q 'charset: ISO-8859-1'

# uchardet can't seek on a pipe either.
echo "testing uchardet detection on pipe (expect fallback to -D)" >&2
printf 'Eat at the caf\351.\n' \
    | preconv_diagnostics -D euc-kr \
    | grep -q "encoding used: 'EUC-KR'"

# Fall back to the locale.  preconv assumes Latin-1 for C instead of
# US-ASCII.
echo "testing fallback to locale setting in environment" >&2
printf 'Eat at the caf\351.\n' \
    | preconv_diagnostics \
    | grep -q "encoding used: 'ISO-8859-1'"