/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
     Written by Werner Lemberg (wl@gnu.org)

This file is part of groff.

groff is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or
(at your option) any later version.

groff is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>. */

#include "lib.h"

// NOTE(review): the system header names below were reconstructed from
// the facilities this file uses (assert(), errno, malloc()/free(),
// fstat(), uchardet, setlocale(), iconv()) -- confirm against the
// build.
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <sys/stat.h>
#ifdef HAVE_UCHARDET
#include <uchardet/uchardet.h>
#endif

#include "errarg.h"
#include "error.h"
#include "localcharset.h"
#include "nonposix.h"
#include "stringclass.h"
#include "lf.h"

#include <locale.h>

#if HAVE_ICONV
# include <iconv.h>
// Target encoding handed to iconv_open(): UTF-32 in host byte order, so
// that each converted character can be read back as one 'int'.
# ifdef WORDS_BIGENDIAN
#  define UNICODE "UTF-32BE"
# else
#  define UNICODE "UTF-32LE"
# endif
#endif

// Size (including the terminating null byte) of every buffer that holds
// an encoding name or a variable/value from a coding tag.
#define MAX_VAR_LEN 100

extern "C" const char *Version_string;

char fallback_encoding[MAX_VAR_LEN];	// used if nothing else is found
char user_encoding[MAX_VAR_LEN];	// set with option '-e'
char encoding_string[MAX_VAR_LEN];	// encoding of the current file
bool is_debugging = false;		// set with option '-d'
int raw_flag = 0;			// set with option '-r'

// One entry of a name translation table: 'from' is the name to look
// up, 'to' the name to use instead.
struct conversion {
  const char *from;
  const char *to;
};

// The official list of MIME tags can be found at
//
//   http://www.iana.org/assignments/character-sets
//
// For encodings which don't have a MIME tag we use GNU iconv's encoding
// names (which also work with the portable GNU libiconv package).  They
// are marked with '*'.
//
// Encodings specific to XEmacs and Emacs are marked as such; no mark means
// that they are used by both Emacs and XEmacs.
//
// Encodings marked with '--' are special to Emacs, XEmacs, or other
// applications and shouldn't be used for data exchange.
//
// 'Not covered' means that the encoding can be handled neither by GNU iconv
// nor by libiconv, or just one of them has support for it.
// // A special case is VIQR encoding: Despite of having a MIME tag it is // missing in both libiconv 1.10 and iconv (coming with GNU libc 2.3.6). // // Finally, we add all aliases of GNU iconv for 'ascii', 'latin1', and // 'utf8' to catch those encoding names before iconv is called. // // Note that most entries are commented out -- only a small, (rather) // reliable and stable subset of encodings is recognized (for coding tags) // which are still in greater use today (January 2006). Most notably, all // Windows-specific encodings are not selected because they lack stability: // Microsoft has changed the mappings instead of creating new versions. // // Please contact the groff list if you find the selection inadequate. static const conversion emacs_to_mime[] = { {"ascii", "US-ASCII"}, // Emacs {"big5", "Big5"}, {"chinese-big5", "Big5"}, // Emacs {"chinese-euc", "GB2312"}, // XEmacs {"chinese-iso-8bit", "GB2312"}, // Emacs {"cn-big5", "Big5"}, {"cn-gb", "GB2312"}, // Emacs {"cn-gb-2312", "GB2312"}, {"cp878", "KOI8-R"}, // Emacs {"cp1047", "CP1047"}, // EBCDIC {"csascii", "US-ASCII"}, // alias {"csisolatin1", "ISO-8859-1"}, // alias {"cyrillic-iso-8bit", "ISO-8859-5"}, // Emacs {"cyrillic-koi8", "KOI8-R"}, // not KOI8!, Emacs {"euc-china", "GB2312"}, // Emacs {"euc-cn", "GB2312"}, // Emacs {"euc-japan", "EUC-JP"}, {"euc-japan-1990", "EUC-JP"}, // Emacs {"euc-jp", "EUC-JP"}, {"euc-korea", "EUC-KR"}, {"euc-kr", "EUC-KR"}, {"gb2312", "GB2312"}, {"greek-iso-8bit", "ISO-8859-7"}, {"iso-10646/utf8", "UTF-8"}, // alias {"iso-10646/utf-8", "UTF-8"}, // alias {"iso-8859-1", "ISO-8859-1"}, {"iso-8859-13", "ISO-8859-13"}, // Emacs {"iso-8859-15", "ISO-8859-15"}, {"iso-8859-2", "ISO-8859-2"}, {"iso-8859-5", "ISO-8859-5"}, {"iso-8859-7", "ISO-8859-7"}, {"iso-8859-9", "ISO-8859-9"}, {"iso-latin-1", "ISO-8859-1"}, {"iso-latin-2", "ISO-8859-2"}, // Emacs {"iso-latin-5", "ISO-8859-9"}, // Emacs {"iso-latin-7", "ISO-8859-13"}, // Emacs {"iso-latin-9", "ISO-8859-15"}, // Emacs 
{"japanese-iso-8bit", "EUC-JP"}, // Emacs {"japanese-euc", "EUC-JP"}, // XEmacs {"jis8", "EUC-JP"}, // XEmacs {"koi8", "KOI8-R"}, // not KOI8!, Emacs {"koi8-r", "KOI8-R"}, {"korean-euc", "EUC-KR"}, // XEmacs {"korean-iso-8bit", "EUC-KR"}, // Emacs {"latin1", "ISO-8859-1"}, // alias {"latin-0", "ISO-8859-15"}, // Emacs {"latin-1", "ISO-8859-1"}, // Emacs {"latin-2", "ISO-8859-2"}, // Emacs {"latin-5", "ISO-8859-9"}, // Emacs {"latin-7", "ISO-8859-13"}, // Emacs {"latin-9", "ISO-8859-15"}, // Emacs {"mule-utf-16", "UTF-16"}, // Emacs {"mule-utf-16be", "UTF-16BE"}, // Emacs {"mule-utf-16-be", "UTF-16BE"}, // Emacs {"mule-utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE {"mule-utf-16le", "UTF-16LE"}, // Emacs {"mule-utf-16-le", "UTF-16LE"}, // Emacs {"mule-utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE {"mule-utf-8", "UTF-8"}, // Emacs {"us-ascii", "US-ASCII"}, // Emacs {"utf8", "UTF-8"}, // alias {"utf-16", "UTF-16"}, // Emacs {"utf-16be", "UTF-16BE"}, // Emacs {"utf-16-be", "UTF-16BE"}, // Emacs {"utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE {"utf-16-be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE {"utf-16le", "UTF-16LE"}, // Emacs {"utf-16-le", "UTF-16LE"}, // Emacs {"utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE {"utf-16-le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE {"utf-8", "UTF-8"}, // Emacs // {"alternativnyj", ""}, // ? 
// {"arabic-iso-8bit", "ISO-8859-6"}, // Emacs // {"binary", ""}, // -- // {"chinese-hz", "HZ-GB-2312"}, // Emacs // {"chinese-iso-7bit", "ISO-2022-CN"}, // Emacs // {"chinese-iso-8bit-with-esc", ""}, // -- // {"compound-text", ""}, // -- // {"compound-text-with-extension", ""}, // -- // {"cp1125", "cp1125"}, // * // {"cp1250", "windows-1250"},// Emacs // {"cp1251", "windows-1251"},// Emacs // {"cp1252", "windows-1252"},// Emacs // {"cp1253", "windows-1253"},// Emacs // {"cp1254", "windows-1254"},// Emacs // {"cp1255", "windows-1255"},// Emacs // {"cp1256", "windows-1256"},// Emacs // {"cp1257", "windows-1257"},// Emacs // {"cp1258", "windows-1258"},// Emacs // {"cp437", "cp437"}, // Emacs // {"cp720", ""}, // not covered // {"cp737", "cp737"}, // *, Emacs // {"cp775", "cp775"}, // Emacs // {"cp850", "cp850"}, // Emacs // {"cp851", "cp851"}, // Emacs // {"cp852", "cp852"}, // Emacs // {"cp855", "cp855"}, // Emacs // {"cp857", "cp857"}, // Emacs // {"cp860", "cp860"}, // Emacs // {"cp861", "cp861"}, // Emacs // {"cp862", "cp862"}, // Emacs // {"cp863", "cp863"}, // Emacs // {"cp864", "cp864"}, // Emacs // {"cp865", "cp865"}, // Emacs // {"cp866", "cp866"}, // Emacs // {"cp866u", "cp1125"}, // *, Emacs // {"cp869", "cp869"}, // Emacs // {"cp874", "cp874"}, // *, Emacs // {"cp932", "cp932"}, // *, Emacs // {"cp936", "cp936"}, // Emacs // {"cp949", "cp949"}, // *, Emacs // {"cp950", "cp950"}, // *, Emacs // {"ctext", ""}, // -- // {"ctext-no-compositions", ""}, // -- // {"ctext-with-extensions", ""}, // -- // {"cyrillic-alternativnyj", ""}, // ?, Emacs // {"cyrillic-iso-8bit-with-esc", ""}, // -- // {"cyrillic-koi8-t", "KOI8-T"}, // *, Emacs // {"devanagari", ""}, // not covered // {"dos", ""}, // -- // {"emacs-mule", ""}, // -- // {"euc-jisx0213", "EUC-JISX0213"},// *, XEmacs? // {"euc-jisx0213-with-esc", ""}, // XEmacs? 
// {"euc-taiwan", "EUC-TW"}, // *, Emacs // {"euc-tw", "EUC-TW"}, // *, Emacs // {"georgian-ps", "GEORGIAN-PS"}, // *, Emacs // {"greek-iso-8bit-with-esc", ""}, // -- // {"hebrew-iso-8bit", "ISO-8859-8"}, // Emacs // {"hebrew-iso-8bit-with-esc", ""}, // -- // {"hz", "HZ-GB-2312"}, // {"hz-gb-2312", "HZ-GB-2312"}, // {"in-is13194", ""}, // not covered // {"in-is13194-devanagari", ""}, // not covered // {"in-is13194-with-esc", ""}, // -- // {"iso-2022-7", ""}, // XEmacs? // {"iso-2022-7bit", ""}, // -- // {"iso-2022-7bit-lock", ""}, // -- // {"iso-2022-7bit-lock-ss2", ""}, // -- // {"iso-2022-7bit-ss2", ""}, // -- // {"iso-2022-8", ""}, // XEmacs? // {"iso-2022-8bit", ""}, // XEmacs? // {"iso-2022-8bit-lock", ""}, // XEmacs? // {"iso-2022-8bit-lock-ss2", ""}, // XEmacs? // {"iso-2022-8bit-ss2", ""}, // -- // {"iso-2022-cjk", ""}, // -- // {"iso-2022-cn", "ISO-2022-CN"}, // Emacs // {"iso-2022-cn-ext", "ISO-2022-CN-EXT"},// Emacs // {"iso-2022-int-1", ""}, // -- // {"iso-2022-jp", "ISO-2022-JP"}, // {"iso-2022-jp-1978-irv", "ISO-2022-JP"}, // {"iso-2022-jp-2", "ISO-2022-JP-2"}, // {"iso-2022-jp-3", "ISO-2022-JP-3"},// *, XEmacs? // {"iso-2022-jp-3-compatible", ""}, // XEmacs? // {"iso-2022-jp-3-strict", "ISO-2022-JP-3"},// *, XEmacs? // {"iso-2022-kr", "ISO-2022-KR"}, // {"iso-2022-lock", ""}, // XEmacs? 
// {"iso-8859-10", "ISO-8859-10"}, // Emacs // {"iso-8859-11", "ISO-8859-11"}, // *, Emacs // {"iso-8859-14", "ISO-8859-14"}, // Emacs // {"iso-8859-16", "ISO-8859-16"}, // {"iso-8859-3", "ISO-8859-3"}, // {"iso-8859-4", "ISO-8859-4"}, // {"iso-8859-6", "ISO-8859-6"}, // {"iso-8859-8", "ISO-8859-8"}, // {"iso-8859-8-e", "ISO-8859-8"}, // {"iso-8859-8-i", "ISO-8859-8"}, // Emacs // {"iso-latin-10", "ISO-8859-16"}, // Emacs // {"iso-latin-1-with-esc", ""}, // -- // {"iso-latin-2-with-esc", ""}, // -- // {"iso-latin-3", "ISO-8859-3"}, // Emacs // {"iso-latin-3-with-esc", ""}, // -- // {"iso-latin-4", "ISO-8859-4"}, // Emacs // {"iso-latin-4-with-esc", ""}, // -- // {"iso-latin-5-with-esc", ""}, // -- // {"iso-latin-6", "ISO-8859-10"}, // Emacs // {"iso-latin-8", "ISO-8859-14"}, // Emacs // {"iso-safe", ""}, // -- // {"japanese-iso-7bit-1978-irv", "ISO-2022-JP"}, // Emacs // {"japanese-iso-8bit-with-esc", ""}, // -- // {"japanese-shift-jis", "Shift_JIS"}, // Emacs // {"japanese-shift-jisx0213", ""}, // XEmacs? // {"jis7", "ISO-2022-JP"}, // Xemacs // {"junet", "ISO-2022-JP"}, // {"koi8-t", "KOI8-T"}, // *, Emacs // {"koi8-u", "KOI8-U"}, // Emacs // {"korean-iso-7bit-lock", "ISO-2022-KR"}, // {"korean-iso-8bit-with-esc", ""}, // -- // {"lao", ""}, // not covered // {"lao-with-esc", ""}, // -- // {"latin-10", "ISO-8859-16"}, // Emacs // {"latin-3", "ISO-8859-3"}, // Emacs // {"latin-4", "ISO-8859-4"}, // Emacs // {"latin-6", "ISO-8859-10"}, // Emacs // {"latin-8", "ISO-8859-14"}, // Emacs // {"mac", ""}, // -- // {"mac-roman", "MACINTOSH"}, // Emacs // {"mik", ""}, // not covered // {"next", "NEXTSTEP"}, // *, Emacs // {"no-conversion", ""}, // -- // {"old-jis", "ISO-2022-JP"}, // {"pt154", "PT154"}, // Emacs // {"raw-text", ""}, // -- // {"ruscii", "cp1125"}, // *, Emacs // {"shift-jis", "Shift_JIS"}, // XEmacs // {"shift_jis", "Shift_JIS"}, // {"shift_jisx0213", "Shift_JISX0213"},// *, XEmacs? 
// {"sjis", "Shift_JIS"}, // Emacs // {"tcvn", "TCVN"}, // *, Emacs // {"tcvn-5712", "TCVN"}, // *, Emacs // {"thai-tis620", "TIS-620"}, // {"thai-tis620-with-esc", ""}, // -- // {"th-tis620", "TIS-620"}, // {"tibetan", ""}, // not covered // {"tibetan-iso-8bit", ""}, // not covered // {"tibetan-iso-8bit-with-esc", ""}, // -- // {"tis-620", "TIS-620"}, // {"tis620", "TIS-620"}, // {"undecided", ""}, // -- // {"unix", ""}, // -- // {"utf-7", "UTF-7"}, // Emacs // {"utf-7-safe", ""}, // XEmacs? // {"utf-8-ws", "UTF-8"}, // XEmacs? // {"vietnamese-tcvn", "TCVN"}, // *, Emacs // {"vietnamese-viqr", "VIQR"}, // not covered // {"vietnamese-viscii", "VISCII"}, // {"vietnamese-vscii", ""}, // not covered // {"viqr", "VIQR"}, // not covered // {"viscii", "VISCII"}, // {"vscii", ""}, // not covered // {"windows-037", ""}, // not covered // {"windows-10000", ""}, // not covered // {"windows-10001", ""}, // not covered // {"windows-10006", ""}, // not covered // {"windows-10007", ""}, // not covered // {"windows-10029", ""}, // not covered // {"windows-10079", ""}, // not covered // {"windows-10081", ""}, // not covered // {"windows-1026", ""}, // not covered // {"windows-1200", ""}, // not covered // {"windows-1250", "windows-1250"}, // {"windows-1251", "windows-1251"}, // {"windows-1252", "windows-1252"}, // {"windows-1253", "windows-1253"}, // {"windows-1254", "windows-1254"}, // {"windows-1255", "windows-1255"}, // {"windows-1256", "windows-1256"}, // {"windows-1257", "windows-1257"}, // {"windows-1258", "windows-1258"}, // {"windows-1361", "cp1361"}, // *, XEmacs // {"windows-437", "cp437"}, // XEmacs // {"windows-500", ""}, // not covered // {"windows-708", ""}, // not covered // {"windows-709", ""}, // not covered // {"windows-710", ""}, // not covered // {"windows-720", ""}, // not covered // {"windows-737", "cp737"}, // *, XEmacs // {"windows-775", "cp775"}, // XEmacs // {"windows-850", "cp850"}, // XEmacs // {"windows-852", "cp852"}, // XEmacs // {"windows-855", 
"cp855"}, // XEmacs // {"windows-857", "cp857"}, // XEmacs // {"windows-860", "cp860"}, // XEmacs // {"windows-861", "cp861"}, // XEmacs // {"windows-862", "cp862"}, // XEmacs // {"windows-863", "cp863"}, // XEmacs // {"windows-864", "cp864"}, // XEmacs // {"windows-865", "cp865"}, // XEmacs // {"windows-866", "cp866"}, // XEmacs // {"windows-869", "cp869"}, // XEmacs // {"windows-874", "cp874"}, // XEmacs // {"windows-875", ""}, // not covered // {"windows-932", "cp932"}, // *, XEmacs // {"windows-936", "cp936"}, // XEmacs // {"windows-949", "cp949"}, // *, XEmacs // {"windows-950", "cp950"}, // *, XEmacs // {"x-ctext", ""}, // -- // {"x-ctext-with-extensions", ""}, // -- {NULL, NULL}, }; // --------------------------------------------------------- // Convert encoding name from emacs to mime. // --------------------------------------------------------- char * emacs2mime(char *emacs_enc) { int emacs_enc_len = strlen(emacs_enc); if (emacs_enc_len > 4 && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-dos")) emacs_enc[emacs_enc_len - 4] = 0; if (emacs_enc_len > 4 && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-mac")) emacs_enc[emacs_enc_len - 4] = 0; if (emacs_enc_len > 5 && !strcasecmp(emacs_enc + emacs_enc_len - 5, "-unix")) emacs_enc[emacs_enc_len - 5] = 0; for (const conversion *table = emacs_to_mime; table->from; table++) if (!strcasecmp(emacs_enc, table->from)) return (char *)table->to; return emacs_enc; } // --------------------------------------------------------- // Print out Unicode entity if value is greater than 0x7F. // --------------------------------------------------------- inline void unicode_entity(int u) { if (u < 0x80) putchar(u); else { // Handle no-break space and soft hyphen specially--they are input // characters only, not glyphs. See groff_char(7). 
if (u == 0xA0) { putchar('\\'); putchar('~'); } else if (u == 0xAD) { putchar('\\'); putchar('%'); } else printf("\\[u%04X]", u); } } // --------------------------------------------------------- // Conversion functions. All functions take 'data', which // normally holds the first two lines, and a file pointer. // --------------------------------------------------------- // Conversion from ISO-8859-1 (aka Latin-1) to Unicode. void conversion_latin1(FILE *fp, const string &data) { int len = data.length(); const unsigned char *ptr = (const unsigned char *)data.contents(); for (int i = 0; i < len; i++) unicode_entity(ptr[i]); int c = -1; while ((c = getc(fp)) != EOF) unicode_entity(c); } // A future version of groff shall support UTF-8 natively. // In this case, the UTF-8 stuff here in this file will be // moved to the troff program. struct utf8 { FILE *fp; unsigned char s[6]; enum { FIRST = 0, SECOND, THIRD, FOURTH, FIFTH, SIXTH } byte; int expected_byte_count; bool emit_invalid_utf8_warning; bool emit_incomplete_utf8_warning; utf8(FILE *); ~utf8(); void add(unsigned char); void invalid(); void incomplete(); }; utf8::utf8(FILE *f) : fp(f), byte(FIRST), expected_byte_count(1), emit_invalid_utf8_warning(true), emit_incomplete_utf8_warning(true) { // empty } utf8::~utf8() { if (byte != FIRST) incomplete(); } inline void utf8::add(unsigned char c) { s[byte] = c; if (byte == FIRST) { if (c < 0x80) unicode_entity(c); else if (c < 0xC0) invalid(); else if (c < 0xE0) { expected_byte_count = 2; byte = SECOND; } else if (c < 0xF0) { expected_byte_count = 3; byte = SECOND; } else if (c < 0xF8) { expected_byte_count = 4; byte = SECOND; } else if (c < 0xFC) { expected_byte_count = 5; byte = SECOND; } else if (c < 0xFE) { expected_byte_count = 6; byte = SECOND; } else invalid(); return; } if (c < 0x80 || c > 0xBF) { incomplete(); add(c); return; } switch (byte) { case FIRST: // can't happen break; case SECOND: if (expected_byte_count == 2) { if (s[0] < 0xC2) invalid(); else 
unicode_entity(((s[0] & 0x1F) << 6) | (s[1] ^ 0x80)); byte = FIRST; } else byte = THIRD; break; case THIRD: if (expected_byte_count == 3) { if (!(s[0] >= 0xE1 || s[1] >= 0xA0)) invalid(); else unicode_entity(((s[0] & 0x1F) << 12) | ((s[1] ^ 0x80) << 6) | (s[2] ^ 0x80)); byte = FIRST; } else byte = FOURTH; break; case FOURTH: // We reject everything greater than 0x10FFFF. if (expected_byte_count == 4) { if (!((s[0] >= 0xF1 || s[1] >= 0x90) && (s[0] < 0xF4 || (s[0] == 0xF4 && s[1] < 0x90)))) invalid(); else unicode_entity(((s[0] & 0x07) << 18) | ((s[1] ^ 0x80) << 12) | ((s[2] ^ 0x80) << 6) | (s[3] ^ 0x80)); byte = FIRST; } else byte = FIFTH; break; case FIFTH: if (expected_byte_count == 5) { invalid(); byte = FIRST; } else byte = SIXTH; break; case SIXTH: invalid(); byte = FIRST; break; } } // We use fprintf(stderr) instead of libgroff's debug() because we need // to output longs, and libgroff's errprint() doesn't support that. void utf8::invalid() { if (is_debugging && emit_invalid_utf8_warning) { fprintf(stderr, " invalid UTF-8 sequence(s) in input stream:" " replacing each such sequence with 0xFFFD\n"); emit_invalid_utf8_warning = false; } unicode_entity(0xFFFD); byte = FIRST; } void utf8::incomplete() { if (is_debugging && emit_incomplete_utf8_warning) { fprintf(stderr, " incomplete UTF-8 sequence(s) in input stream:" " replacing each such sequence with 0xFFFD\n"); emit_incomplete_utf8_warning = false; } unicode_entity(0xFFFD); byte = FIRST; } // Conversion from UTF-8 to Unicode. void conversion_utf8(FILE *fp, const string &data) { utf8 u(fp); int len = data.length(); const unsigned char *ptr = (const unsigned char *)data.contents(); for (int i = 0; i < len; i++) u.add(ptr[i]); int c = -1; while ((c = getc(fp)) != EOF) u.add(c); return; } // Conversion from cp1047 (EBCDIC) to UTF-8. 
void conversion_cp1047(FILE *fp, const string &data) { static unsigned char cp1047[] = { 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, // 0x00 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, // 0x10 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B, // 0x20 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07, 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, // 0x30 0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A, 0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5, // 0x40 0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C, 0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF, // 0x50 0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E, 0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5, // 0x60 0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F, 0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF, // 0x70 0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22, 0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, // 0x80 0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1, 0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, // 0x90 0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4, 0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, // 0xA0 0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE, 0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC, // 0xB0 0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7, 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, // 0xC0 0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5, 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, // 0xD0 0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF, 0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, // 0xE0 0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, // 0xF0 0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F, }; int len = data.length(); const unsigned char *ptr = (const unsigned char *)data.contents(); for (int i = 0; i < len; i++) unicode_entity(cp1047[ptr[i]]); int c = -1; while ((c = getc(fp)) != EOF) unicode_entity(cp1047[c]); } // Locale-sensible 
conversion. #if HAVE_ICONV void conversion_iconv(FILE *fp, const string &data, char *enc) { iconv_t handle = iconv_open(UNICODE, enc); if (handle == (iconv_t)-1) { if (errno == EINVAL) { error("encoding system '%1' not supported by iconv()", enc); return; } fatal("iconv_open failed"); } char inbuf[BUFSIZ]; int outbuf[BUFSIZ]; char *outptr = (char *)outbuf; size_t outbytes_left = BUFSIZ * sizeof (int); // Handle 'data'. char *inptr = (char *)data.contents(); size_t inbytes_left = data.length(); char *limit; while (inbytes_left > 0) { size_t status = iconv(handle, (ICONV_CONST char **)&inptr, &inbytes_left, &outptr, &outbytes_left); if (status == (size_t)-1) { if (errno == EILSEQ) { // Invalid byte sequence. XXX inptr++; inbytes_left--; } else if (errno == E2BIG) { // Output buffer is full. limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left; for (int *ptr = outbuf; (char *)ptr < limit; ptr++) unicode_entity(*ptr); memmove(outbuf, outptr, outbytes_left); outptr = (char *)outbuf + outbytes_left; outbytes_left = BUFSIZ * sizeof (int) - outbytes_left; } else if (errno == EINVAL) { // 'data' ends with partial input sequence. memcpy(inbuf, inptr, inbytes_left); break; } } } // Handle 'fp' and switch to 'inbuf'. size_t read_bytes; char *read_start = inbuf + inbytes_left; while ((read_bytes = fread(read_start, 1, BUFSIZ - inbytes_left, fp)) > 0) { inptr = inbuf; inbytes_left += read_bytes; while (inbytes_left > 0) { size_t status = iconv(handle, (ICONV_CONST char **)&inptr, &inbytes_left, &outptr, &outbytes_left); if (status == (size_t)-1) { if (errno == EILSEQ) { // Invalid byte sequence. XXX inptr++; inbytes_left--; } else if (errno == E2BIG) { // Output buffer is full. 
limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left; for (int *ptr = outbuf; (char *)ptr < limit; ptr++) unicode_entity(*ptr); memmove(outbuf, outptr, outbytes_left); outptr = (char *)outbuf + outbytes_left; outbytes_left = BUFSIZ * sizeof (int) - outbytes_left; } else if (errno == EINVAL) { // 'inbuf' ends with partial input sequence. memmove(inbuf, inptr, inbytes_left); break; } } } read_start = inbuf + inbytes_left; } iconv_close(handle); // XXX use ferror? limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left; for (int *ptr = outbuf; (char *)ptr < limit; ptr++) unicode_entity(*ptr); } #endif /* HAVE_ICONV */ // --------------------------------------------------------- // Handle Byte Order Mark. // // Since we have a chicken-and-egg problem it's necessary // to handle the BOM manually if it is in the data stream. // As documented in the Unicode book it is very unlikely // that any normal text file (regardless of the encoding) // starts with the bytes which represent a BOM. // // Return the BOM in string 'BOM'; 'data' then starts with // the byte after the BOM. This function reads (at most) // four bytes from the data stream. // // Return encoding if a BOM is found, NULL otherwise. // --------------------------------------------------------- const char * get_BOM(FILE *fp, string &BOM, string &data) { // The BOM is U+FEFF. We have thus the following possible // representations. 
// // UTF-8: 0xEFBBBF // UTF-16: 0xFEFF or 0xFFFE // UTF-32: 0x0000FEFF or 0xFFFE0000 static struct { int len; const char *str; const char *name; } BOM_table[] = { {4, "\x00\x00\xFE\xFF", "UTF-32"}, {4, "\xFF\xFE\x00\x00", "UTF-32"}, {3, "\xEF\xBB\xBF", "UTF-8"}, {2, "\xFE\xFF", "UTF-16"}, {2, "\xFF\xFE", "UTF-16"}, }; const int BOM_table_len = sizeof (BOM_table) / sizeof (BOM_table[0]); char BOM_string[4]; const char *retval = NULL; int len; for (len = 0; len < 4; len++) { int c = getc(fp); if (c == EOF) break; BOM_string[len] = char(c); } int i; for (i = 0; i < BOM_table_len; i++) { if (BOM_table[i].len <= len && memcmp(BOM_string, BOM_table[i].str, BOM_table[i].len) == 0) break; } int j = 0; if (i < BOM_table_len) { for (; j < BOM_table[i].len; j++) BOM += BOM_string[j]; retval = BOM_table[i].name; } for (; j < len; j++) data += BOM_string[j]; return retval; } // --------------------------------------------------------- // Get first two lines from input stream. // // Return string (allocated with 'new') without zero bytes // or NULL in case no coding tag can occur in the data // (which is stored unmodified in 'data'). // --------------------------------------------------------- char * get_tag_lines(FILE *fp, string &data) { int newline_count = 0; int c, prev = -1; // Handle CR, LF, and CRLF as line separators. for (int i = 0; i < data.length(); i++) { c = data[i]; if (c == '\n' || c == '\r') newline_count++; if (c == '\n' && prev == '\r') newline_count--; prev = c; } if (newline_count > 1) return NULL; bool emit_warning = true; for (int lines = newline_count; lines < 2; lines++) { while ((c = getc(fp)) != EOF) { if (c == '\0' && is_debugging && emit_warning) { warning("null byte(s) found in input stream:" " search for coding tag might return false result"); emit_warning = false; } data += char(c); if (c == '\n' || c == '\r') break; } // Handle CR, LF, and CRLF as line separators. 
if (c == '\r') { c = getc(fp); if (c != EOF && c != '\n') ungetc(c, fp); else data += char(c); } } return data.extract(); } // --------------------------------------------------------- // Check whether C string starts with a comment. // // Return 1 if true, 0 otherwise. // --------------------------------------------------------- int is_comment_line(char *s) { if (!s || !*s) return 0; if (*s == '.' || *s == '\'') { s++; while (*s == ' ' || *s == '\t') s++; if (*s && *s == '\\') { s++; if (*s == '"' || *s == '#') return 1; } } else if (*s == '\\') { s++; if (*s == '#') return 1; } return 0; } // --------------------------------------------------------- // Get a value/variable pair from a local variables list // in a C string which look like this: // // : ; : ; ... // // Leading and trailing blanks are ignored. There might be // more than one blank after ':' and ';'. // // Return position of next value/variable pair or NULL if // at end of data. // --------------------------------------------------------- char * get_variable_value_pair(char *d1, char **variable, char **value) { static char var[MAX_VAR_LEN], val[MAX_VAR_LEN]; *variable = var; *value = val; while (*d1 == ' ' || *d1 == '\t') d1++; // Get variable. int l = 0; while (l < MAX_VAR_LEN - 1 && *d1 && !strchr(";: \t", *d1)) var[l++] = *(d1++); var[l] = 0; // Skip everything until ':', ';', or end of data. while (*d1 && *d1 != ':' && *d1 != ';') d1++; val[0] = 0; if (!*d1) return NULL; if (*d1 == ';') return d1 + 1; d1++; while (*d1 == ' ' || *d1 == '\t') d1++; // Get value. l = 0; while (l < MAX_VAR_LEN - 1 && *d1 && !strchr("; \t", *d1)) val[l++] = *(d1++); val[l] = 0; // Skip everything until ';' or end of data. while (*d1 && *d1 != ';') d1++; if (*d1 == ';') return d1 + 1; return NULL; } // --------------------------------------------------------- // Check coding tag in the read buffer. // // We search for the following line: // // ... -*--*- // // ('...' might be anything). 
// // can be one of the following syntax forms at the // beginning of the line: // // .\" .\# '\" '\# \# // // There can be whitespace after the leading '.' or "'". // // The local variables list must occur within the first // comment block at the very beginning of the data stream. // // Within the , we search for // // coding: // // which specifies the coding system used for the data // stream. // // Return if found, NULL otherwise. // // Note that null bytes in the data are skipped before applying // the algorithm. This should work even with files encoded as // UTF-16 or UTF-32 (or its siblings) in most cases. // --------------------------------------------------------- char * check_coding_tag(FILE *fp, string &data) { char *inbuf = get_tag_lines(fp, data); char *lineend; for (char *p = inbuf; is_comment_line(p); p = lineend + 1) { if ((lineend = strchr(p, '\n')) == NULL) break; *lineend = 0; // switch temporarily to '\0' char *d1 = strstr(p, "-*-"); char *d2 = 0; if (d1) d2 = strstr(d1 + 3, "-*-"); *lineend = '\n'; // restore newline if (!d1 || !d2) continue; *d2 = 0; // switch temporarily to '\0' d1 += 3; while (d1) { char *variable, *value; d1 = get_variable_value_pair(d1, &variable, &value); if (!strcasecmp(variable, "coding")) { *d2 = '-'; // restore '-' free(inbuf); return value; } } *d2 = '-'; // restore '-' } free(inbuf); return NULL; } char * detect_file_encoding(FILE *fp) { #ifdef HAVE_UCHARDET uchardet_t ud = NULL; struct stat stat_buf; size_t len, read_bytes; char *data = NULL; int res, current_position; const char *charset; char *ret = NULL; current_position = ftell(fp); /* Due to BOM and tag detection, we are not at the beginning of the file. 
*/ rewind(fp); if (fstat(fileno(fp), &stat_buf) != 0) { error("fstat: %1", strerror(errno)); goto end; } len = stat_buf.st_size; if (is_debugging) fprintf(stderr, " len: %lu\n", (unsigned long)len); if (len == 0) goto end; data = (char *)calloc(len, 1); read_bytes = fread(data, 1, len, fp); if (read_bytes == 0) { error("fread: %1", strerror(errno)); goto end; } /* We rewind back to the original position */ if (fseek(fp, current_position, SEEK_SET) != 0) { fatal("fseek: %1", strerror(errno)); goto end; } ud = uchardet_new(); res = uchardet_handle_data(ud, data, len); if (res != 0) { debug(" uchardet_handle_data: error %1\n", res); goto end; } if (is_debugging) fprintf(stderr, " uchardet read: %lu bytes\n", (unsigned long)read_bytes); uchardet_data_end(ud); charset = uchardet_get_charset(ud); if (is_debugging) { if (charset) fprintf(stderr, " charset: %s\n", charset); else fprintf(stderr, " charset is NULL\n"); } /* uchardet 0.0.1 could return an empty string instead of NULL */ if (charset && *charset) { ret = (char *)malloc(strlen(charset) + 1); strcpy(ret, charset); } end: if (ud) uchardet_delete(ud); if (data) free(data); return ret; #else /* not HAVE_UCHARDET */ return NULL; #endif /* not HAVE_UCHARDET */ } // --------------------------------------------------------- // Handle an input file. If `filename` is "-", read the // standard input stream. // // Return 1 on success, 0 otherwise. // --------------------------------------------------------- int do_file(const char *filename) { FILE *fp; string BOM, data; bool is_seekable = false; string reported_filename; // TODO: Consider moving some of this into a `quoted_file_name` // function in libgroff. 
if (strcmp(filename, "-") == 0) { fp = stdin; reported_filename = string(""); } else { fp = fopen(filename, FOPEN_RB); reported_filename = "'" + string(filename) + "'"; } if (!fp) { error("can't open %1: %2", reported_filename.contents(), strerror(errno)); return 0; } if (is_debugging) fprintf(stderr, "processing %s\n", reported_filename.contents()); if (fseek(fp, 0L, SEEK_SET) == 0) is_seekable = true; else { SET_BINARY(fileno(fp)); if (is_debugging) fprintf(stderr, " stream is not seekable: %s\n", strerror(errno)); } const char *BOM_encoding = get_BOM(fp, BOM, data); // Determine the encoding. char *encoding; int must_free_encoding = 0; if (user_encoding[0]) { if (is_debugging) { fprintf(stderr, " user-specified encoding '%s', " "no search for coding tag\n", user_encoding); if (BOM_encoding && strcmp(BOM_encoding, user_encoding)) fprintf(stderr, " but BOM in data stream implies encoding '%s'!\n", BOM_encoding); } encoding = (char *)user_encoding; } else if (BOM_encoding) { if (is_debugging) fprintf(stderr, " found BOM, no search for coding tag\n"); encoding = (char *)BOM_encoding; } else { // 'check_coding_tag' returns a pointer to a static array (or NULL). char *file_encoding = check_coding_tag(fp, data); if (!file_encoding) { if (is_debugging) fprintf(stderr, " no coding tag\n"); if (is_seekable) file_encoding = detect_file_encoding(fp); if (!file_encoding) { if (is_debugging) fprintf(stderr, " could not detect encoding with uchardet\n"); file_encoding = fallback_encoding; } else must_free_encoding = 1; } else if (is_debugging) fprintf(stderr, " coding tag: '%s'\n", file_encoding); encoding = file_encoding; } strncpy(encoding_string, encoding, MAX_VAR_LEN - 1); encoding_string[MAX_VAR_LEN - 1] = 0; if (must_free_encoding) free(encoding); encoding = encoding_string; // Translate from MIME & Emacs encoding names to locale encoding names. 
encoding = emacs2mime(encoding_string); if (encoding[0] == '\0') { error("encoding '%1' not supported, not a portable encoding", encoding_string); return 0; } if (is_debugging) fprintf(stderr, " encoding used: '%s'\n", encoding); if (!raw_flag) { string fn(filename); fn += '\0'; normalize_for_lf(fn); printf(".lf 1 %s\n", fn.contents()); } int success = 1; // Call converter (converters write to stdout). if (!strcasecmp(encoding, "ISO-8859-1")) conversion_latin1(fp, BOM + data); else if (!strcasecmp(encoding, "UTF-8")) conversion_utf8(fp, data); else if (!strcasecmp(encoding, "cp1047")) conversion_cp1047(fp, BOM + data); else { #if HAVE_ICONV conversion_iconv(fp, BOM + data, encoding); #else error("encoding system '%1' not supported", encoding); success = 0; #endif /* HAVE_ICONV */ } if (fp != stdin) fclose(fp); return success; } // --------------------------------------------------------- // Print usage. // --------------------------------------------------------- void usage(FILE *stream) { fprintf(stream, "usage: %s [-dr] [-D fallback-encoding] [-e encoding] [file ...]\n" "usage: %s {-v | --version}\n" "usage: %s {-h | --help}\n", program_name, program_name, program_name); if (stdout == stream) { fprintf(stream, "\n" "Read each file, convert its encoded characters to a form GNU" " troff(1)\n" "can interpret, and send the result to the standard output stream.\n" "The default fallback encoding is '%s'. See the preconv(1) manual" " page.\n", fallback_encoding); exit(EXIT_SUCCESS); } } // --------------------------------------------------------- // Main routine. // --------------------------------------------------------- int main(int argc, char **argv) { program_name = argv[0]; // Determine the fallback encoding. This must be done before // getopt() is called since the usage message shows the fallback // encoding. 
setlocale(LC_ALL, ""); char *locale = getlocale(LC_CTYPE); if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX")) strcpy(fallback_encoding, "latin1"); else { strncpy(fallback_encoding, locale_charset(), MAX_VAR_LEN - 1); fallback_encoding[MAX_VAR_LEN - 1] = 0; } program_name = argv[0]; int opt; static const struct option long_options[] = { { "help", no_argument, 0, 'h' }, { "version", no_argument, 0, 'v' }, { NULL, 0, 0, 0 } }; // Parse the command-line options. while ((opt = getopt_long(argc, argv, "dD:e:hrv", long_options, NULL)) != EOF) switch (opt) { case 'v': printf("GNU preconv (groff) version %s %s iconv support and %s uchardet support\n", Version_string, #ifdef HAVE_ICONV "with", #else "without", #endif /* HAVE_ICONV */ #ifdef HAVE_UCHARDET "with" #else "without" #endif /* HAVE_UCHARDET */ ); exit(0); break; case 'd': is_debugging = true; break; case 'e': if (optarg) { strncpy(user_encoding, optarg, MAX_VAR_LEN - 1); user_encoding[MAX_VAR_LEN - 1] = 0; } else user_encoding[0] = 0; break; case 'D': if (optarg) { strncpy(fallback_encoding, optarg, MAX_VAR_LEN - 1); fallback_encoding[MAX_VAR_LEN - 1] = 0; } break; case 'r': raw_flag = 1; break; case 'h': usage(stdout); break; case '?': usage(stderr); exit(1); break; default: assert(0); } int nbad = 0; if (is_debugging) fprintf(stderr, "fallback encoding: '%s'\n", fallback_encoding); if (optind >= argc) nbad += !do_file("-"); else for (int i = optind; i < argc; i++) nbad += !do_file(argv[i]); if (ferror(stdout) || fflush(stdout) < 0) fatal("output error"); return nbad != 0; } // Local Variables: // fill-column: 72 // mode: C++ // End: // vim: set cindent noexpandtab shiftwidth=2 textwidth=72: