/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
     Written by Werner Lemberg (wl@gnu.org)

This file is part of groff.

groff is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or
(at your option) any later version.

groff is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>. */

#include "lib.h"

#include <assert.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/stat.h>
#ifdef HAVE_UCHARDET
#include <uchardet/uchardet.h>
#endif

#include "errarg.h"
#include "error.h"
#include "localcharset.h"
#include "nonposix.h"
#include "stringclass.h"
#include "lf.h"

#include <locale.h>

#if HAVE_ICONV
# include <iconv.h>
# ifdef WORDS_BIGENDIAN
#  define UNICODE "UTF-32BE"
# else
#  define UNICODE "UTF-32LE"
# endif
#endif

// Capacity (in bytes) of the fixed-size encoding name buffers below; the
// same limit sizes the variable/value buffers that
// get_variable_value_pair() fills when parsing a local variables list.
#define MAX_VAR_LEN 100

// Defined outside this file, with C linkage.
extern "C" const char *Version_string;

// Encoding name buffers.
// NOTE(review): presumably filled in by option/locale handling that is
// not part of the functions documented here -- confirm against do_file()
// and main().
char fallback_encoding[MAX_VAR_LEN];
char user_encoding[MAX_VAR_LEN];
char encoding_string[MAX_VAR_LEN];
// When true, the decoding and detection routines print additional
// diagnostics to the standard error stream.
bool is_debugging = false;
// NOTE(review): not referenced by the conversion routines in this part
// of the file; its meaning has to be confirmed where it is used.
int raw_flag = 0;

// One entry of an encoding name translation table (see 'emacs_to_mime').
struct conversion {
  const char *from;	// name to look up (compared case-insensitively)
  const char *to;	// name returned for a match
};

// The official list of MIME tags can be found at
//
//   http://www.iana.org/assignments/character-sets
//
// For encodings which don't have a MIME tag we use GNU iconv's encoding
// names (which also work with the portable GNU libiconv package).  They
// are marked with '*'.
//
// Encodings specific to XEmacs and Emacs are marked as such; no mark means
// that they are used by both Emacs and XEmacs.
//
// Encodings marked with '--' are special to Emacs, XEmacs, or other
// applications and shouldn't be used for data exchange.
//
// 'Not covered' means that the encoding can be handled neither by GNU iconv
// nor by libiconv, or just one of them has support for it.
//
// A special case is the VIQR encoding: despite having a MIME tag, it is
// missing from both libiconv 1.10 and iconv (coming with GNU libc 2.3.6).
//
// Finally, we add all aliases of GNU iconv for 'ascii', 'latin1', and
// 'utf8' to catch those encoding names before iconv is called.
//
// Note that most entries are commented out -- only a small, (rather)
// reliable and stable subset of encodings is recognized (for coding tags)
// which are still in greater use today (January 2006).  Most notably, all
// Windows-specific encodings are not selected because they lack stability:
// Microsoft has changed the mappings instead of creating new versions.
//
// Please contact the groff list if you find the selection inadequate.

// Translation table from Emacs/XEmacs coding system names (plus some
// iconv aliases) to the encoding names used for the conversion.
// emacs2mime() scans the table front to back, comparing the 'from' field
// case-insensitively, and stops at the {NULL, NULL} sentinel entry which
// must stay last.  Entries which are commented out are not recognized.
static const conversion
emacs_to_mime[] = {
  {"ascii",				"US-ASCII"},	// Emacs
  {"big5",				"Big5"},
  {"chinese-big5",			"Big5"},	// Emacs
  {"chinese-euc",			"GB2312"},	// XEmacs
  {"chinese-iso-8bit",			"GB2312"},	// Emacs
  {"cn-big5",				"Big5"},
  {"cn-gb",				"GB2312"},	// Emacs
  {"cn-gb-2312",			"GB2312"},
  {"cp878",				"KOI8-R"},	// Emacs
  {"cp1047",				"CP1047"},	// EBCDIC
  {"csascii",				"US-ASCII"},	// alias
  {"csisolatin1",			"ISO-8859-1"},	// alias
  {"cyrillic-iso-8bit",			"ISO-8859-5"},	// Emacs
  {"cyrillic-koi8",			"KOI8-R"},	// not KOI8!, Emacs
  {"euc-china",				"GB2312"},	// Emacs
  {"euc-cn",				"GB2312"},	// Emacs
  {"euc-japan",				"EUC-JP"},
  {"euc-japan-1990",			"EUC-JP"},	// Emacs
  {"euc-jp",				"EUC-JP"},
  {"euc-korea",				"EUC-KR"},
  {"euc-kr",				"EUC-KR"},
  {"gb2312",				"GB2312"},
  {"greek-iso-8bit",			"ISO-8859-7"},
  {"iso-10646/utf8",			"UTF-8"},	// alias
  {"iso-10646/utf-8",			"UTF-8"},	// alias
  {"iso-8859-1",			"ISO-8859-1"},
  {"iso-8859-13",			"ISO-8859-13"},	// Emacs
  {"iso-8859-15",			"ISO-8859-15"},
  {"iso-8859-2",			"ISO-8859-2"},
  {"iso-8859-5",			"ISO-8859-5"},
  {"iso-8859-7",			"ISO-8859-7"},
  {"iso-8859-9",			"ISO-8859-9"},
  {"iso-latin-1",			"ISO-8859-1"},
  {"iso-latin-2",			"ISO-8859-2"},	// Emacs
  {"iso-latin-5",			"ISO-8859-9"},	// Emacs
  {"iso-latin-7",			"ISO-8859-13"},	// Emacs
  {"iso-latin-9",			"ISO-8859-15"},	// Emacs
  {"japanese-iso-8bit",			"EUC-JP"},	// Emacs
  {"japanese-euc",			"EUC-JP"},	// XEmacs
  {"jis8",				"EUC-JP"},	// XEmacs
  {"koi8",				"KOI8-R"},	// not KOI8!, Emacs
  {"koi8-r",				"KOI8-R"},
  {"korean-euc",			"EUC-KR"},	// XEmacs
  {"korean-iso-8bit",			"EUC-KR"},	// Emacs
  {"latin1",				"ISO-8859-1"},  // alias
  {"latin-0",				"ISO-8859-15"},	// Emacs
  {"latin-1",				"ISO-8859-1"},	// Emacs
  {"latin-2",				"ISO-8859-2"},	// Emacs
  {"latin-5",				"ISO-8859-9"},	// Emacs
  {"latin-7",				"ISO-8859-13"},	// Emacs
  {"latin-9",				"ISO-8859-15"},	// Emacs
  {"mule-utf-16",			"UTF-16"},	// Emacs
  {"mule-utf-16be",			"UTF-16BE"},	// Emacs
  {"mule-utf-16-be",			"UTF-16BE"},	// Emacs
  {"mule-utf-16be-with-signature",	"UTF-16"},	// Emacs, not UTF-16BE
  {"mule-utf-16le",			"UTF-16LE"},	// Emacs
  {"mule-utf-16-le",			"UTF-16LE"},	// Emacs
  {"mule-utf-16le-with-signature",	"UTF-16"},	// Emacs, not UTF-16LE
  {"mule-utf-8",			"UTF-8"},	// Emacs
  {"us-ascii",				"US-ASCII"},	// Emacs
  {"utf8",				"UTF-8"},	// alias
  {"utf-16",				"UTF-16"},	// Emacs
  {"utf-16be",				"UTF-16BE"},	// Emacs
  {"utf-16-be",				"UTF-16BE"},	// Emacs
  {"utf-16be-with-signature",		"UTF-16"},	// Emacs, not UTF-16BE
  {"utf-16-be-with-signature",		"UTF-16"},	// Emacs, not UTF-16BE
  {"utf-16le",				"UTF-16LE"},	// Emacs
  {"utf-16-le",				"UTF-16LE"},	// Emacs
  {"utf-16le-with-signature",		"UTF-16"},	// Emacs, not UTF-16LE
  {"utf-16-le-with-signature",		"UTF-16"},	// Emacs, not UTF-16LE
  {"utf-8",				"UTF-8"},	// Emacs

//  {"alternativnyj",			""},		// ?
//  {"arabic-iso-8bit",			"ISO-8859-6"},	// Emacs
//  {"binary",				""},		// --
//  {"chinese-hz",			"HZ-GB-2312"},	// Emacs
//  {"chinese-iso-7bit",		"ISO-2022-CN"},	// Emacs
//  {"chinese-iso-8bit-with-esc",	""},		// --
//  {"compound-text",			""},		// --
//  {"compound-text-with-extension",	""},		// --
//  {"cp1125",				"cp1125"},	// *
//  {"cp1250",				"windows-1250"},// Emacs
//  {"cp1251",				"windows-1251"},// Emacs
//  {"cp1252",				"windows-1252"},// Emacs
//  {"cp1253",				"windows-1253"},// Emacs
//  {"cp1254",				"windows-1254"},// Emacs
//  {"cp1255",				"windows-1255"},// Emacs
//  {"cp1256",				"windows-1256"},// Emacs
//  {"cp1257",				"windows-1257"},// Emacs
//  {"cp1258",				"windows-1258"},// Emacs
//  {"cp437",				"cp437"},	// Emacs
//  {"cp720",				""},		// not covered
//  {"cp737",				"cp737"},	// *, Emacs
//  {"cp775",				"cp775"},	// Emacs
//  {"cp850",				"cp850"},	// Emacs
//  {"cp851",				"cp851"},	// Emacs
//  {"cp852",				"cp852"},	// Emacs
//  {"cp855",				"cp855"},	// Emacs
//  {"cp857",				"cp857"},	// Emacs
//  {"cp860",				"cp860"},	// Emacs
//  {"cp861",				"cp861"},	// Emacs
//  {"cp862",				"cp862"},	// Emacs
//  {"cp863",				"cp863"},	// Emacs
//  {"cp864",				"cp864"},	// Emacs
//  {"cp865",				"cp865"},	// Emacs
//  {"cp866",				"cp866"},	// Emacs
//  {"cp866u",				"cp1125"},	// *, Emacs
//  {"cp869",				"cp869"},	// Emacs
//  {"cp874",				"cp874"},	// *, Emacs
//  {"cp932",				"cp932"},	// *, Emacs
//  {"cp936",				"cp936"},	// Emacs
//  {"cp949",				"cp949"},	// *, Emacs
//  {"cp950",				"cp950"},	// *, Emacs
//  {"ctext",				""},		// --
//  {"ctext-no-compositions",		""},		// --
//  {"ctext-with-extensions",		""},		// --
//  {"cyrillic-alternativnyj",		""},		// ?, Emacs
//  {"cyrillic-iso-8bit-with-esc",	""},		// --
//  {"cyrillic-koi8-t",			"KOI8-T"},	// *, Emacs
//  {"devanagari",			""},		// not covered
//  {"dos",				""},		// --
//  {"emacs-mule",			""},		// --
//  {"euc-jisx0213",			"EUC-JISX0213"},// *, XEmacs?
//  {"euc-jisx0213-with-esc",		""},		// XEmacs?
//  {"euc-taiwan",			"EUC-TW"},	// *, Emacs
//  {"euc-tw",				"EUC-TW"},	// *, Emacs
//  {"georgian-ps",			"GEORGIAN-PS"},	// *, Emacs
//  {"greek-iso-8bit-with-esc",		""},		// --
//  {"hebrew-iso-8bit",			"ISO-8859-8"},	// Emacs
//  {"hebrew-iso-8bit-with-esc",	""},		// --
//  {"hz",				"HZ-GB-2312"},
//  {"hz-gb-2312",			"HZ-GB-2312"},
//  {"in-is13194",			""},		// not covered
//  {"in-is13194-devanagari",		""},		// not covered
//  {"in-is13194-with-esc",		""},		// --
//  {"iso-2022-7",			""},		// XEmacs?
//  {"iso-2022-7bit",			""},		// --
//  {"iso-2022-7bit-lock",		""},		// --
//  {"iso-2022-7bit-lock-ss2",		""},		// --
//  {"iso-2022-7bit-ss2",		""},		// --
//  {"iso-2022-8",			""},		// XEmacs?
//  {"iso-2022-8bit",			""},		// XEmacs?
//  {"iso-2022-8bit-lock",		""},		// XEmacs?
//  {"iso-2022-8bit-lock-ss2",		""},		// XEmacs?
//  {"iso-2022-8bit-ss2",		""},		// --
//  {"iso-2022-cjk",			""},		// --
//  {"iso-2022-cn",			"ISO-2022-CN"},	// Emacs
//  {"iso-2022-cn-ext",			"ISO-2022-CN-EXT"},// Emacs
//  {"iso-2022-int-1",			""},		// --
//  {"iso-2022-jp",			"ISO-2022-JP"},
//  {"iso-2022-jp-1978-irv",		"ISO-2022-JP"},
//  {"iso-2022-jp-2",			"ISO-2022-JP-2"},
//  {"iso-2022-jp-3",			"ISO-2022-JP-3"},// *, XEmacs?
//  {"iso-2022-jp-3-compatible",	""},		// XEmacs?
//  {"iso-2022-jp-3-strict",		"ISO-2022-JP-3"},// *, XEmacs?
//  {"iso-2022-kr",			"ISO-2022-KR"},
//  {"iso-2022-lock",			""},		// XEmacs?
//  {"iso-8859-10",			"ISO-8859-10"},	// Emacs
//  {"iso-8859-11",			"ISO-8859-11"},	// *, Emacs
//  {"iso-8859-14",			"ISO-8859-14"},	// Emacs
//  {"iso-8859-16",			"ISO-8859-16"},
//  {"iso-8859-3",			"ISO-8859-3"},
//  {"iso-8859-4",			"ISO-8859-4"},
//  {"iso-8859-6",			"ISO-8859-6"},
//  {"iso-8859-8",			"ISO-8859-8"},
//  {"iso-8859-8-e",			"ISO-8859-8"},
//  {"iso-8859-8-i",			"ISO-8859-8"},	// Emacs
//  {"iso-latin-10",			"ISO-8859-16"},	// Emacs
//  {"iso-latin-1-with-esc",		""},		// --
//  {"iso-latin-2-with-esc",		""},		// --
//  {"iso-latin-3",			"ISO-8859-3"},	// Emacs
//  {"iso-latin-3-with-esc",		""},		// --
//  {"iso-latin-4",			"ISO-8859-4"},	// Emacs
//  {"iso-latin-4-with-esc",		""},		// --
//  {"iso-latin-5-with-esc",		""},		// --
//  {"iso-latin-6",			"ISO-8859-10"},	// Emacs
//  {"iso-latin-8",			"ISO-8859-14"},	// Emacs
//  {"iso-safe",				""},		// --
//  {"japanese-iso-7bit-1978-irv",	"ISO-2022-JP"},	// Emacs
//  {"japanese-iso-8bit-with-esc",	""},		// --
//  {"japanese-shift-jis",		"Shift_JIS"},	// Emacs
//  {"japanese-shift-jisx0213",		""},		// XEmacs?
//  {"jis7",				"ISO-2022-JP"},	// Xemacs
//  {"junet",				"ISO-2022-JP"},
//  {"koi8-t",				"KOI8-T"},	// *, Emacs
//  {"koi8-u",				"KOI8-U"},	// Emacs
//  {"korean-iso-7bit-lock",		"ISO-2022-KR"},
//  {"korean-iso-8bit-with-esc",	""},		// --
//  {"lao",				""},		// not covered
//  {"lao-with-esc",			""},		// --
//  {"latin-10",			"ISO-8859-16"},	// Emacs
//  {"latin-3",				"ISO-8859-3"},	// Emacs
//  {"latin-4",				"ISO-8859-4"},	// Emacs
//  {"latin-6",				"ISO-8859-10"},	// Emacs
//  {"latin-8",				"ISO-8859-14"},	// Emacs
//  {"mac",				""},		// --
//  {"mac-roman",			"MACINTOSH"},	// Emacs
//  {"mik",				""},		// not covered
//  {"next",				"NEXTSTEP"},	// *, Emacs
//  {"no-conversion",			""},		// --
//  {"old-jis",				"ISO-2022-JP"},
//  {"pt154",				"PT154"},	// Emacs
//  {"raw-text",			""},		// --
//  {"ruscii",				"cp1125"},	// *, Emacs
//  {"shift-jis",			"Shift_JIS"},	// XEmacs
//  {"shift_jis",			"Shift_JIS"},
//  {"shift_jisx0213",			"Shift_JISX0213"},// *, XEmacs?
//  {"sjis",				"Shift_JIS"},	// Emacs
//  {"tcvn",				"TCVN"},	// *, Emacs
//  {"tcvn-5712",			"TCVN"},	// *, Emacs
//  {"thai-tis620",			"TIS-620"},
//  {"thai-tis620-with-esc",		""},		// --
//  {"th-tis620",			"TIS-620"},
//  {"tibetan",				""},		// not covered
//  {"tibetan-iso-8bit",		""},		// not covered
//  {"tibetan-iso-8bit-with-esc",	""},		// --
//  {"tis-620",				"TIS-620"},
//  {"tis620",				"TIS-620"},
//  {"undecided",			""},		// --
//  {"unix",				""},		// --
//  {"utf-7",				"UTF-7"},	// Emacs
//  {"utf-7-safe",			""},		// XEmacs?
//  {"utf-8-ws",			"UTF-8"},	// XEmacs?
//  {"vietnamese-tcvn",			"TCVN"},	// *, Emacs
//  {"vietnamese-viqr",			"VIQR"},	// not covered
//  {"vietnamese-viscii",		"VISCII"},
//  {"vietnamese-vscii",		""},		// not covered
//  {"viqr",				"VIQR"},	// not covered
//  {"viscii",				"VISCII"},
//  {"vscii",				""},		// not covered
//  {"windows-037",			""},		// not covered
//  {"windows-10000",			""},		// not covered
//  {"windows-10001",			""},		// not covered
//  {"windows-10006",			""},		// not covered
//  {"windows-10007",			""},		// not covered
//  {"windows-10029",			""},		// not covered
//  {"windows-10079",			""},		// not covered
//  {"windows-10081",			""},		// not covered
//  {"windows-1026",			""},		// not covered
//  {"windows-1200",			""},		// not covered
//  {"windows-1250",			"windows-1250"},
//  {"windows-1251",			"windows-1251"},
//  {"windows-1252",			"windows-1252"},
//  {"windows-1253",			"windows-1253"},
//  {"windows-1254",			"windows-1254"},
//  {"windows-1255",			"windows-1255"},
//  {"windows-1256",			"windows-1256"},
//  {"windows-1257",			"windows-1257"},
//  {"windows-1258",			"windows-1258"},
//  {"windows-1361",			"cp1361"},	// *, XEmacs
//  {"windows-437",			"cp437"},	// XEmacs
//  {"windows-500",			""},		// not covered
//  {"windows-708",			""},		// not covered
//  {"windows-709",			""},		// not covered
//  {"windows-710",			""},		// not covered
//  {"windows-720",			""},		// not covered
//  {"windows-737",			"cp737"},	// *, XEmacs
//  {"windows-775",			"cp775"},	// XEmacs
//  {"windows-850",			"cp850"},	// XEmacs
//  {"windows-852",			"cp852"},	// XEmacs
//  {"windows-855",			"cp855"},	// XEmacs
//  {"windows-857",			"cp857"},	// XEmacs
//  {"windows-860",			"cp860"},	// XEmacs
//  {"windows-861",			"cp861"},	// XEmacs
//  {"windows-862",			"cp862"},	// XEmacs
//  {"windows-863",			"cp863"},	// XEmacs
//  {"windows-864",			"cp864"},	// XEmacs
//  {"windows-865",			"cp865"},	// XEmacs
//  {"windows-866",			"cp866"},	// XEmacs
//  {"windows-869",			"cp869"},	// XEmacs
//  {"windows-874",			"cp874"},	// XEmacs
//  {"windows-875",			""},		// not covered
//  {"windows-932",			"cp932"},	// *, XEmacs
//  {"windows-936",			"cp936"},	// XEmacs
//  {"windows-949",			"cp949"},	// *, XEmacs
//  {"windows-950",			"cp950"},	// *, XEmacs
//  {"x-ctext",				""},		// --
//  {"x-ctext-with-extensions",		""},		// --

  {NULL,				NULL},
};

// ---------------------------------------------------------
// Convert encoding name from emacs to mime.
// ---------------------------------------------------------
char *
emacs2mime(char *emacs_enc)
{
  int emacs_enc_len = strlen(emacs_enc);
  if (emacs_enc_len > 4
      && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-dos"))
    emacs_enc[emacs_enc_len - 4] = 0;
  if (emacs_enc_len > 4
      && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-mac"))
    emacs_enc[emacs_enc_len - 4] = 0;
  if (emacs_enc_len > 5
      && !strcasecmp(emacs_enc + emacs_enc_len - 5, "-unix"))
    emacs_enc[emacs_enc_len - 5] = 0;
  for (const conversion *table = emacs_to_mime; table->from; table++)
    if (!strcasecmp(emacs_enc, table->from))
      return (char *)table->to;
  return emacs_enc;
}

// ---------------------------------------------------------
// Write code point 'u' to the standard output stream.
//
// Values below 0x80 are written as a single character.
// Everything else becomes a groff escape: '\~' for U+00A0,
// '\%' for U+00AD, and '\[uXXXX]' (at least four uppercase
// hexadecimal digits) for all other values.
// ---------------------------------------------------------
inline void
unicode_entity(int u)
{
  if (u < 0x80) {
    putchar(u);
    return;
  }
  switch (u) {
  // No-break space and soft hyphen are input characters only, not
  // glyphs, so they get dedicated escapes.  See groff_char(7).
  case 0xA0:
    putchar('\\');
    putchar('~');
    break;
  case 0xAD:
    putchar('\\');
    putchar('%');
    break;
  default:
    printf("\\[u%04X]", u);
    break;
  }
}

// ---------------------------------------------------------
// Conversion functions.  All functions take 'data', which
// normally holds the first two lines, and a file pointer.
// ---------------------------------------------------------

// Conversion from ISO-8859-1 (aka Latin-1) to Unicode.  Each input byte
// is passed on unchanged as a code point: first the bytes already held
// in 'data', then the rest of 'fp' up to end of file.
void
conversion_latin1(FILE *fp, const string &data)
{
  const int buffered_len = data.length();
  const unsigned char *buffered = (const unsigned char *)data.contents();
  int pos = 0;
  while (pos < buffered_len)
    unicode_entity(buffered[pos++]);
  for (int ch = getc(fp); ch != EOF; ch = getc(fp))
    unicode_entity(ch);
}

// A future version of groff shall support UTF-8 natively.
// In this case, the UTF-8 stuff here in this file will be
// moved to the troff program.

// Incremental UTF-8 decoder.  Input bytes are handed over one at a time
// with add(); each decoded code point, or U+FFFD for malformed input, is
// printed with unicode_entity().
struct utf8 {
  FILE *fp;			// stream given to the constructor
  unsigned char s[6];		// bytes of the sequence being collected
  // Position in 's' which receives the next input byte; FIRST means
  // that no multibyte sequence is pending.
  enum {
    FIRST = 0,
    SECOND,
    THIRD,
    FOURTH,
    FIFTH,
    SIXTH
  } byte;
  int expected_byte_count;	// sequence length announced by lead byte
  // The two warnings are printed at most once per decoder object (and
  // only if 'is_debugging' is set); these flags keep track of that.
  bool emit_invalid_utf8_warning;
  bool emit_incomplete_utf8_warning;
  utf8(FILE *);
  ~utf8();			// reports a sequence still pending
  void add(unsigned char);	// consume one input byte
  void invalid();		// emit U+FFFD for an invalid sequence
  void incomplete();		// emit U+FFFD for a truncated sequence
};

// Set up a decoder which waits for the first byte of a sequence and
// has not yet printed any warning.  'f' is only stored.
utf8::utf8(FILE *f)
{
  fp = f;
  byte = FIRST;
  expected_byte_count = 1;
  emit_invalid_utf8_warning = true;
  emit_incomplete_utf8_warning = true;
}

// A multibyte sequence which is still pending when the decoder goes
// away can't be completed anymore; report it as incomplete.
utf8::~utf8()
{
  if (byte == FIRST)
    return;
  incomplete();
}

// Consume input byte 'c'.
//
// A lead byte either yields an ASCII character directly, starts a
// multibyte sequence of two to six bytes, or is invalid.  A
// continuation byte (0x80-0xBF) is appended to the pending sequence;
// once the announced length is reached, the sequence is decoded and
// printed, or replaced with U+FFFD if it is overlong, greater than
// U+10FFFF, or five or six bytes long.  Any other byte arriving while a
// sequence is pending terminates that sequence as incomplete and is
// then processed as a new first byte.
inline void
utf8::add(unsigned char c)
{
  s[byte] = c;
  if (byte == FIRST) {
    if (c < 0x80) {
      unicode_entity(c);
      return;
    }
    // Derive the sequence length from the lead byte; 0 marks bytes
    // which can't start a sequence (stray continuation bytes, 0xFE,
    // and 0xFF).
    int announced_length;
    if (c < 0xC0)
      announced_length = 0;
    else if (c < 0xE0)
      announced_length = 2;
    else if (c < 0xF0)
      announced_length = 3;
    else if (c < 0xF8)
      announced_length = 4;
    else if (c < 0xFC)
      announced_length = 5;
    else if (c < 0xFE)
      announced_length = 6;
    else
      announced_length = 0;
    if (announced_length == 0) {
      invalid();
      return;
    }
    expected_byte_count = announced_length;
    byte = SECOND;
    return;
  }
  const bool is_continuation_byte = (c >= 0x80 && c <= 0xBF);
  if (!is_continuation_byte) {
    // incomplete() resets the state, so 'c' is handled as a first byte.
    incomplete();
    add(c);
    return;
  }
  switch (byte) {
  case FIRST:
    // can't happen
    break;
  case SECOND:
    if (expected_byte_count != 2) {
      byte = THIRD;
      break;
    }
    // Lead bytes 0xC0 and 0xC1 are overlong encodings.
    if (s[0] >= 0xC2)
      unicode_entity(((s[0] & 0x1F) << 6)
                     | (s[1] ^ 0x80));
    else
      invalid();
    byte = FIRST;
    break;
  case THIRD:
    if (expected_byte_count != 3) {
      byte = FOURTH;
      break;
    }
    // 0xE0 followed by a byte less than 0xA0 is an overlong encoding.
    if (s[0] >= 0xE1 || s[1] >= 0xA0)
      unicode_entity(((s[0] & 0x1F) << 12)
                     | ((s[1] ^ 0x80) << 6)
                     | (s[2] ^ 0x80));
    else
      invalid();
    byte = FIRST;
    break;
  case FOURTH:
    if (expected_byte_count != 4) {
      byte = FIFTH;
      break;
    }
    {
      // 0xF0 followed by a byte less than 0x90 is an overlong encoding.
      const bool is_shortest_form = (s[0] >= 0xF1 || s[1] >= 0x90);
      // We reject everything greater than 0x10FFFF.
      const bool is_in_range = (s[0] < 0xF4
                                || (s[0] == 0xF4 && s[1] < 0x90));
      if (is_shortest_form && is_in_range)
        unicode_entity(((s[0] & 0x07) << 18)
                       | ((s[1] ^ 0x80) << 12)
                       | ((s[2] ^ 0x80) << 6)
                       | (s[3] ^ 0x80));
      else
        invalid();
    }
    byte = FIRST;
    break;
  case FIFTH:
    if (expected_byte_count != 5) {
      byte = SIXTH;
      break;
    }
    // Five-byte sequences are never valid.
    invalid();
    byte = FIRST;
    break;
  case SIXTH:
    // Six-byte sequences are never valid.
    invalid();
    byte = FIRST;
    break;
  }
}

// We use fprintf(stderr) instead of libgroff's debug() because we need
// to output longs, and libgroff's errprint() doesn't support that.

// Replace an invalid sequence with U+FFFD and wait for a new first
// byte.  In debugging mode, the first call per decoder object also
// prints a warning to the standard error stream.
void
utf8::invalid()
{
  const bool must_warn = is_debugging && emit_invalid_utf8_warning;
  if (must_warn) {
    emit_invalid_utf8_warning = false;
    fprintf(stderr, "  invalid UTF-8 sequence(s) in input stream:"
                    " replacing each such sequence with 0xFFFD\n");
  }
  unicode_entity(0xFFFD);
  byte = FIRST;
}

// Replace a truncated sequence with U+FFFD and wait for a new first
// byte.  In debugging mode, the first call per decoder object also
// prints a warning to the standard error stream.
void
utf8::incomplete()
{
  const bool must_warn = is_debugging && emit_incomplete_utf8_warning;
  if (must_warn) {
    emit_incomplete_utf8_warning = false;
    fprintf(stderr, "  incomplete UTF-8 sequence(s) in input stream:"
                    " replacing each such sequence with 0xFFFD\n");
  }
  unicode_entity(0xFFFD);
  byte = FIRST;
}

// Conversion from UTF-8 to Unicode.
void
conversion_utf8(FILE *fp, const string &data)
{
  utf8 u(fp);
  int len = data.length();
  const unsigned char *ptr = (const unsigned char *)data.contents();
  for (int i = 0; i < len; i++)
    u.add(ptr[i]);
  int c = -1;
  while ((c = getc(fp)) != EOF)
    u.add(c);
  return;
}

// Conversion from cp1047 (EBCDIC) to Unicode.  Every input byte is
// translated with a 256-entry table to a code point in the range
// 0x00-0xFF, which is then printed with unicode_entity().  The bytes
// already held in 'data' come first, then the rest of 'fp'.
void
conversion_cp1047(FILE *fp, const string &data)
{
  // Indexed by the cp1047 byte value.
  static const unsigned char ebcdic_to_unicode[] = {
    0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F,	// 0x00
    0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87,	// 0x10
    0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B,	// 0x20
    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
    0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,	// 0x30
    0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
    0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5,	// 0x40
    0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
    0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF,	// 0x50
    0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
    0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5,	// 0x60
    0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
    0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF,	// 0x70
    0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
    0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,	// 0x80
    0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
    0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70,	// 0x90
    0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
    0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,	// 0xA0
    0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
    0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC,	// 0xB0
    0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
    0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,	// 0xC0
    0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
    0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,	// 0xD0
    0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
    0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,	// 0xE0
    0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,	// 0xF0
    0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F,
  };
  const int buffered_len = data.length();
  const unsigned char *buffered = (const unsigned char *)data.contents();
  int pos = 0;
  while (pos < buffered_len)
    unicode_entity(ebcdic_to_unicode[buffered[pos++]]);
  for (int ch = getc(fp); ch != EOF; ch = getc(fp))
    unicode_entity(ebcdic_to_unicode[ch]);
}

// Locale-sensible conversion.
#if HAVE_ICONV
// Print all code points iconv() has stored in 'outbuf' so far, this is,
// everything in front of 'outptr'.
static void
emit_iconv_output(const int *outbuf, const char *outptr)
{
  for (const int *ptr = outbuf; (const char *)ptr < outptr; ptr++)
    unicode_entity(*ptr);
}

// Convert the '*inbytes_left' bytes at '*inptr' with 'handle', storing
// the result in 'outbuf' (of 'outbuf_size' bytes; '*outptr' and
// '*outbytes_left' describe its unused rest).  Whenever 'outbuf' is
// full, its contents are printed and the buffer is reused from its
// start.  A byte starting an invalid sequence is skipped.
//
// Return true if the input ends with an incomplete multibyte sequence;
// its bytes are then at '*inptr', '*inbytes_left' giving their number.
// Return false if all input has been consumed.
static bool
convert_iconv_input(iconv_t handle, char **inptr, size_t *inbytes_left,
                    int *outbuf, size_t outbuf_size,
                    char **outptr, size_t *outbytes_left)
{
  while (*inbytes_left > 0) {
    size_t status = iconv(handle,
                          (ICONV_CONST char **)inptr, inbytes_left,
                          outptr, outbytes_left);
    if (status != (size_t)-1)
      continue;
    if (errno == EILSEQ) {
      // Invalid byte sequence.  XXX
      (*inptr)++;
      (*inbytes_left)--;
    }
    else if (errno == E2BIG) {
      // Output buffer is full.  Everything stored so far gets printed,
      // thus the whole buffer is available again afterwards.
      emit_iconv_output(outbuf, *outptr);
      *outptr = (char *)outbuf;
      *outbytes_left = outbuf_size;
    }
    else if (errno == EINVAL)
      return true;
    else
      // Retrying after an unexpected error would loop forever.
      fatal("iconv failed: %1", strerror(errno));
  }
  return false;
}

// Convert the bytes held in 'data', followed by the rest of 'fp', from
// encoding 'enc' to Unicode with iconv() and print the resulting code
// points with unicode_entity().
//
// If iconv doesn't know 'enc', an error message is issued and nothing
// is converted; any other failure to set up the conversion is fatal.
// An incomplete multibyte sequence at the very end of the input is
// dropped silently.
void
conversion_iconv(FILE *fp, const string &data, char *enc)
{
  iconv_t handle = iconv_open(UNICODE, enc);
  if (handle == (iconv_t)-1) {
    if (errno == EINVAL) {
      error("encoding system '%1' not supported by iconv()", enc);
      return;
    }
    fatal("iconv_open failed");
  }
  char inbuf[BUFSIZ];
  int outbuf[BUFSIZ];
  char *outptr = (char *)outbuf;
  size_t outbytes_left = sizeof outbuf;
  // Handle 'data'.
  char *inptr = (char *)data.contents();
  size_t inbytes_left = data.length();
  if (convert_iconv_input(handle, &inptr, &inbytes_left,
                          outbuf, sizeof outbuf, &outptr, &outbytes_left))
    // 'data' ends with partial input sequence; continue it from 'fp'.
    memcpy(inbuf, inptr, inbytes_left);
  // Handle 'fp' and switch to 'inbuf'.
  size_t read_bytes;
  char *read_start = inbuf + inbytes_left;
  while ((read_bytes = fread(read_start, 1, BUFSIZ - inbytes_left, fp)) > 0) {
    inptr = inbuf;
    inbytes_left += read_bytes;
    if (convert_iconv_input(handle, &inptr, &inbytes_left,
                            outbuf, sizeof outbuf, &outptr, &outbytes_left))
      // 'inbuf' ends with partial input sequence; move it to the front
      // so that the next read can complete it.
      memmove(inbuf, inptr, inbytes_left);
    read_start = inbuf + inbytes_left;
  }
  // Let the converter return to its initial state so that it can store
  // output it might still hold back.
  if (iconv(handle, NULL, NULL, &outptr, &outbytes_left) == (size_t)-1
      && errno == E2BIG) {
    emit_iconv_output(outbuf, outptr);
    outptr = (char *)outbuf;
    outbytes_left = sizeof outbuf;
    iconv(handle, NULL, NULL, &outptr, &outbytes_left);
  }
  iconv_close(handle);
  // XXX use ferror?
  emit_iconv_output(outbuf, outptr);
}
#endif /* HAVE_ICONV */

// ---------------------------------------------------------
// Handle Byte Order Mark.
//
// The encoding of the data isn't known before the BOM has
// been inspected, so the BOM must be checked manually, byte
// by byte.  As documented in the Unicode book it is very
// unlikely that any normal text file (regardless of the
// encoding) starts with the bytes which represent a BOM.
//
// This function reads (at most) four bytes from 'fp'.  The
// bytes forming a BOM are appended to string 'BOM'; all
// remaining bytes which have been read are appended to
// 'data', which thus starts with the byte after the BOM.
//
// Return encoding if a BOM is found, NULL otherwise.
// ---------------------------------------------------------
const char *
get_BOM(FILE *fp, string &BOM, string &data)
{
  // The BOM is U+FEFF.  We have thus the following possible
  // representations.
  //
  //   UTF-8: 0xEFBBBF
  //   UTF-16: 0xFEFF or 0xFFFE
  //   UTF-32: 0x0000FEFF or 0xFFFE0000
  //
  // The table is searched in order; the UTF-32 entries come first since
  // one of them starts with the bytes of a UTF-16 BOM.
  static const struct {
    int len;
    const char *str;
    const char *name;
  } signatures[] = {
    {4, "\x00\x00\xFE\xFF", "UTF-32"},
    {4, "\xFF\xFE\x00\x00", "UTF-32"},
    {3, "\xEF\xBB\xBF", "UTF-8"},
    {2, "\xFE\xFF", "UTF-16"},
    {2, "\xFF\xFE", "UTF-16"},
  };
  const int signature_count = sizeof (signatures) / sizeof (signatures[0]);
  char head[4];
  int head_len = 0;
  while (head_len < 4) {
    int c = getc(fp);
    if (c == EOF)
      break;
    head[head_len++] = char(c);
  }
  const char *encoding = NULL;
  int used = 0;
  for (int i = 0; i < signature_count; i++) {
    if (signatures[i].len > head_len)
      continue;
    if (memcmp(head, signatures[i].str, signatures[i].len) != 0)
      continue;
    while (used < signatures[i].len)
      BOM += head[used++];
    encoding = signatures[i].name;
    break;
  }
  // Whatever doesn't belong to the BOM is ordinary input data.
  while (used < head_len)
    data += head[used++];
  return encoding;
}

// ---------------------------------------------------------
// Get first two lines from input stream.
//
// On entry, 'data' holds the bytes which have already been
// read from 'fp'.  More bytes are read from 'fp' and
// appended to 'data' until 'data' contains two lines or the
// end of the input is reached.  CR, LF, and CRLF are
// accepted as line separators.
//
// Return NULL in case no coding tag can occur in the data
// (this is, if 'data' already holds more than one line on
// entry; 'data' is left unmodified then).  Otherwise return
// the result of 'data.extract()', which is documented here
// as a copy without zero bytes; the caller is responsible
// for releasing it (check_coding_tag() uses free()).
// ---------------------------------------------------------
char *
get_tag_lines(FILE *fp, string &data)
{
  int newline_count = 0;
  int c, prev = -1;
  // Handle CR, LF, and CRLF as line separators.
  for (int i = 0; i < data.length(); i++) {
    c = data[i];
    if (c == '\n' || c == '\r')
      newline_count++;
    if (c == '\n' && prev == '\r')
      newline_count--;
    prev = c;
  }
  if (newline_count > 1)
    return NULL;
  // 'data' might end with the CR of a CRLF pair whose LF is still in
  // the stream.  Fetch that LF now; otherwise it would be taken for a
  // line of its own below.
  if (prev == '\r') {
    c = getc(fp);
    if (c == '\n')
      data += char(c);
    else if (c != EOF)
      ungetc(c, fp);
  }
  bool emit_warning = true;
  for (int lines = newline_count; lines < 2; lines++) {
    while ((c = getc(fp)) != EOF) {
      if (c == '\0' && is_debugging && emit_warning) {
	warning("null byte(s) found in input stream:"
		" search for coding tag might return false result");
	emit_warning = false;
      }
      data += char(c);
      if (c == '\n' || c == '\r')
	break;
    }
    // Handle CR, LF, and CRLF as line separators.
    if (c == '\r') {
      c = getc(fp);
      // Only an LF belongs to the line separator.  Any other byte goes
      // back to the stream, and EOF must not be stored at all.
      if (c == '\n')
	data += char(c);
      else if (c != EOF)
	ungetc(c, fp);
    }
  }
  return data.extract();
}

// ---------------------------------------------------------
// Check whether C string starts with a comment.
//
// The following forms are recognized at the very beginning
// of the string:
//
//   .\"   .\#   '\"   '\#   \#
//
// Spaces and tabs are allowed after the leading '.' or "'".
//
// Return 1 if true, 0 otherwise (this includes a NULL
// pointer and the empty string).
// ---------------------------------------------------------
int
is_comment_line(char *s)
{
  if (s == NULL || *s == '\0')
    return 0;
  const char lead = *s++;
  // Without a control character, only '\#' starts a comment.
  if (lead == '\\')
    return (*s == '#') ? 1 : 0;
  if (lead != '.' && lead != '\'')
    return 0;
  while (*s == ' ' || *s == '\t')
    s++;
  if (*s != '\\')
    return 0;
  s++;
  return (*s == '"' || *s == '#') ? 1 : 0;
}

// ---------------------------------------------------------
// Get a variable/value pair from a local variables list
// in a C string which looks like this:
//
//   <variable1>: <value1>; <variable2>: <value2>; ...
//
// Leading and trailing blanks are ignored.  There might be
// more than one blank after ':' and ';'.
//
// '*variable' and '*value' are set to static buffers, which
// get overwritten by the next call.  Names and values that
// don't fit (MAX_VAR_LEN - 1 characters) are truncated.  If
// there is no ':' after the variable name, the value is the
// empty string.
//
// Return position of next variable/value pair or NULL if
// at end of data.
// ---------------------------------------------------------
char *
get_variable_value_pair(char *d1, char **variable, char **value)
{
  static char var[MAX_VAR_LEN], val[MAX_VAR_LEN];
  *variable = var;
  *value = val;
  for (; *d1 == ' ' || *d1 == '\t'; d1++)
    ;
  // Get variable.
  int n = 0;
  for (; n < MAX_VAR_LEN - 1 && *d1 && !strchr(";: \t", *d1); d1++)
    var[n++] = *d1;
  var[n] = 0;
  // Skip everything until ':', ';', or end of data.
  for (; *d1 && *d1 != ':' && *d1 != ';'; d1++)
    ;
  val[0] = 0;
  switch (*d1) {
  case 0:
    return NULL;
  case ';':
    return d1 + 1;
  default:
    // We are at the ':' which introduces the value.
    break;
  }
  d1++;
  for (; *d1 == ' ' || *d1 == '\t'; d1++)
    ;
  // Get value.
  n = 0;
  for (; n < MAX_VAR_LEN - 1 && *d1 && !strchr("; \t", *d1); d1++)
    val[n++] = *d1;
  val[n] = 0;
  // Skip everything until ';' or end of data.
  for (; *d1 && *d1 != ';'; d1++)
    ;
  return (*d1 == ';') ? d1 + 1 : NULL;
}

// ---------------------------------------------------------
// Check coding tag in the read buffer.
//
// We search for the following line:
//
//   <comment> ... -*-<local variables list>-*-
//
// ('...' might be anything).
//
// <comment> can be one of the following syntax forms at the
// beginning of the line:
//
//   .\"   .\#   '\"   '\#   \#
//
// There can be whitespace after the leading '.' or "'".
//
// The local variables list must occur within the first
// comment block at the very beginning of the data stream.
// Lines can be terminated with CR, LF, or CRLF; the last
// line needs no terminator.
//
// Within the <local variables list>, we search for
//
//   coding: <value>
//
// which specifies the coding system used for the data
// stream.
//
// Return <value> if found, NULL otherwise.  <value> is
// located in a static buffer of get_variable_value_pair()
// and thus valid until that function is called again.
//
// Note that null bytes in the data are skipped before applying
// the algorithm.  This should work even with files encoded as
// UTF-16 or UTF-32 (or its siblings) in most cases.
// ---------------------------------------------------------
char *
check_coding_tag(FILE *fp, string &data)
{
  // 'inbuf' is our own copy of the tag lines (or NULL), so we can
  // modify it freely while parsing.
  char *inbuf = get_tag_lines(fp, data);
  char *p = inbuf;
  while (is_comment_line(p)) {
    // Find the end of the current line.
    char *lineend = p;
    while (*lineend && *lineend != '\n' && *lineend != '\r')
      lineend++;
    const char terminator = *lineend;
    *lineend = 0;		// confine the search to this line
    char *d1 = strstr(p, "-*-");
    char *d2 = 0;
    if (d1)
      d2 = strstr(d1 + 3, "-*-");
    if (d1 && d2) {
      *d2 = 0;			// end of local variables list
      d1 += 3;
      while (d1) {
	char *variable, *value;
	d1 = get_variable_value_pair(d1, &variable, &value);
	if (!strcasecmp(variable, "coding")) {
	  free(inbuf);
	  return value;
	}
      }
    }
    if (!terminator)
      break;			// this was the last line
    p = lineend + 1;
    // A CRLF pair is a single line separator.
    if (terminator == '\r' && *p == '\n')
      p++;
  }
  free(inbuf);
  return NULL;
}

// ---------------------------------------------------------
// Guess the encoding of 'fp' with the uchardet library.
//
// The whole file is read for that, so 'fp' must be
// seekable; the position 'fp' has on entry is restored
// before returning.
//
// Return the name of the detected encoding in a buffer
// allocated with malloc(), or NULL if nothing could be
// detected (empty file, error, or groff built without
// uchardet support).
// ---------------------------------------------------------
char *
detect_file_encoding(FILE *fp)
{
#ifdef HAVE_UCHARDET
  uchardet_t ud = NULL;
  struct stat stat_buf;
  size_t len, read_bytes;
  char *data = NULL;
  int res;
  long current_position;
  const char *charset;
  char *ret = NULL;

  current_position = ftell(fp);
  if (current_position < 0) {
    // We couldn't get back to where we are now, so don't move at all.
    if (is_debugging)
      fprintf(stderr, "  ftell: %s\n", strerror(errno));
    return NULL;
  }
  /* Due to BOM and tag detection, we are not at the beginning of the
     file. */
  rewind(fp);
  if (fstat(fileno(fp), &stat_buf) != 0) {
    error("fstat: %1", strerror(errno));
    goto end;
  }
  len = stat_buf.st_size;
  if (is_debugging)
    fprintf(stderr, "  len: %lu\n", (unsigned long)len);
  if (len == 0)
    goto end;
  data = (char *)calloc(len, 1);
  if (!data) {
    error("calloc: %1", strerror(errno));
    goto end;
  }
  read_bytes = fread(data, 1, len, fp);
  if (read_bytes == 0) {
    error("fread: %1", strerror(errno));
    goto end;
  }
  ud = uchardet_new();
  if (!ud) {
    error("uchardet_new failed");
    goto end;
  }
  /* Pass only what has really been read; this might be less than the
     file size reported by fstat(). */
  res = uchardet_handle_data(ud, data, read_bytes);
  if (res != 0) {
    debug("  uchardet_handle_data: error %1\n", res);
    goto end;
  }
  if (is_debugging)
    fprintf(stderr, "  uchardet read: %lu bytes\n",
	    (unsigned long)read_bytes);
  uchardet_data_end(ud);
  charset = uchardet_get_charset(ud);
  if (is_debugging) {
    if (charset)
       fprintf(stderr, "  charset: %s\n", charset);
    else
       fprintf(stderr, "  charset is NULL\n");
  }
  /* uchardet 0.0.1 could return an empty string instead of NULL */
  if (charset && *charset) {
    /* 'charset' belongs to 'ud', so hand out a copy. */
    ret = (char *)malloc(strlen(charset) + 1);
    if (ret)
      strcpy(ret, charset);
    else
      error("malloc: %1", strerror(errno));
  }

end:
  /* We rewind back to the original position, regardless of whether the
     detection was successful; the caller continues reading from there. */
  if (fseek(fp, current_position, SEEK_SET) != 0)
    fatal("fseek: %1", strerror(errno));
  if (ud)
     uchardet_delete(ud);
  free(data);

  return ret;
#else /* not HAVE_UCHARDET */
  return NULL;
#endif /* not HAVE_UCHARDET */
}

// ---------------------------------------------------------
// Handle an input file.  If `filename` is "-", read the
// standard input stream.
//
// The encoding is chosen in this order: the user-specified
// encoding, the encoding implied by a BOM, the coding tag,
// the result of detect_file_encoding() (seekable streams
// only), and finally the fallback encoding.  The selected
// converter writes its result to the standard output stream.
//
// A file opened here is closed again on every return path
// after a successful open; the standard input stream is
// never closed.
//
// Return 1 on success, 0 otherwise.
// ---------------------------------------------------------
int
do_file(const char *filename)
{
  FILE *fp;
  string BOM, data;
  bool is_seekable = false;
  string reported_filename;

  // TODO: Consider moving some of this into a `quoted_file_name`
  // function in libgroff.
  if (strcmp(filename, "-") == 0) {
    fp = stdin;
    reported_filename = string("<standard input>");
  }
  else {
    fp = fopen(filename, FOPEN_RB);
    reported_filename = "'" + string(filename) + "'";
  }
  if (!fp) {
    error("can't open %1: %2", reported_filename.contents(),
	  strerror(errno));
    return 0;
  }
  if (is_debugging)
    fprintf(stderr, "processing %s\n", reported_filename.contents());
  if (fseek(fp, 0L, SEEK_SET) == 0)
    is_seekable = true;
  else {
    SET_BINARY(fileno(fp));
    if (is_debugging)
      fprintf(stderr, "  stream is not seekable: %s\n",
	      strerror(errno));
  }
  const char *BOM_encoding = get_BOM(fp, BOM, data);
  // Determine the encoding.
  char *encoding;
  // Set only when 'encoding' points to memory obtained from
  // 'detect_file_encoding', which must be released here.
  int must_free_encoding = 0;
  if (user_encoding[0]) {
    if (is_debugging) {
      fprintf(stderr, "  user-specified encoding '%s', "
		      "no search for coding tag\n",
		      user_encoding);
      if (BOM_encoding && strcmp(BOM_encoding, user_encoding))
	fprintf(stderr, "  but BOM in data stream implies encoding '%s'!\n",
			BOM_encoding);
    }
    encoding = (char *)user_encoding;
  }
  else if (BOM_encoding) {
    if (is_debugging)
      fprintf(stderr, "  found BOM, no search for coding tag\n");
    encoding = (char *)BOM_encoding;
  }
  else {
    // 'check_coding_tag' returns a pointer to a static array (or NULL).
    char *file_encoding = check_coding_tag(fp, data);
    if (!file_encoding) {
      if (is_debugging)
	fprintf(stderr, "  no coding tag\n");
      if (is_seekable)
         file_encoding = detect_file_encoding(fp);
      if (!file_encoding) {
        if (is_debugging)
          fprintf(stderr, "  could not detect encoding with uchardet\n");
        file_encoding = fallback_encoding;
      }
      else
        must_free_encoding = 1;
    }
    else
      if (is_debugging)
	fprintf(stderr, "  coding tag: '%s'\n", file_encoding);
    encoding = file_encoding;
  }
  // Keep a private, bounded copy of the name so that the detected
  // string can be released right away.
  strncpy(encoding_string, encoding, MAX_VAR_LEN - 1);
  encoding_string[MAX_VAR_LEN - 1] = 0;
  if (must_free_encoding)
    free(encoding);
  // Translate from MIME & Emacs encoding names to locale encoding names.
  encoding = emacs2mime(encoding_string);
  if (encoding[0] == '\0') {
    error("encoding '%1' not supported, not a portable encoding",
	  encoding_string);
    // Do not leak the stream opened above on this early exit.
    if (fp != stdin)
      fclose(fp);
    return 0;
  }
  if (is_debugging)
    fprintf(stderr, "  encoding used: '%s'\n", encoding);
  if (!raw_flag) {
    string fn(filename);
    fn += '\0';
    normalize_for_lf(fn);
    printf(".lf 1 %s\n", fn.contents());
  }
  int success = 1;
  // Call converter (converters write to stdout).  Note that the
  // UTF-8 converter is handed 'data' only, without the BOM bytes.
  if (!strcasecmp(encoding, "ISO-8859-1"))
    conversion_latin1(fp, BOM + data);
  else if (!strcasecmp(encoding, "UTF-8"))
    conversion_utf8(fp, data);
  else if (!strcasecmp(encoding, "cp1047"))
    conversion_cp1047(fp, BOM + data);
  else {
#if HAVE_ICONV
    conversion_iconv(fp, BOM + data, encoding);
#else
    error("encoding system '%1' not supported", encoding);
    success = 0;
#endif /* HAVE_ICONV */
  }
  if (fp != stdin)
    fclose(fp);
  return success;
}

// ---------------------------------------------------------
// Print usage.
//
// The synopsis lines always go to `stream`.  If `stream` is
// the standard output stream, a short description follows
// and the program exits successfully; otherwise the function
// returns to its caller.
// ---------------------------------------------------------
void
usage(FILE *stream)
{
  fprintf(stream,
	  "usage: %s [-dr] [-D fallback-encoding] [-e encoding]"
	  " [file ...]\n",
	  program_name);
  fprintf(stream, "usage: %s {-v | --version}\n", program_name);
  fprintf(stream, "usage: %s {-h | --help}\n", program_name);
  // Anything but the standard output stream gets the synopsis only.
  if (stream != stdout)
    return;
  fprintf(stream,
	  "\n"
	  "Read each file, convert its encoded characters to a form "
	  "GNU troff(1)\n"
	  "can interpret, and send the result to the standard output "
	  "stream.\n"
	  "The default fallback encoding is '%s'.  See the preconv(1) "
	  "manual page.\n",
	  fallback_encoding);
  exit(EXIT_SUCCESS);
}

// ---------------------------------------------------------
// Main routine.
//
// Determine the fallback encoding from the locale, parse the
// command-line options, and process each file operand with
// do_file() (or the standard input stream if there is none).
//
// Return 0 if every input was processed successfully, 1
// otherwise.
// ---------------------------------------------------------
int
main(int argc, char **argv)
{
  program_name = argv[0];
  // Determine the fallback encoding.  This must be done before
  // getopt() is called since the usage message shows the fallback
  // encoding.
  setlocale(LC_ALL, "");
  char *locale = getlocale(LC_CTYPE);
  if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX"))
    strcpy(fallback_encoding, "latin1");
  else {
    strncpy(fallback_encoding, locale_charset(), MAX_VAR_LEN - 1);
    fallback_encoding[MAX_VAR_LEN - 1] = 0;
  }

  int opt;
  static const struct option long_options[] = {
    { "help", no_argument, 0, 'h' },
    { "version", no_argument, 0, 'v' },
    { NULL, 0, 0, 0 }
  };
  // Parse the command-line options.
  while ((opt = getopt_long(argc, argv,
			    "dD:e:hrv", long_options, NULL)) != EOF)
    switch (opt) {
    case 'v':
      // Use the same preprocessor tests as the code that actually
      // enables iconv and uchardet, so that the report matches what
      // was compiled in.
      printf("GNU preconv (groff) version %s %s iconv support and %s uchardet support\n",
	     Version_string,
#if HAVE_ICONV
	     "with",
#else
	     "without",
#endif /* HAVE_ICONV */
#ifdef HAVE_UCHARDET
             "with"
#else
             "without"
#endif /* HAVE_UCHARDET */
	    );
      exit(0);
      break;
    case 'd':
      is_debugging = true;
      break;
    case 'e':
      if (optarg) {
	strncpy(user_encoding, optarg, MAX_VAR_LEN - 1);
	user_encoding[MAX_VAR_LEN - 1] = 0;
      }
      else
	user_encoding[0] = 0;
      break;
    case 'D':
      if (optarg) {
	strncpy(fallback_encoding, optarg, MAX_VAR_LEN - 1);
	fallback_encoding[MAX_VAR_LEN - 1] = 0;
      }
      break;
    case 'r':
      raw_flag = 1;
      break;
    case 'h':
      // 'usage' exits the program when given the standard output
      // stream.
      usage(stdout);
      break;
    case '?':
      usage(stderr);
      exit(1);
      break;
    default:
      assert(0);
    }
  // Number of inputs that could not be processed.
  int nbad = 0;
  if (is_debugging)
    fprintf(stderr, "fallback encoding: '%s'\n", fallback_encoding);
  if (optind >= argc)
    nbad += !do_file("-");
  else
    for (int i = optind; i < argc; i++)
      nbad += !do_file(argv[i]);
  if (ferror(stdout) || fflush(stdout) < 0)
    fatal("output error");
  return nbad != 0;
}

// Local Variables:
// fill-column: 72
// mode: C++
// End:
// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: