summaryrefslogtreecommitdiffstats
path: root/src/utils/indxbib
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:44:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:44:05 +0000
commitd318611dd6f23fcfedd50e9b9e24620b102ba96a (patch)
tree8b9eef82ca40fdd5a8deeabf07572074c236095d /src/utils/indxbib
parentInitial commit. (diff)
downloadgroff-upstream/1.23.0.tar.xz
groff-upstream/1.23.0.zip
Adding upstream version 1.23.0.upstream/1.23.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/utils/indxbib')
-rw-r--r--src/utils/indxbib/eign133
-rw-r--r--src/utils/indxbib/indxbib.1.man347
-rw-r--r--src/utils/indxbib/indxbib.am57
-rw-r--r--src/utils/indxbib/indxbib.cpp803
-rw-r--r--src/utils/indxbib/signal.c77
5 files changed, 1417 insertions, 0 deletions
diff --git a/src/utils/indxbib/eign b/src/utils/indxbib/eign
new file mode 100644
index 0000000..7718c8b
--- /dev/null
+++ b/src/utils/indxbib/eign
@@ -0,0 +1,133 @@
+a
+i
+the
+to
+of
+and
+in
+is
+it
+for
+that
+if
+you
+this
+be
+on
+with
+not
+have
+are
+or
+as
+from
+can
+but
+by
+at
+an
+will
+no
+all
+was
+do
+there
+my
+one
+so
+we
+they
+what
+would
+any
+which
+about
+get
+your
+use
+some
+me
+then
+name
+like
+out
+when
+up
+time
+other
+more
+only
+just
+end
+also
+know
+how
+new
+should
+been
+than
+them
+he
+who
+make
+may
+people
+these
+now
+their
+here
+into
+first
+could
+way
+had
+see
+work
+well
+were
+two
+very
+where
+while
+us
+because
+good
+same
+even
+much
+most
+many
+such
+long
+his
+over
+last
+since
+right
+before
+our
+without
+too
+those
+why
+must
+part
+being
+current
+back
+still
+go
+point
+value
+each
+did
+both
+true
+off
+say
+another
+state
+might
+under
+start
+try
diff --git a/src/utils/indxbib/indxbib.1.man b/src/utils/indxbib/indxbib.1.man
new file mode 100644
index 0000000..df02fcc
--- /dev/null
+++ b/src/utils/indxbib/indxbib.1.man
@@ -0,0 +1,347 @@
+.TH @g@indxbib @MAN1EXT@ "@MDATE@" "groff @VERSION@"
+.SH Name
+@g@indxbib \- make inverted index for bibliographic databases
+.
+.
+.\" ====================================================================
+.\" Legal Terms
+.\" ====================================================================
+.\"
+.\" Copyright (C) 1989-2020 Free Software Foundation, Inc.
+.\"
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of
+.\" this manual under the conditions for verbatim copying, provided that
+.\" the entire resulting derived work is distributed under the terms of
+.\" a permission notice identical to this one.
+.\"
+.\" Permission is granted to copy and distribute translations of this
+.\" manual into another language, under the above conditions for
+.\" modified versions, except that this permission notice may be
+.\" included in translations approved by the Free Software Foundation
+.\" instead of in the original English.
+.
+.
+.\" Save and disable compatibility mode (for, e.g., Solaris 10/11).
+.do nr *groff_indxbib_1_man_C \n[.cp]
+.cp 0
+.
+.\" Define fallback for groff 1.23's MR macro if the system lacks it.
+.nr do-fallback 0
+.if !\n(.f .nr do-fallback 1 \" mandoc
+.if \n(.g .if !d MR .nr do-fallback 1 \" older groff
+.if !\n(.g .nr do-fallback 1 \" non-groff *roff
+.if \n[do-fallback] \{\
+. de MR
+. ie \\n(.$=1 \
+. I \%\\$1
+. el \
+. IR \%\\$1 (\\$2)\\$3
+. .
+.\}
+.rr do-fallback
+.
+.
+.\" ====================================================================
+.SH Synopsis
+.\" ====================================================================
+.
+.SY @g@indxbib
+.RB [ \-w ]
+.RB [ \-c\~\c
+.IR \%common-words-file ]
+.RB [ \-d\~\c
+.IR dir ]
+.RB [ \-f\~\c
+.IR \%list-file ]
+.RB [ \-h\~\c
+.IR \%min-hash-table-size ]
+.RB [ \-i\~\c
+.IR \%excluded-fields ]
+.RB [ \-k\~\c
+.IR \%max-keys-per-record ]
+.RB [ \-l\~\c
+.IR \%min-key-length ]
+.RB [ \-n\~\c
+.IR \%threshold ]
+.RB [ \-o\~\c
+.IR file ]
+.RB [ \-t\~\c
+.IR \%max-key-length ]
+.RI [ file\~ .\|.\|.]
+.YS
+.
+.
+.SY @g@indxbib
+.B \-\-help
+.YS
+.
+.
+.SY @g@indxbib
+.B \-v
+.
+.SY @g@indxbib
+.B \-\-version
+.YS
+.
+.
+.\" ====================================================================
+.SH Description
+.\" ====================================================================
+.
+.I @g@indxbib
+makes an inverted index for the bibliographic databases in each
+.I file
+for use with
+.MR @g@refer @MAN1EXT@ ,
+.MR @g@lookbib @MAN1EXT@ ,
+and
+.MR lkbib @MAN1EXT@ .
+.
+Each created index is named
+.RI file @INDEX_SUFFIX@ ;
+writing is done to a temporary file which is then renamed to this.
+.
+If no
+.I file
+operands are given on the command line because the
+.B \-f
+option has been used,
+and no
+.B \-o
+option is given,
+the index will be named
+.IR \%@DEFAULT_INDEX_NAME@@INDEX_SUFFIX@ .
+.
+.
+.LP
+Bibliographic databases are divided into records by blank lines.
+.
+Within a record,
+each field starts with a
+.B %
+character at the beginning of a line.
+.
+Fields have a one letter name that follows the
+.B %
+character.
+.
+.
+.LP
+The values set by the
+.BR \-c ,
+.BR \-l ,
+.BR \-n ,
+and
+.B \-t
+options are stored in the index:
+when the index is searched,
+keys will be discarded and truncated in a
+manner appropriate to these options;
+the original keys will be used for verifying that any record
+found using the index actually contains the keys.
+.
+This means that a user of an index need not know whether these
+options were used in the creation of the index,
+provided that not all the keys to be searched for
+would have been discarded during indexing
+and that the user supplies at least the part of each key
+that would have remained after being truncated during indexing.
+.
+The value set by the
+.B \-i
+option is also stored in the index
+and will be used in verifying records found using the index.
+.
+.
+.\" ====================================================================
+.SH Options
+.\" ====================================================================
+.
+.B \-\-help
+displays a usage message,
+while
+.B \-v
+and
+.B \-\-version
+show version information;
+all exit afterward.
+.
+.
+.TP
+.BI \-c\~ common-words-file
+Read the list of common words from
+.I common-words-file
+instead of
+.IR \%@COMMON_WORDS_FILE@ .
+.
+.
+.TP
+.BI \-d\~ dir
+Use
+.I dir
+as the name of the directory to store in the index,
+instead of that returned by
+.MR getcwd 2 .
+.
+Typically,
+.I dir
+will be a symbolic link whose target is the current working directory.
+.
+.
+.TP
+.BI \-f\~ list-file
+Read the files to be indexed from
+.IR list-file .
+.
+If
+.I list-file
+is
+.BR \- ,
+files will be read from the standard input stream.
+.
+The
+.B \-f
+option can be given at most once.
+.
+.
+.TP
+.BI \-h\~ min-hash-table-size
+Use the first prime number greater than or equal to
+the argument for the size of the hash table.
+.
+Larger values
+will usually make searching faster,
+but will make the index file larger
+and cause
+.I @g@indxbib
+to use more memory.
+.
+The default hash table size is 997.
+.
+.
+.TP
+.BI \-i\~ excluded-fields
+Don't index the contents of fields whose names are in
+.IR excluded-fields .
+.
+Field names are one character each.
+.
+If this option is not present,
+.I @g@indxbib
+excludes fields
+.BR X ,
+.BR Y ,
+and
+.BR Z .
+.
+.
+.TP
+.BI \-k\~ max-keys-per-record
+Use no more keys per input record than specified in the argument.
+.
+If this option is not present,
+the maximum is 100.
+.
+.
+.TP
+.BI \-l\~ min-key-length
+Discard any key whose length in characters is shorter than the value of
+the argument.
+.
+If this option is not present,
+the minimum key length
+is 3.
+.
+.
+.TP
+.BI \-n\~ threshold
+Discard the
+.I threshold
+most common words from the common words file.
+.
+If this option is not present,
+the 100 most common words are discarded.
+.
+.
+.TP
+.BI \-o\~ basename
+Name the index
+.RI basename @INDEX_SUFFIX@ .
+.
+.
+.TP
+.BI \-t\~ max-key-length
+Truncate keys to
+.I max-key-length
+in characters.
+.
+If this option is not present,
+keys are truncated to 6 characters.
+.
+.
+.TP
+.B \-w
+Index whole files.
+.
+Each file is a separate record.
+.
+.
+.\" ====================================================================
+.SH Files
+.\" ====================================================================
+.
+.TP
+.RI \%file @INDEX_SUFFIX@
+index for
+.I file
+.
+.
+.TP
+.I \%@DEFAULT_INDEX_NAME@@INDEX_SUFFIX@
+default index name
+.
+.
+.TP
+.I \%@COMMON_WORDS_FILE@
+contains the list of common words.
+.
+The traditional name,
+.RI \[lq] eign \[rq],
+is an abbreviation of \[lq]English ignored [word list]\[rq].
+.
+.
+.TP
+.IR \%indxbib XXXXXX
+temporary file
+.
+.
+.\" ====================================================================
+.SH "See also"
+.\" ====================================================================
+.
+\[lq]Some Applications of Inverted Indexes on the Unix System\[rq],
+by M.\& E.\& Lesk,
+1978,
+AT&T Bell Laboratories Computing Science Technical Report No.\& 69.
+.
+.
+.LP
+.MR @g@refer @MAN1EXT@ ,
+.MR lkbib @MAN1EXT@ ,
+.MR @g@lookbib @MAN1EXT@
+.
+.
+.\" Restore compatibility mode (for, e.g., Solaris 10/11).
+.cp \n[*groff_indxbib_1_man_C]
+.do rr *groff_indxbib_1_man_C
+.
+.
+.\" Local Variables:
+.\" fill-column: 72
+.\" mode: nroff
+.\" End:
+.\" vim: set filetype=groff textwidth=72:
diff --git a/src/utils/indxbib/indxbib.am b/src/utils/indxbib/indxbib.am
new file mode 100644
index 0000000..d2a7d5a
--- /dev/null
+++ b/src/utils/indxbib/indxbib.am
@@ -0,0 +1,57 @@
+# Copyright (C) 2014-2020 Free Software Foundation, Inc.
+#
+# This file is part of groff.
+#
+# groff is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# groff is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+indxbib_srcdir = $(top_srcdir)/src/utils/indxbib
+prefixexecbin_PROGRAMS += indxbib
+indxbib_SOURCES = \
+ src/utils/indxbib/indxbib.cpp \
+ src/utils/indxbib/signal.c
+src/utils/indxbib/indxbib.$(OBJEXT): defs.h
+indxbib_LDADD = libbib.a libgroff.a $(LIBM) lib/libgnu.a
+PREFIXMAN1 += src/utils/indxbib/indxbib.1
+EXTRA_DIST += \
+ src/utils/indxbib/indxbib.1.man \
+ src/utils/indxbib/eign
+
+install-data-local: install_indxbib
+install_indxbib: $(indxbib_srcdir)/eign
+ -test -d $(DESTDIR)$(datadir) \
+ || $(mkinstalldirs) $(DESTDIR)$(datadir)
+ -test -d $(DESTDIR)$(dataprogramdir) \
+ || $(mkinstalldirs) $(DESTDIR)$(dataprogramdir)
+ -test -d $(DESTDIR)$(datasubdir) \
+ || $(mkinstalldirs) $(DESTDIR)$(datasubdir)
+ if test -f /usr/lib/eign; then \
+ rm -f $(DESTDIR)$(common_words_file); \
+ ln -s /usr/lib/eign $(DESTDIR)$(common_words_file) 2>/dev/null \
+ || ln /usr/lib/eign $(DESTDIR)$(common_words_file) 2>/dev/null \
+ || cp /usr/lib/eign $(DESTDIR)$(common_words_file); \
+ else \
+ rm -f $(DESTDIR)$(common_words_file); \
+ $(INSTALL_DATA) $(indxbib_srcdir)/eign $(DESTDIR)$(common_words_file); \
+ fi
+
+uninstall-local: uninstall_indxbib
+uninstall_indxbib:
+ rm -f $(DESTDIR)$(common_words_file)
+
+
+# Local Variables:
+# fill-column: 72
+# mode: makefile-automake
+# End:
+# vim: set autoindent filetype=automake textwidth=72:
diff --git a/src/utils/indxbib/indxbib.cpp b/src/utils/indxbib/indxbib.cpp
new file mode 100644
index 0000000..ad8bb0e
--- /dev/null
+++ b/src/utils/indxbib/indxbib.cpp
@@ -0,0 +1,803 @@
+/* Copyright (C) 1989-2020 Free Software Foundation, Inc.
+ Written by James Clark (jjc@jclark.com)
+
+This file is part of groff.
+
+groff is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+groff is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "lib.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "posix.h"
+#include "errarg.h"
+#include "error.h"
+#include "stringclass.h"
+#include "cset.h"
+#include "cmap.h"
+
+#include "defs.h"
+#include "index.h"
+
+#include "nonposix.h"
+
+extern "C" const char *Version_string;
+
+#define DEFAULT_HASH_TABLE_SIZE 997
+#define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
+
+// (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
+
+#define MALLOC_OVERHEAD 16
+
+#ifdef BLOCK_SIZE
+#undef BLOCK_SIZE
+#endif
+
+const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
+ - sizeof(int)) / sizeof(int));
+struct block {
+ block *next;
+ int used;
+ int v[BLOCK_SIZE];
+
+ block(block *p = 0) : next(p), used(0) { }
+};
+
+struct block;
+
+union table_entry {
+ block *ptr;
+ int count;
+};
+
+struct word_list {
+ word_list *next;
+ char *str;
+ int len;
+ word_list(const char *, int, word_list *);
+};
+
+table_entry *hash_table;
+int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
+// We make this the same size as hash_table so we only have to do one
+// mod per key.
+static word_list **common_words_table = 0;
+char *key_buffer;
+
+FILE *indxfp;
+int ntags = 0;
+string filenames;
+char *temp_index_file = 0;
+
+const char *ignore_fields = "XYZ";
+const char *common_words_file = COMMON_WORDS_FILE;
+int n_ignore_words = 100;
+int truncate_len = 6;
+int shortest_len = 3;
+int max_keys_per_item = 100;
+
+static void usage(FILE *stream);
+static void write_hash_table();
+static void init_hash_table();
+static void read_common_words_file();
+static int store_key(char *s, int len);
+static void possibly_store_key(char *s, int len);
+static int do_whole_file(const char *filename);
+static int do_file(const char *filename);
+static void store_reference(int filename_index, int pos, int len);
+static void check_integer_arg(char opt, const char *arg, int min, int *res);
+static void store_filename(const char *);
+static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
+static char *get_cwd();
+
+extern "C" {
+ void cleanup();
+ void catch_fatal_signals();
+ void ignore_fatal_signals();
+}
+
+int main(int argc, char **argv)
+{
+ program_name = argv[0];
+ static char stderr_buf[BUFSIZ];
+ setbuf(stderr, stderr_buf);
+
+ const char *base_name = 0;
+ typedef int (*parser_t)(const char *);
+ parser_t parser = do_file;
+ const char *directory = 0;
+ const char *foption = 0;
+ int opt;
+ static const struct option long_options[] = {
+ { "help", no_argument, 0, CHAR_MAX + 1 },
+ { "version", no_argument, 0, 'v' },
+ { NULL, 0, 0, 0 }
+ };
+ while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
+ long_options, NULL))
+ != EOF)
+ switch (opt) {
+ case 'c':
+ common_words_file = optarg;
+ break;
+ case 'd':
+ directory = optarg;
+ break;
+ case 'f':
+ foption = optarg;
+ break;
+ case 'h':
+ {
+ int requested_hash_table_size;
+ check_integer_arg('h', optarg, 1, &requested_hash_table_size);
+ hash_table_size = requested_hash_table_size;
+ if ((hash_table_size > 2) && (hash_table_size % 2) == 0)
+ hash_table_size++;
+ while (!is_prime(hash_table_size))
+ hash_table_size += 2;
+ if (hash_table_size != requested_hash_table_size)
+ warning("requested hash table size %1 is not prime: using %2"
+ " instead", optarg, hash_table_size);
+ }
+ break;
+ case 'i':
+ ignore_fields = optarg;
+ break;
+ case 'k':
+ check_integer_arg('k', optarg, 1, &max_keys_per_item);
+ break;
+ case 'l':
+ check_integer_arg('l', optarg, 0, &shortest_len);
+ break;
+ case 'n':
+ check_integer_arg('n', optarg, 0, &n_ignore_words);
+ break;
+ case 'o':
+ base_name = optarg;
+ break;
+ case 't':
+ check_integer_arg('t', optarg, 1, &truncate_len);
+ break;
+ case 'w':
+ parser = do_whole_file;
+ break;
+ case 'v':
+ printf("GNU indxbib (groff) version %s\n", Version_string);
+ exit(0);
+ break;
+ case CHAR_MAX + 1: // --help
+ usage(stdout);
+ exit(0);
+ break;
+ case '?':
+ usage(stderr);
+ exit(1);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ if (optind >= argc && foption == 0)
+ fatal("no files and no -f option");
+ if (!directory) {
+ char *path = get_cwd();
+ store_filename(path);
+ delete[] path;
+ }
+ else
+ store_filename(directory);
+ init_hash_table();
+ store_filename(common_words_file);
+ store_filename(ignore_fields);
+ key_buffer = new char[truncate_len];
+ read_common_words_file();
+ if (!base_name)
+ base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
+ const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
+ const char *sep = &DIR_SEPS[1];
+ while (*sep) {
+ p1 = strrchr(base_name, *sep);
+ if (p1 && (!p || p1 > p))
+ p = p1;
+ sep++;
+ }
+ size_t name_max;
+ if (p) {
+ char *dir = strsave(base_name);
+ dir[p - base_name] = '\0';
+ name_max = file_name_max(dir);
+ delete[] dir;
+ }
+ else
+ name_max = file_name_max(".");
+ const char *filename = p ? p + 1 : base_name;
+ if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
+ fatal("'%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
+ if (p) {
+ p++;
+ temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
+ memcpy(temp_index_file, base_name, p - base_name);
+ strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
+ }
+ else {
+ temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
+ }
+ catch_fatal_signals();
+ int fd = mkstemp(temp_index_file);
+ if (fd < 0)
+ fatal("can't create temporary index file: %1", strerror(errno));
+ indxfp = fdopen(fd, FOPEN_WB);
+ if (indxfp == 0)
+ fatal("fdopen failed");
+ if (fseek(indxfp, sizeof(index_header), 0) < 0)
+ fatal("can't seek past index header: %1", strerror(errno));
+ int failed = 0;
+ if (foption) {
+ FILE *fp = stdin;
+ if (strcmp(foption, "-") != 0) {
+ errno = 0;
+ fp = fopen(foption, "r");
+ if (!fp)
+ fatal("can't open '%1': %2", foption, strerror(errno));
+ }
+ string path;
+ int lineno = 1;
+ for (;;) {
+ int c;
+ for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
+ if (c == '\0')
+ error_with_file_and_line(foption, lineno,
+ "nul character in pathname ignored");
+ else
+ path += c;
+ }
+ if (path.length() > 0) {
+ path += '\0';
+ if (!(*parser)(path.contents()))
+ failed = 1;
+ path.clear();
+ }
+ if (c == EOF)
+ break;
+ lineno++;
+ }
+ if (fp != stdin)
+ fclose(fp);
+ }
+ for (int i = optind; i < argc; i++)
+ if (!(*parser)(argv[i]))
+ failed = 1;
+ write_hash_table();
+ if (fclose(indxfp) < 0)
+ fatal("error closing temporary index file: %1", strerror(errno));
+ char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
+ strcpy(index_file, base_name);
+ strcat(index_file, INDEX_SUFFIX);
+#ifdef HAVE_RENAME
+#ifdef __EMX__
+ if (access(index_file, R_OK) == 0)
+ unlink(index_file);
+#endif /* __EMX__ */
+ if (rename(temp_index_file, index_file) < 0) {
+#ifdef __MSDOS__
+ // RENAME could fail on plain MS-DOS filesystems because
+ // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
+ char *fname = p ? index_file + (p - base_name) : 0;
+ char *dot = 0;
+
+ // Replace the dot with an underscore and try again.
+ if (fname
+ && (dot = strchr(fname, '.')) != 0
+ && strcmp(dot, INDEX_SUFFIX) != 0)
+ *dot = '_';
+ if (rename(temp_index_file, index_file) < 0)
+#endif
+ fatal("can't rename temporary index file: %1", strerror(errno));
+ }
+#else /* not HAVE_RENAME */
+ ignore_fatal_signals();
+ if (unlink(index_file) < 0) {
+ if (errno != ENOENT)
+ fatal("can't unlink '%1': %2", index_file, strerror(errno));
+ }
+ if (link(temp_index_file, index_file) < 0)
+ fatal("can't link temporary index file: %1", strerror(errno));
+ if (unlink(temp_index_file) < 0)
+ fatal("can't unlink temporary index file: %1", strerror(errno));
+#endif /* not HAVE_RENAME */
+ temp_index_file = 0;
+ return failed;
+}
+
+static void usage(FILE *stream)
+{
+ fprintf(stream,
+"usage: %s [-w] [-c common-words-file] [-d dir] [-f list-file]"
+" [-h min-hash-table-size] [-i excluded-fields]"
+" [-k max-keys-per-record] [-l min-key-length]"
+" [-n threshold] [-o file] [-t max-key-length] [file ...]\n"
+"usage: %s {-v | --version}\n"
+"usage: %s --help\n",
+ program_name, program_name, program_name);
+}
+
+static void check_integer_arg(char opt, const char *arg, int min, int *res)
+{
+ char *ptr;
+ long n = strtol(arg, &ptr, 10);
+ if (n == 0 && ptr == arg)
+ error("argument to -%1 not an integer", opt);
+ else if (n < min)
+ error("argument to -%1 must not be less than %2", opt, min);
+ else {
+ if (n > INT_MAX)
+ error("argument to -%1 greater than maximum integer", opt);
+ else if (*ptr != '\0')
+ error("junk after integer argument to -%1", opt);
+ *res = int(n);
+ }
+}
+
+static char *get_cwd()
+{
+ char *buf;
+ int size = 12;
+
+ for (;;) {
+ buf = new char[size];
+ if (getcwd(buf, size))
+ break;
+ if (errno != ERANGE)
+ fatal("cannot get current working directory: %1", strerror(errno));
+ delete[] buf;
+ if (size == INT_MAX)
+ fatal("current working directory longer than INT_MAX");
+ if (size > INT_MAX/2)
+ size = INT_MAX;
+ else
+ size *= 2;
+ }
+ return buf;
+}
+
+word_list::word_list(const char *s, int n, word_list *p)
+: next(p), len(n)
+{
+ str = new char[n];
+ memcpy(str, s, n);
+}
+
+static void read_common_words_file()
+{
+ if (n_ignore_words <= 0)
+ return;
+ errno = 0;
+ FILE *fp = fopen(common_words_file, "r");
+ if (!fp)
+ fatal("can't open '%1': %2", common_words_file, strerror(errno));
+ common_words_table = new word_list * [hash_table_size];
+ for (int i = 0; i < hash_table_size; i++)
+ common_words_table[i] = 0;
+ int count = 0;
+ int key_len = 0;
+ for (;;) {
+ int c = getc(fp);
+ while (c != EOF && !csalnum(c))
+ c = getc(fp);
+ if (c == EOF)
+ break;
+ do {
+ if (key_len < truncate_len)
+ key_buffer[key_len++] = cmlower(c);
+ c = getc(fp);
+ } while (c != EOF && csalnum(c));
+ if (key_len >= shortest_len) {
+ int h = hash(key_buffer, key_len) % hash_table_size;
+ common_words_table[h] = new word_list(key_buffer, key_len,
+ common_words_table[h]);
+ }
+ if (++count >= n_ignore_words)
+ break;
+ key_len = 0;
+ if (c == EOF)
+ break;
+ }
+ n_ignore_words = count;
+ fclose(fp);
+}
+
+static int do_whole_file(const char *filename)
+{
+ errno = 0;
+ FILE *fp = fopen(filename, "r");
+ if (!fp) {
+ error("can't open '%1': %2", filename, strerror(errno));
+ return 0;
+ }
+ int count = 0;
+ int key_len = 0;
+ int c;
+ while ((c = getc(fp)) != EOF) {
+ if (csalnum(c)) {
+ key_len = 1;
+ key_buffer[0] = c;
+ while ((c = getc(fp)) != EOF) {
+ if (!csalnum(c))
+ break;
+ if (key_len < truncate_len)
+ key_buffer[key_len++] = c;
+ }
+ if (store_key(key_buffer, key_len)) {
+ if (++count >= max_keys_per_item)
+ break;
+ }
+ if (c == EOF)
+ break;
+ }
+ }
+ store_reference(filenames.length(), 0, 0);
+ store_filename(filename);
+ fclose(fp);
+ return 1;
+}
+
+static int do_file(const char *filename)
+{
+ errno = 0;
+ // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
+ // byte counts to be consistent with fseek.
+ FILE *fp = fopen(filename, FOPEN_RB);
+ if (fp == 0) {
+ error("can't open '%1': %2", filename, strerror(errno));
+ return 0;
+ }
+ int filename_index = filenames.length();
+ store_filename(filename);
+
+ enum {
+ START, // at the start of the file; also in between references
+ BOL, // in the middle of a reference, at the beginning of the line
+ PERCENT, // seen a percent at the beginning of the line
+ IGNORE, // ignoring a field
+ IGNORE_BOL, // at the beginning of a line ignoring a field
+ KEY, // in the middle of a key
+ DISCARD, // after truncate_len bytes of a key
+ MIDDLE // in between keys
+ } state = START;
+
+ // In states START, BOL, IGNORE_BOL, space_count how many spaces at
+ // the beginning have been seen. In states PERCENT, IGNORE, KEY,
+ // MIDDLE space_count must be 0.
+ int space_count = 0;
+ int byte_count = 0; // bytes read
+ int key_len = 0;
+ int ref_start = -1; // position of start of current reference
+ for (;;) {
+ int c = getc(fp);
+ if (c == EOF)
+ break;
+ // We opened the file in binary mode, so we need to skip
+ // every CR character before a Newline.
+ if (c == '\r') {
+ int peek = getc(fp);
+ if (peek == '\n') {
+ byte_count++;
+ c = peek;
+ }
+ else
+ ungetc(peek, fp);
+ }
+#if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
+ else if (c == 0x1a) // ^Z means EOF in text files
+ break;
+#endif
+ byte_count++;
+ switch (state) {
+ case START:
+ if (c == ' ' || c == '\t') {
+ space_count++;
+ break;
+ }
+ if (c == '\n') {
+ space_count = 0;
+ break;
+ }
+ ref_start = byte_count - space_count - 1;
+ space_count = 0;
+ if (c == '%')
+ state = PERCENT;
+ else if (csalnum(c)) {
+ state = KEY;
+ key_buffer[0] = c;
+ key_len = 1;
+ }
+ else
+ state = MIDDLE;
+ break;
+ case BOL:
+ switch (c) {
+ case '%':
+ if (space_count > 0) {
+ space_count = 0;
+ state = MIDDLE;
+ }
+ else
+ state = PERCENT;
+ break;
+ case ' ':
+ case '\t':
+ space_count++;
+ break;
+ case '\n':
+ store_reference(filename_index, ref_start,
+ byte_count - 1 - space_count - ref_start);
+ state = START;
+ space_count = 0;
+ break;
+ default:
+ space_count = 0;
+ if (csalnum(c)) {
+ state = KEY;
+ key_buffer[0] = c;
+ key_len = 1;
+ }
+ else
+ state = MIDDLE;
+ }
+ break;
+ case PERCENT:
+ if (strchr(ignore_fields, c) != 0)
+ state = IGNORE;
+ else if (c == '\n')
+ state = BOL;
+ else
+ state = MIDDLE;
+ break;
+ case IGNORE:
+ if (c == '\n')
+ state = IGNORE_BOL;
+ break;
+ case IGNORE_BOL:
+ switch (c) {
+ case '%':
+ if (space_count > 0) {
+ state = IGNORE;
+ space_count = 0;
+ }
+ else
+ state = PERCENT;
+ break;
+ case ' ':
+ case '\t':
+ space_count++;
+ break;
+ case '\n':
+ store_reference(filename_index, ref_start,
+ byte_count - 1 - space_count - ref_start);
+ state = START;
+ space_count = 0;
+ break;
+ default:
+ space_count = 0;
+ state = IGNORE;
+ }
+ break;
+ case KEY:
+ if (csalnum(c)) {
+ if (key_len < truncate_len)
+ key_buffer[key_len++] = c;
+ else
+ state = DISCARD;
+ }
+ else {
+ possibly_store_key(key_buffer, key_len);
+ key_len = 0;
+ if (c == '\n')
+ state = BOL;
+ else
+ state = MIDDLE;
+ }
+ break;
+ case DISCARD:
+ if (!csalnum(c)) {
+ possibly_store_key(key_buffer, key_len);
+ key_len = 0;
+ if (c == '\n')
+ state = BOL;
+ else
+ state = MIDDLE;
+ }
+ break;
+ case MIDDLE:
+ if (csalnum(c)) {
+ state = KEY;
+ key_buffer[0] = c;
+ key_len = 1;
+ }
+ else if (c == '\n')
+ state = BOL;
+ break;
+ default:
+ assert(0);
+ }
+ }
+ switch (state) {
+ case START:
+ break;
+ case DISCARD:
+ case KEY:
+ possibly_store_key(key_buffer, key_len);
+ // fall through
+ case BOL:
+ case PERCENT:
+ case IGNORE_BOL:
+ case IGNORE:
+ case MIDDLE:
+ store_reference(filename_index, ref_start,
+ byte_count - ref_start - space_count);
+ break;
+ default:
+ assert(0);
+ }
+ fclose(fp);
+ return 1;
+}
+
+static void store_reference(int filename_index, int pos, int len)
+{
+ tag t;
+ t.filename_index = filename_index;
+ t.start = pos;
+ t.length = len;
+ fwrite_or_die(&t, sizeof(t), 1, indxfp);
+ ntags++;
+}
+
+static void store_filename(const char *fn)
+{
+ filenames += fn;
+ filenames += '\0';
+}
+
+static void init_hash_table()
+{
+ hash_table = new table_entry[hash_table_size];
+ for (int i = 0; i < hash_table_size; i++)
+ hash_table[i].ptr = 0;
+}
+
+static void possibly_store_key(char *s, int len)
+{
+ static int last_tagno = -1;
+ static int key_count;
+ if (last_tagno != ntags) {
+ last_tagno = ntags;
+ key_count = 0;
+ }
+ if (key_count < max_keys_per_item) {
+ if (store_key(s, len))
+ key_count++;
+ }
+}
+
+static int store_key(char *s, int len)
+{
+ if (len < shortest_len)
+ return 0;
+ int is_number = 1;
+ for (int i = 0; i < len; i++)
+ if (!csdigit(s[i])) {
+ is_number = 0;
+ s[i] = cmlower(s[i]);
+ }
+ if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
+ return 0;
+ int h = hash(s, len) % hash_table_size;
+ if (common_words_table) {
+ for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
+ if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
+ return 0;
+ }
+ table_entry *pp = hash_table + h;
+ if (!pp->ptr)
+ pp->ptr = new block;
+ else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
+ return 1;
+ else if (pp->ptr->used >= BLOCK_SIZE)
+ pp->ptr = new block(pp->ptr);
+ pp->ptr->v[(pp->ptr->used)++] = ntags;
+ return 1;
+}
+
+static void write_hash_table()
+{
+ const int minus_one = -1;
+ int li = 0;
+ for (int i = 0; i < hash_table_size; i++) {
+ block *ptr = hash_table[i].ptr;
+ if (!ptr)
+ hash_table[i].count = -1;
+ else {
+ hash_table[i].count = li;
+ block *rev = 0;
+ while (ptr) {
+ block *tem = ptr;
+ ptr = ptr->next;
+ tem->next = rev;
+ rev = tem;
+ }
+ while (rev) {
+ fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
+ li += rev->used;
+ block *tem = rev;
+ rev = rev->next;
+ delete tem;
+ }
+ fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
+ li += 1;
+ }
+ }
+ if (sizeof(table_entry) == sizeof(int))
+ fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
+ else {
+ // write it out word by word
+ for (int i = 0; i < hash_table_size; i++)
+ fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
+ }
+ fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
+ if (fseek(indxfp, 0, 0) < 0)
+ fatal("error seeking on index file: %1", strerror(errno));
+ index_header h;
+ h.magic = INDEX_MAGIC;
+ h.version = INDEX_VERSION;
+ h.tags_size = ntags;
+ h.lists_size = li;
+ h.table_size = hash_table_size;
+ h.strings_size = filenames.length();
+ h.truncate = truncate_len;
+ h.shortest = shortest_len;
+ h.common = n_ignore_words;
+ fwrite_or_die(&h, sizeof(h), 1, indxfp);
+}
+
+static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
+{
+ if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
+ fatal("fwrite failed: %1", strerror(errno));
+}
+
+void fatal_error_exit()
+{
+ cleanup();
+ exit(3);
+}
+
+extern "C" {
+
+void cleanup()
+{
+ if (temp_index_file)
+ unlink(temp_index_file);
+}
+
+}
+
+// Local Variables:
+// fill-column: 72
+// mode: C++
+// End:
+// vim: set cindent noexpandtab shiftwidth=2 textwidth=72:
diff --git a/src/utils/indxbib/signal.c b/src/utils/indxbib/signal.c
new file mode 100644
index 0000000..2231b64
--- /dev/null
+++ b/src/utils/indxbib/signal.c
@@ -0,0 +1,77 @@
+/* Copyright (C) 1992-2020 Free Software Foundation, Inc.
+ Written by James Clark (jjc@jclark.com)
+
+This file is part of groff.
+
+groff is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+groff is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* Unfortunately vendors seem to have problems writing a <signal.h>
+that is correct for C++, so we implement all signal handling in C. */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <sys/types.h>
+#include <signal.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Prototype */
+void catch_fatal_signals(void);
+
+extern void cleanup(void);
+
+static RETSIGTYPE handle_fatal_signal(int signum)
+{
+ signal(signum, SIG_DFL);
+ cleanup();
+#ifdef HAVE_KILL
+ kill(getpid(), signum);
+#else
+ /* MS-DOS and Win32 don't have kill(); the best compromise is
+ probably to use exit() instead. */
+ exit(signum);
+#endif
+}
+
+void catch_fatal_signals(void)
+{
+#ifdef SIGHUP
+ signal(SIGHUP, handle_fatal_signal);
+#endif
+ signal(SIGINT, handle_fatal_signal);
+ signal(SIGTERM, handle_fatal_signal);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifndef HAVE_RENAME
+
+void ignore_fatal_signals()
+{
+#ifdef SIGHUP
+ signal(SIGHUP, SIG_IGN);
+#endif
+ signal(SIGINT, SIG_IGN);
+ signal(SIGTERM, SIG_IGN);
+}
+
+#endif /* not HAVE_RENAME */