diff options
Diffstat (limited to 'src/utils/indxbib')
-rw-r--r-- | src/utils/indxbib/eign | 133 | ||||
-rw-r--r-- | src/utils/indxbib/indxbib.1.man | 347 | ||||
-rw-r--r-- | src/utils/indxbib/indxbib.am | 57 | ||||
-rw-r--r-- | src/utils/indxbib/indxbib.cpp | 803 | ||||
-rw-r--r-- | src/utils/indxbib/signal.c | 77 |
5 files changed, 1417 insertions, 0 deletions
diff --git a/src/utils/indxbib/eign b/src/utils/indxbib/eign new file mode 100644 index 0000000..7718c8b --- /dev/null +++ b/src/utils/indxbib/eign @@ -0,0 +1,133 @@ +a +i +the +to +of +and +in +is +it +for +that +if +you +this +be +on +with +not +have +are +or +as +from +can +but +by +at +an +will +no +all +was +do +there +my +one +so +we +they +what +would +any +which +about +get +your +use +some +me +then +name +like +out +when +up +time +other +more +only +just +end +also +know +how +new +should +been +than +them +he +who +make +may +people +these +now +their +here +into +first +could +way +had +see +work +well +were +two +very +where +while +us +because +good +same +even +much +most +many +such +long +his +over +last +since +right +before +our +without +too +those +why +must +part +being +current +back +still +go +point +value +each +did +both +true +off +say +another +state +might +under +start +try diff --git a/src/utils/indxbib/indxbib.1.man b/src/utils/indxbib/indxbib.1.man new file mode 100644 index 0000000..df02fcc --- /dev/null +++ b/src/utils/indxbib/indxbib.1.man @@ -0,0 +1,347 @@ +.TH @g@indxbib @MAN1EXT@ "@MDATE@" "groff @VERSION@" +.SH Name +@g@indxbib \- make inverted index for bibliographic databases +. +. +.\" ==================================================================== +.\" Legal Terms +.\" ==================================================================== +.\" +.\" Copyright (C) 1989-2020 Free Software Foundation, Inc. +.\" +.\" Permission is granted to make and distribute verbatim copies of this +.\" manual provided the copyright notice and this permission notice are +.\" preserved on all copies. +.\" +.\" Permission is granted to copy and distribute modified versions of +.\" this manual under the conditions for verbatim copying, provided that +.\" the entire resulting derived work is distributed under the terms of +.\" a permission notice identical to this one. 
+.\" +.\" Permission is granted to copy and distribute translations of this +.\" manual into another language, under the above conditions for +.\" modified versions, except that this permission notice may be +.\" included in translations approved by the Free Software Foundation +.\" instead of in the original English. +. +. +.\" Save and disable compatibility mode (for, e.g., Solaris 10/11). +.do nr *groff_indxbib_1_man_C \n[.cp] +.cp 0 +. +.\" Define fallback for groff 1.23's MR macro if the system lacks it. +.nr do-fallback 0 +.if !\n(.f .nr do-fallback 1 \" mandoc +.if \n(.g .if !d MR .nr do-fallback 1 \" older groff +.if !\n(.g .nr do-fallback 1 \" non-groff *roff +.if \n[do-fallback] \{\ +. de MR +. ie \\n(.$=1 \ +. I \%\\$1 +. el \ +. IR \%\\$1 (\\$2)\\$3 +. . +.\} +.rr do-fallback +. +. +.\" ==================================================================== +.SH Synopsis +.\" ==================================================================== +. +.SY @g@indxbib +.RB [ \-w ] +.RB [ \-c\~\c +.IR \%common-words-file ] +.RB [ \-d\~\c +.IR dir ] +.RB [ \-f\~\c +.IR \%list-file ] +.RB [ \-h\~\c +.IR \%min-hash-table-size ] +.RB [ \-i\~\c +.IR \%excluded-fields ] +.RB [ \-k\~\c +.IR \%max-keys-per-record ] +.RB [ \-l\~\c +.IR \%min-key-length ] +.RB [ \-n\~\c +.IR \%threshold ] +.RB [ \-o\~\c +.IR file ] +.RB [ \-t\~\c +.IR \%max-key-length ] +.RI [ file\~ .\|.\|.] +.YS +. +. +.SY @g@indxbib +.B \-\-help +.YS +. +. +.SY @g@indxbib +.B \-v +. +.SY @g@indxbib +.B \-\-version +.YS +. +. +.\" ==================================================================== +.SH Description +.\" ==================================================================== +. +.I @g@indxbib +makes an inverted index for the bibliographic databases in each +.I file +for use with +.MR @g@refer @MAN1EXT@ , +.MR @g@lookbib @MAN1EXT@ , +and +.MR lkbib @MAN1EXT@ . +. +Each created index is named +.RI file @INDEX_SUFFIX@ ; +writing is done to a temporary file which is then renamed to this. +. 
+If no +.I file +operands are given on the command line because the +.B \-f +option has been used, +and no +.B \-o +option is given, +the index will be named +.IR \%@DEFAULT_INDEX_NAME@@INDEX_SUFFIX@ . +. +. +.LP +Bibliographic databases are divided into records by blank lines. +. +Within a record, +each field starts with a +.B % +character at the beginning of a line. +. +Fields have a one letter name that follows the +.B % +character. +. +. +.LP +The values set by the +.BR \-c , +.BR \-l , +.BR \-n , +and +.B \-t +options are stored in the index: +when the index is searched, +keys will be discarded and truncated in a +manner appropriate to these options; +the original keys will be used for verifying that any record +found using the index actually contains the keys. +. +This means that a user of an index need not know whether these +options were used in the creation of the index, +provided that not all the keys to be searched for +would have been discarded during indexing +and that the user supplies at least the part of each key +that would have remained after being truncated during indexing. +. +The value set by the +.B \-i +option is also stored in the index +and will be used in verifying records found using the index. +. +. +.\" ==================================================================== +.SH Options +.\" ==================================================================== +. +.B \-\-help +displays a usage message, +while +.B \-v +and +.B \-\-version +show version information; +all exit afterward. +. +. +.TP +.BI \-c\~ common-words-file +Read the list of common words from +.I common-words-file +instead of +.IR \%@COMMON_WORDS_FILE@ . +. +. +.TP +.BI \-d\~ dir +Use +.I dir +as the name of the directory to store in the index, +instead of that returned by +.MR getcwd 2 . +. +Typically, +.I dir +will be a symbolic link whose target is the current working directory. +. +. +.TP +.BI \-f\~ list-file +Read the files to be indexed from +.IR list-file . +. 
+If +.I list-file +is +.BR \- , +files will be read from the standard input stream. +. +The +.B \-f +option can be given at most once. +. +. +.TP +.BI \-h\~ min-hash-table-size +Use the first prime number greater than or equal to +the argument for the size of the hash table. +. +Larger values +will usually make searching faster, +but will make the index file larger +and cause +.I @g@indxbib +to use more memory. +. +The default hash table size is 997. +. +. +.TP +.BI \-i\~ excluded-fields +Don't index the contents of fields whose names are in +.IR excluded-fields . +. +Field names are one character each. +. +If this option is not present, +.I @g@indxbib +excludes fields +.BR X , +.BR Y , +and +.BR Z . +. +. +.TP +.BI \-k\~ max-keys-per-record +Use no more keys per input record than specified in the argument. +. +If this option is not present, +the maximum is 100. +. +. +.TP +.BI \-l\~ min-key-length +Discard any key whose length in characters is shorter than the value of +the argument. +. +If this option is not present, +the minimum key length +is 3. +. +. +.TP +.BI \-n\~ threshold +Discard the +.I threshold +most common words from the common words file. +. +If this option is not present, +the 100 most common words are discarded. +. +. +.TP +.BI \-o\~ file +Name the index +.RI file @INDEX_SUFFIX@ . +. +. +.TP +.BI \-t\~ max-key-length +Truncate keys to +.I max-key-length +in characters. +. +If this option is not present, +keys are truncated to 6 characters. +. +. +.TP +.B \-w +Index whole files. +. +Each file is a separate record. +. +. +.\" ==================================================================== +.SH Files +.\" ==================================================================== +. +.TP +.RI \%file @INDEX_SUFFIX@ +index for +.I file +. +. +.TP +.I \%@DEFAULT_INDEX_NAME@@INDEX_SUFFIX@ +default index name +. +. +.TP +.I \%@COMMON_WORDS_FILE@ +contains the list of common words. +. 
+The traditional name, +.RI \[lq] eign \[rq], +is an abbreviation of \[lq]English ignored [word list]\[rq]. +. +. +.TP +.IR \%indxbib XXXXXX +temporary file +. +. +.\" ==================================================================== +.SH "See also" +.\" ==================================================================== +. +\[lq]Some Applications of Inverted Indexes on the Unix System\[rq], +by M.\& E.\& Lesk, +1978, +AT&T Bell Laboratories Computing Science Technical Report No.\& 69. +. +. +.LP +.MR @g@refer @MAN1EXT@ , +.MR lkbib @MAN1EXT@ , +.MR @g@lookbib @MAN1EXT@ +. +. +.\" Restore compatibility mode (for, e.g., Solaris 10/11). +.cp \n[*groff_indxbib_1_man_C] +.do rr *groff_indxbib_1_man_C +. +. +.\" Local Variables: +.\" fill-column: 72 +.\" mode: nroff +.\" End: +.\" vim: set filetype=groff textwidth=72: diff --git a/src/utils/indxbib/indxbib.am b/src/utils/indxbib/indxbib.am new file mode 100644 index 0000000..d2a7d5a --- /dev/null +++ b/src/utils/indxbib/indxbib.am @@ -0,0 +1,57 @@ +# Copyright (C) 2014-2020 Free Software Foundation, Inc. +# +# This file is part of groff. +# +# groff is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# groff is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +indxbib_srcdir = $(top_srcdir)/src/utils/indxbib +prefixexecbin_PROGRAMS += indxbib +indxbib_SOURCES = \ + src/utils/indxbib/indxbib.cpp \ + src/utils/indxbib/signal.c +src/utils/indxbib/indxbib.$(OBJEXT): defs.h +indxbib_LDADD = libbib.a libgroff.a $(LIBM) lib/libgnu.a +PREFIXMAN1 += src/utils/indxbib/indxbib.1 +EXTRA_DIST += \ + src/utils/indxbib/indxbib.1.man \ + src/utils/indxbib/eign + +install-data-local: install_indxbib +install_indxbib: $(indxbib_srcdir)/eign + -test -d $(DESTDIR)$(datadir) \ + || $(mkinstalldirs) $(DESTDIR)$(datadir) + -test -d $(DESTDIR)$(dataprogramdir) \ + || $(mkinstalldirs) $(DESTDIR)$(dataprogramdir) + -test -d $(DESTDIR)$(datasubdir) \ + || $(mkinstalldirs) $(DESTDIR)$(datasubdir) + if test -f /usr/lib/eign; then \ + rm -f $(DESTDIR)$(common_words_file); \ + ln -s /usr/lib/eign $(DESTDIR)$(common_words_file) 2>/dev/null \ + || ln /usr/lib/eign $(DESTDIR)$(common_words_file) 2>/dev/null \ + || cp /usr/lib/eign $(DESTDIR)$(common_words_file); \ + else \ + rm -f $(DESTDIR)$(common_words_file); \ + $(INSTALL_DATA) $(indxbib_srcdir)/eign $(DESTDIR)$(common_words_file); \ + fi + +uninstall-local: uninstall_indxbib +uninstall_indxbib: + rm -f $(DESTDIR)$(common_words_file) + + +# Local Variables: +# fill-column: 72 +# mode: makefile-automake +# End: +# vim: set autoindent filetype=automake textwidth=72: diff --git a/src/utils/indxbib/indxbib.cpp b/src/utils/indxbib/indxbib.cpp new file mode 100644 index 0000000..ad8bb0e --- /dev/null +++ b/src/utils/indxbib/indxbib.cpp @@ -0,0 +1,803 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+ +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include "lib.h" + +#include <assert.h> +#include <errno.h> +#include <stdlib.h> + +#include "posix.h" +#include "errarg.h" +#include "error.h" +#include "stringclass.h" +#include "cset.h" +#include "cmap.h" + +#include "defs.h" +#include "index.h" + +#include "nonposix.h" + +extern "C" const char *Version_string; + +#define DEFAULT_HASH_TABLE_SIZE 997 +#define TEMP_INDEX_TEMPLATE "indxbibXXXXXX" + +// (2^n - MALLOC_OVERHEAD) should be a good argument for malloc(). + +#define MALLOC_OVERHEAD 16 + +#ifdef BLOCK_SIZE +#undef BLOCK_SIZE +#endif + +const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *) + - sizeof(int)) / sizeof(int)); +struct block { + block *next; + int used; + int v[BLOCK_SIZE]; + + block(block *p = 0) : next(p), used(0) { } +}; + +struct block; + +union table_entry { + block *ptr; + int count; +}; + +struct word_list { + word_list *next; + char *str; + int len; + word_list(const char *, int, word_list *); +}; + +table_entry *hash_table; +int hash_table_size = DEFAULT_HASH_TABLE_SIZE; +// We make this the same size as hash_table so we only have to do one +// mod per key. 
+static word_list **common_words_table = 0; +char *key_buffer; + +FILE *indxfp; +int ntags = 0; +string filenames; +char *temp_index_file = 0; + +const char *ignore_fields = "XYZ"; +const char *common_words_file = COMMON_WORDS_FILE; +int n_ignore_words = 100; +int truncate_len = 6; +int shortest_len = 3; +int max_keys_per_item = 100; + +static void usage(FILE *stream); +static void write_hash_table(); +static void init_hash_table(); +static void read_common_words_file(); +static int store_key(char *s, int len); +static void possibly_store_key(char *s, int len); +static int do_whole_file(const char *filename); +static int do_file(const char *filename); +static void store_reference(int filename_index, int pos, int len); +static void check_integer_arg(char opt, const char *arg, int min, int *res); +static void store_filename(const char *); +static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp); +static char *get_cwd(); + +extern "C" { + void cleanup(); + void catch_fatal_signals(); + void ignore_fatal_signals(); +} + +int main(int argc, char **argv) +{ + program_name = argv[0]; + static char stderr_buf[BUFSIZ]; + setbuf(stderr, stderr_buf); + + const char *base_name = 0; + typedef int (*parser_t)(const char *); + parser_t parser = do_file; + const char *directory = 0; + const char *foption = 0; + int opt; + static const struct option long_options[] = { + { "help", no_argument, 0, CHAR_MAX + 1 }, + { "version", no_argument, 0, 'v' }, + { NULL, 0, 0, 0 } + }; + while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw", + long_options, NULL)) + != EOF) + switch (opt) { + case 'c': + common_words_file = optarg; + break; + case 'd': + directory = optarg; + break; + case 'f': + foption = optarg; + break; + case 'h': + { + int requested_hash_table_size; + check_integer_arg('h', optarg, 1, &requested_hash_table_size); + hash_table_size = requested_hash_table_size; + if ((hash_table_size > 2) && (hash_table_size % 2) == 0) + hash_table_size++; + 
while (!is_prime(hash_table_size)) + hash_table_size += 2; + if (hash_table_size != requested_hash_table_size) + warning("requested hash table size %1 is not prime: using %2" + " instead", optarg, hash_table_size); + } + break; + case 'i': + ignore_fields = optarg; + break; + case 'k': + check_integer_arg('k', optarg, 1, &max_keys_per_item); + break; + case 'l': + check_integer_arg('l', optarg, 0, &shortest_len); + break; + case 'n': + check_integer_arg('n', optarg, 0, &n_ignore_words); + break; + case 'o': + base_name = optarg; + break; + case 't': + check_integer_arg('t', optarg, 1, &truncate_len); + break; + case 'w': + parser = do_whole_file; + break; + case 'v': + printf("GNU indxbib (groff) version %s\n", Version_string); + exit(0); + break; + case CHAR_MAX + 1: // --help + usage(stdout); + exit(0); + break; + case '?': + usage(stderr); + exit(1); + break; + default: + assert(0); + break; + } + if (optind >= argc && foption == 0) + fatal("no files and no -f option"); + if (!directory) { + char *path = get_cwd(); + store_filename(path); + delete[] path; + } + else + store_filename(directory); + init_hash_table(); + store_filename(common_words_file); + store_filename(ignore_fields); + key_buffer = new char[truncate_len]; + read_common_words_file(); + if (!base_name) + base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME; + const char *p = strrchr(base_name, DIR_SEPS[0]), *p1; + const char *sep = &DIR_SEPS[1]; + while (*sep) { + p1 = strrchr(base_name, *sep); + if (p1 && (!p || p1 > p)) + p = p1; + sep++; + } + size_t name_max; + if (p) { + char *dir = strsave(base_name); + dir[p - base_name] = '\0'; + name_max = file_name_max(dir); + delete[] dir; + } + else + name_max = file_name_max("."); + const char *filename = p ? 
p + 1 : base_name; + if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max) + fatal("'%1.%2' is too long for a filename", filename, INDEX_SUFFIX); + if (p) { + p++; + temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)]; + memcpy(temp_index_file, base_name, p - base_name); + strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE); + } + else { + temp_index_file = strsave(TEMP_INDEX_TEMPLATE); + } + catch_fatal_signals(); + int fd = mkstemp(temp_index_file); + if (fd < 0) + fatal("can't create temporary index file: %1", strerror(errno)); + indxfp = fdopen(fd, FOPEN_WB); + if (indxfp == 0) + fatal("fdopen failed"); + if (fseek(indxfp, sizeof(index_header), 0) < 0) + fatal("can't seek past index header: %1", strerror(errno)); + int failed = 0; + if (foption) { + FILE *fp = stdin; + if (strcmp(foption, "-") != 0) { + errno = 0; + fp = fopen(foption, "r"); + if (!fp) + fatal("can't open '%1': %2", foption, strerror(errno)); + } + string path; + int lineno = 1; + for (;;) { + int c; + for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) { + if (c == '\0') + error_with_file_and_line(foption, lineno, + "nul character in pathname ignored"); + else + path += c; + } + if (path.length() > 0) { + path += '\0'; + if (!(*parser)(path.contents())) + failed = 1; + path.clear(); + } + if (c == EOF) + break; + lineno++; + } + if (fp != stdin) + fclose(fp); + } + for (int i = optind; i < argc; i++) + if (!(*parser)(argv[i])) + failed = 1; + write_hash_table(); + if (fclose(indxfp) < 0) + fatal("error closing temporary index file: %1", strerror(errno)); + char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)]; + strcpy(index_file, base_name); + strcat(index_file, INDEX_SUFFIX); +#ifdef HAVE_RENAME +#ifdef __EMX__ + if (access(index_file, R_OK) == 0) + unlink(index_file); +#endif /* __EMX__ */ + if (rename(temp_index_file, index_file) < 0) { +#ifdef __MSDOS__ + // RENAME could fail on plain MS-DOS filesystems because + // INDEX_FILE is an 
invalid filename, e.g. it has multiple dots. + char *fname = p ? index_file + (p - base_name) : 0; + char *dot = 0; + + // Replace the dot with an underscore and try again. + if (fname + && (dot = strchr(fname, '.')) != 0 + && strcmp(dot, INDEX_SUFFIX) != 0) + *dot = '_'; + if (rename(temp_index_file, index_file) < 0) +#endif + fatal("can't rename temporary index file: %1", strerror(errno)); + } +#else /* not HAVE_RENAME */ + ignore_fatal_signals(); + if (unlink(index_file) < 0) { + if (errno != ENOENT) + fatal("can't unlink '%1': %2", index_file, strerror(errno)); + } + if (link(temp_index_file, index_file) < 0) + fatal("can't link temporary index file: %1", strerror(errno)); + if (unlink(temp_index_file) < 0) + fatal("can't unlink temporary index file: %1", strerror(errno)); +#endif /* not HAVE_RENAME */ + temp_index_file = 0; + return failed; +} + +static void usage(FILE *stream) +{ + fprintf(stream, +"usage: %s [-w] [-c common-words-file] [-d dir] [-f list-file]" +" [-h min-hash-table-size] [-i excluded-fields]" +" [-k max-keys-per-record] [-l min-key-length]" +" [-n threshold] [-o file] [-t max-key-length] [file ...]\n" +"usage: %s {-v | --version}\n" +"usage: %s --help\n", + program_name, program_name, program_name); +} + +static void check_integer_arg(char opt, const char *arg, int min, int *res) +{ + char *ptr; + long n = strtol(arg, &ptr, 10); + if (n == 0 && ptr == arg) + error("argument to -%1 not an integer", opt); + else if (n < min) + error("argument to -%1 must not be less than %2", opt, min); + else { + if (n > INT_MAX) + error("argument to -%1 greater than maximum integer", opt); + else if (*ptr != '\0') + error("junk after integer argument to -%1", opt); + *res = int(n); + } +} + +static char *get_cwd() +{ + char *buf; + int size = 12; + + for (;;) { + buf = new char[size]; + if (getcwd(buf, size)) + break; + if (errno != ERANGE) + fatal("cannot get current working directory: %1", strerror(errno)); + delete[] buf; + if (size == INT_MAX) + 
fatal("current working directory longer than INT_MAX"); + if (size > INT_MAX/2) + size = INT_MAX; + else + size *= 2; + } + return buf; +} + +word_list::word_list(const char *s, int n, word_list *p) +: next(p), len(n) +{ + str = new char[n]; + memcpy(str, s, n); +} + +static void read_common_words_file() +{ + if (n_ignore_words <= 0) + return; + errno = 0; + FILE *fp = fopen(common_words_file, "r"); + if (!fp) + fatal("can't open '%1': %2", common_words_file, strerror(errno)); + common_words_table = new word_list * [hash_table_size]; + for (int i = 0; i < hash_table_size; i++) + common_words_table[i] = 0; + int count = 0; + int key_len = 0; + for (;;) { + int c = getc(fp); + while (c != EOF && !csalnum(c)) + c = getc(fp); + if (c == EOF) + break; + do { + if (key_len < truncate_len) + key_buffer[key_len++] = cmlower(c); + c = getc(fp); + } while (c != EOF && csalnum(c)); + if (key_len >= shortest_len) { + int h = hash(key_buffer, key_len) % hash_table_size; + common_words_table[h] = new word_list(key_buffer, key_len, + common_words_table[h]); + } + if (++count >= n_ignore_words) + break; + key_len = 0; + if (c == EOF) + break; + } + n_ignore_words = count; + fclose(fp); +} + +static int do_whole_file(const char *filename) +{ + errno = 0; + FILE *fp = fopen(filename, "r"); + if (!fp) { + error("can't open '%1': %2", filename, strerror(errno)); + return 0; + } + int count = 0; + int key_len = 0; + int c; + while ((c = getc(fp)) != EOF) { + if (csalnum(c)) { + key_len = 1; + key_buffer[0] = c; + while ((c = getc(fp)) != EOF) { + if (!csalnum(c)) + break; + if (key_len < truncate_len) + key_buffer[key_len++] = c; + } + if (store_key(key_buffer, key_len)) { + if (++count >= max_keys_per_item) + break; + } + if (c == EOF) + break; + } + } + store_reference(filenames.length(), 0, 0); + store_filename(filename); + fclose(fp); + return 1; +} + +static int do_file(const char *filename) +{ + errno = 0; + // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on + // 
byte counts to be consistent with fseek. + FILE *fp = fopen(filename, FOPEN_RB); + if (fp == 0) { + error("can't open '%1': %2", filename, strerror(errno)); + return 0; + } + int filename_index = filenames.length(); + store_filename(filename); + + enum { + START, // at the start of the file; also in between references + BOL, // in the middle of a reference, at the beginning of the line + PERCENT, // seen a percent at the beginning of the line + IGNORE, // ignoring a field + IGNORE_BOL, // at the beginning of a line ignoring a field + KEY, // in the middle of a key + DISCARD, // after truncate_len bytes of a key + MIDDLE // in between keys + } state = START; + + // In states START, BOL, IGNORE_BOL, space_count how many spaces at + // the beginning have been seen. In states PERCENT, IGNORE, KEY, + // MIDDLE space_count must be 0. + int space_count = 0; + int byte_count = 0; // bytes read + int key_len = 0; + int ref_start = -1; // position of start of current reference + for (;;) { + int c = getc(fp); + if (c == EOF) + break; + // We opened the file in binary mode, so we need to skip + // every CR character before a Newline. 
+ if (c == '\r') { + int peek = getc(fp); + if (peek == '\n') { + byte_count++; + c = peek; + } + else + ungetc(peek, fp); + } +#if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__) + else if (c == 0x1a) // ^Z means EOF in text files + break; +#endif + byte_count++; + switch (state) { + case START: + if (c == ' ' || c == '\t') { + space_count++; + break; + } + if (c == '\n') { + space_count = 0; + break; + } + ref_start = byte_count - space_count - 1; + space_count = 0; + if (c == '%') + state = PERCENT; + else if (csalnum(c)) { + state = KEY; + key_buffer[0] = c; + key_len = 1; + } + else + state = MIDDLE; + break; + case BOL: + switch (c) { + case '%': + if (space_count > 0) { + space_count = 0; + state = MIDDLE; + } + else + state = PERCENT; + break; + case ' ': + case '\t': + space_count++; + break; + case '\n': + store_reference(filename_index, ref_start, + byte_count - 1 - space_count - ref_start); + state = START; + space_count = 0; + break; + default: + space_count = 0; + if (csalnum(c)) { + state = KEY; + key_buffer[0] = c; + key_len = 1; + } + else + state = MIDDLE; + } + break; + case PERCENT: + if (strchr(ignore_fields, c) != 0) + state = IGNORE; + else if (c == '\n') + state = BOL; + else + state = MIDDLE; + break; + case IGNORE: + if (c == '\n') + state = IGNORE_BOL; + break; + case IGNORE_BOL: + switch (c) { + case '%': + if (space_count > 0) { + state = IGNORE; + space_count = 0; + } + else + state = PERCENT; + break; + case ' ': + case '\t': + space_count++; + break; + case '\n': + store_reference(filename_index, ref_start, + byte_count - 1 - space_count - ref_start); + state = START; + space_count = 0; + break; + default: + space_count = 0; + state = IGNORE; + } + break; + case KEY: + if (csalnum(c)) { + if (key_len < truncate_len) + key_buffer[key_len++] = c; + else + state = DISCARD; + } + else { + possibly_store_key(key_buffer, key_len); + key_len = 0; + if (c == '\n') + state = BOL; + else + state = MIDDLE; + } + break; + case 
DISCARD: + if (!csalnum(c)) { + possibly_store_key(key_buffer, key_len); + key_len = 0; + if (c == '\n') + state = BOL; + else + state = MIDDLE; + } + break; + case MIDDLE: + if (csalnum(c)) { + state = KEY; + key_buffer[0] = c; + key_len = 1; + } + else if (c == '\n') + state = BOL; + break; + default: + assert(0); + } + } + switch (state) { + case START: + break; + case DISCARD: + case KEY: + possibly_store_key(key_buffer, key_len); + // fall through + case BOL: + case PERCENT: + case IGNORE_BOL: + case IGNORE: + case MIDDLE: + store_reference(filename_index, ref_start, + byte_count - ref_start - space_count); + break; + default: + assert(0); + } + fclose(fp); + return 1; +} + +static void store_reference(int filename_index, int pos, int len) +{ + tag t; + t.filename_index = filename_index; + t.start = pos; + t.length = len; + fwrite_or_die(&t, sizeof(t), 1, indxfp); + ntags++; +} + +static void store_filename(const char *fn) +{ + filenames += fn; + filenames += '\0'; +} + +static void init_hash_table() +{ + hash_table = new table_entry[hash_table_size]; + for (int i = 0; i < hash_table_size; i++) + hash_table[i].ptr = 0; +} + +static void possibly_store_key(char *s, int len) +{ + static int last_tagno = -1; + static int key_count; + if (last_tagno != ntags) { + last_tagno = ntags; + key_count = 0; + } + if (key_count < max_keys_per_item) { + if (store_key(s, len)) + key_count++; + } +} + +static int store_key(char *s, int len) +{ + if (len < shortest_len) + return 0; + int is_number = 1; + for (int i = 0; i < len; i++) + if (!csdigit(s[i])) { + is_number = 0; + s[i] = cmlower(s[i]); + } + if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9')) + return 0; + int h = hash(s, len) % hash_table_size; + if (common_words_table) { + for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next) + if (len == ptr->len && memcmp(s, ptr->str, len) == 0) + return 0; + } + table_entry *pp = hash_table + h; + if (!pp->ptr) + pp->ptr = new block; + else if 
(pp->ptr->v[pp->ptr->used - 1] == ntags) + return 1; + else if (pp->ptr->used >= BLOCK_SIZE) + pp->ptr = new block(pp->ptr); + pp->ptr->v[(pp->ptr->used)++] = ntags; + return 1; +} + +static void write_hash_table() +{ + const int minus_one = -1; + int li = 0; + for (int i = 0; i < hash_table_size; i++) { + block *ptr = hash_table[i].ptr; + if (!ptr) + hash_table[i].count = -1; + else { + hash_table[i].count = li; + block *rev = 0; + while (ptr) { + block *tem = ptr; + ptr = ptr->next; + tem->next = rev; + rev = tem; + } + while (rev) { + fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp); + li += rev->used; + block *tem = rev; + rev = rev->next; + delete tem; + } + fwrite_or_die(&minus_one, sizeof(int), 1, indxfp); + li += 1; + } + } + if (sizeof(table_entry) == sizeof(int)) + fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp); + else { + // write it out word by word + for (int i = 0; i < hash_table_size; i++) + fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp); + } + fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp); + if (fseek(indxfp, 0, 0) < 0) + fatal("error seeking on index file: %1", strerror(errno)); + index_header h; + h.magic = INDEX_MAGIC; + h.version = INDEX_VERSION; + h.tags_size = ntags; + h.lists_size = li; + h.table_size = hash_table_size; + h.strings_size = filenames.length(); + h.truncate = truncate_len; + h.shortest = shortest_len; + h.common = n_ignore_words; + fwrite_or_die(&h, sizeof(h), 1, indxfp); +} + +static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp) +{ + if (fwrite(ptr, size, nitems, fp) != (size_t)nitems) + fatal("fwrite failed: %1", strerror(errno)); +} + +void fatal_error_exit() +{ + cleanup(); + exit(3); +} + +extern "C" { + +void cleanup() +{ + if (temp_index_file) + unlink(temp_index_file); +} + +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: diff --git a/src/utils/indxbib/signal.c 
b/src/utils/indxbib/signal.c new file mode 100644 index 0000000..2231b64 --- /dev/null +++ b/src/utils/indxbib/signal.c @@ -0,0 +1,77 @@ +/* Copyright (C) 1992-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Unfortunately vendors seem to have problems writing a <signal.h> +that is correct for C++, so we implement all signal handling in C. */ + +#include <config.h> + +#include <stdlib.h> +#include <sys/types.h> +#include <signal.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Prototype */ +void catch_fatal_signals(void); + +extern void cleanup(void); + +static RETSIGTYPE handle_fatal_signal(int signum) +{ + signal(signum, SIG_DFL); + cleanup(); +#ifdef HAVE_KILL + kill(getpid(), signum); +#else + /* MS-DOS and Win32 don't have kill(); the best compromise is + probably to use exit() instead. */ + exit(signum); +#endif +} + +void catch_fatal_signals(void) +{ +#ifdef SIGHUP + signal(SIGHUP, handle_fatal_signal); +#endif + signal(SIGINT, handle_fatal_signal); + signal(SIGTERM, handle_fatal_signal); +} + +#ifdef __cplusplus +} +#endif + +#ifndef HAVE_RENAME + +void ignore_fatal_signals() +{ +#ifdef SIGHUP + signal(SIGHUP, SIG_IGN); +#endif + signal(SIGINT, SIG_IGN); + signal(SIGTERM, SIG_IGN); +} + +#endif /* not HAVE_RENAME */ |