summaryrefslogtreecommitdiffstats
path: root/src/libs/libbib/index.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/libs/libbib/index.cpp')
-rw-r--r--src/libs/libbib/index.cpp688
1 files changed, 688 insertions, 0 deletions
diff --git a/src/libs/libbib/index.cpp b/src/libs/libbib/index.cpp
new file mode 100644
index 0000000..aebcbbf
--- /dev/null
+++ b/src/libs/libbib/index.cpp
@@ -0,0 +1,688 @@
+/* Copyright (C) 1989-2020 Free Software Foundation, Inc.
+ Written by James Clark (jjc@jclark.com)
+
+This file is part of groff.
+
+groff is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+groff is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "lib.h"
+
+#include <stdlib.h>
+#include <errno.h>
+
+#include "posix.h"
+#include "cset.h"
+#include "cmap.h"
+#include "errarg.h"
+#include "error.h"
+
+#include "refid.h"
+#include "search.h"
+#include "index.h"
+#include "defs.h"
+
+#include "nonposix.h"
+
+// Interface to mmap.
+extern "C" {
+ void *mapread(int fd, int len);
+ int unmap(void *, int len);
+}
+
+#if 0
+const
+#endif
+int minus_one = -1;
+
+bool do_verify = false;
+
+struct word_list;
+
+class index_search_item : public search_item {
+ search_item *out_of_date_files;
+ index_header header;
+ char *buffer;
+ void *map_addr;
+ int map_len;
+ tag *tags;
+ int *table;
+ int *lists;
+ char *pool;
+ char *key_buffer;
+ char *filename_buffer;
+ int filename_buflen;
+ char **common_words_table;
+ int common_words_table_size;
+ const char *ignore_fields;
+ time_t mtime;
+
+ const char *get_invalidity_reason();
+ const int *search1(const char **pp, const char *end);
+ const int *search(const char *ptr, int length, int **temp_listp);
+ const char *munge_filename(const char *);
+ void read_common_words_file();
+ void add_out_of_date_file(int fd, const char *filename, int fid);
+public:
+ index_search_item(const char *, int);
+ ~index_search_item();
+ const char *check_header(index_header *, unsigned);
+ bool load(int fd);
+ search_item_iterator *make_search_item_iterator(const char *);
+ bool is_valid();
+ void check_files();
+ int next_filename_id() const;
+ friend class index_search_item_iterator;
+};
+
+class index_search_item_iterator : public search_item_iterator {
+ index_search_item *indx;
+ search_item_iterator *out_of_date_files_iter;
+ search_item *next_out_of_date_file;
+ const int *found_list;
+ int *temp_list;
+ char *buf;
+ int buflen;
+ linear_searcher searcher;
+ char *query;
+ int get_tag(int tagno, const linear_searcher &, const char **, int *,
+ reference_id *);
+public:
+ index_search_item_iterator(index_search_item *, const char *);
+ ~index_search_item_iterator();
+ int next(const linear_searcher &, const char **, int *, reference_id *);
+};
+
+
+index_search_item::index_search_item(const char *filename, int fid)
+: search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0),
+ map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0),
+ common_words_table(0)
+{
+}
+
+index_search_item::~index_search_item()
+{
+ if (buffer)
+ free(buffer);
+ if (map_addr) {
+ if (unmap(map_addr, map_len) < 0)
+ error("unmap: %1", strerror(errno));
+ }
+ while (out_of_date_files) {
+ search_item *tem = out_of_date_files;
+ out_of_date_files = out_of_date_files->next;
+ delete tem;
+ }
+ delete[] filename_buffer;
+ delete[] key_buffer;
+ if (common_words_table) {
+ for (int i = 0; i < common_words_table_size; i++)
+ delete[] common_words_table[i];
+ delete[] common_words_table;
+ }
+}
+
+class file_closer {
+ int *fdp;
+public:
+ file_closer(int &fd) : fdp(&fd) { }
+ ~file_closer() { close(*fdp); }
+};
+
+// Tell the compiler that a variable is intentionally unused.
+inline void unused(void *) { }
+
+// Validate the data reported in the header so that we don't overread on
+// the heap in the load() member function. Return null pointer if no
+// problems are detected.
+const char *index_search_item::check_header(index_header *file_header,
+ unsigned file_size)
+{
+ if (file_header->tags_size < 0)
+ return "tag list length negative";
+ if (file_header->lists_size < 0)
+ return "reference list length negative";
+ // The table and string pool sizes will not be zero, even in an empty
+ // index.
+ if (file_header->table_size < 1)
+ return "table size nonpositive";
+ if (file_header->strings_size < 1)
+ return "string pool size nonpositive";
+ size_t sz = (file_header->tags_size * sizeof(tag)
+ + file_header->lists_size * sizeof(int)
+ + file_header->table_size * sizeof(int)
+ + file_header->strings_size
+ + sizeof(file_header));
+ if (sz != file_size)
+ return("size mismatch between header and data");
+ unsigned size_remaining = file_size;
+ unsigned chunk_size = file_header->tags_size * sizeof(tag);
+ if (chunk_size > size_remaining)
+ return "claimed tag list length exceeds file size";
+ size_remaining -= chunk_size;
+ chunk_size = file_header->lists_size * sizeof(int);
+ if (chunk_size > size_remaining)
+ return "claimed reference list length exceeds file size";
+ size_remaining -= chunk_size;
+ chunk_size = file_header->table_size * sizeof(int);
+ if (chunk_size > size_remaining)
+ return "claimed table size exceeds file size";
+ size_remaining -= chunk_size;
+ chunk_size = file_header->strings_size;
+ if (chunk_size > size_remaining)
+ return "claimed string pool size exceeds file size";
+ return 0;
+}
+
+bool index_search_item::load(int fd)
+{
+ file_closer fd_closer(fd); // close fd on return
+ unused(&fd_closer);
+ struct stat sb;
+ if (fstat(fd, &sb) < 0) {
+ error("can't fstat index '%1': %2", name, strerror(errno));
+ return false;
+ }
+ if (!S_ISREG(sb.st_mode)) {
+ error("index '%1' is not a regular file", name);
+ return false;
+ }
+ mtime = sb.st_mtime;
+ unsigned size = unsigned(sb.st_size); // widening conversion
+ if (size == 0) {
+ error("index '%1' is an empty file", name);
+ return false;
+ }
+ char *addr;
+ map_addr = mapread(fd, size);
+ if (map_addr) {
+ addr = (char *)map_addr;
+ map_len = size;
+ }
+ else {
+ addr = buffer = (char *)malloc(size);
+ if (buffer == 0) {
+ error("can't allocate memory to process index '%1'", name);
+ return false;
+ }
+ char *ptr = buffer;
+ int bytes_to_read = size;
+ while (bytes_to_read > 0) {
+ int nread = read(fd, ptr, bytes_to_read);
+ if (nread == 0) {
+ error("unexpected end-of-file while reading index '%1'", name);
+ return false;
+ }
+ if (nread < 0) {
+ error("read error on index '%1': %2", name, strerror(errno));
+ return false;
+ }
+ bytes_to_read -= nread;
+ ptr += nread;
+ }
+ }
+ header = *(index_header *)addr;
+ if (header.magic != INDEX_MAGIC) {
+ error("'%1' is not an index file: wrong magic number", name);
+ return false;
+ }
+ if (header.version != INDEX_VERSION) {
+ error("version number in index '%1' is wrong: was %2, should be %3",
+ name, header.version, INDEX_VERSION);
+ return false;
+ }
+ const char *problem = check_header(&header, size);
+ if (problem != 0) {
+ if (do_verify)
+ error("corrupt header in index file '%1': %2", name, problem);
+ else
+ error("corrupt header in index file '%1'", name);
+ return false;
+ }
+ tags = (tag *)(addr + sizeof(header));
+ lists = (int *)(tags + header.tags_size);
+ table = (int *)(lists + header.lists_size);
+ pool = (char *)(table + header.table_size);
+ ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1;
+ key_buffer = new char[header.truncate];
+ read_common_words_file();
+ return true;
+}
+
+const char *index_search_item::get_invalidity_reason()
+{
+ if (tags == 0)
+ return "not loaded";
+ if ((header.lists_size > 0) && (lists[header.lists_size - 1] >= 0))
+ return "last list element not negative";
+ int i;
+ for (i = 0; i < header.table_size; i++) {
+ int li = table[i];
+ if (li >= header.lists_size)
+ return "bad list index";
+ if (li >= 0) {
+ for (int *ptr = lists + li; *ptr >= 0; ptr++) {
+ if (*ptr >= header.tags_size)
+ return "bad tag index";
+ if (*ptr >= ptr[1] && ptr[1] >= 0)
+ return "list not ordered";
+ }
+ }
+ }
+ for (i = 0; i < header.tags_size; i++) {
+ if (tags[i].filename_index >= header.strings_size)
+ return "bad index in tags";
+ if (tags[i].length < 0)
+ return "bad length in tags";
+ if (tags[i].start < 0)
+ return "bad start in tags";
+ }
+ if (pool[header.strings_size - 1] != '\0')
+ return "last character in string pool is not null";
+ return 0;
+}
+
+bool index_search_item::is_valid()
+{
+ const char *reason = get_invalidity_reason();
+ if (!reason)
+ return true;
+ error("'%1' is bad: %2", name, reason);
+ return false;
+}
+
+int index_search_item::next_filename_id() const
+{
+ return filename_id + header.strings_size + 1;
+}
+
+search_item_iterator *index_search_item::make_search_item_iterator(
+ const char *query)
+{
+ return new index_search_item_iterator(this, query);
+}
+
+search_item *make_index_search_item(const char *filename, int fid)
+{
+ char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)];
+ strcpy(index_filename, filename);
+ strcat(index_filename, INDEX_SUFFIX);
+ int fd = open(index_filename, O_RDONLY | O_BINARY);
+ if (fd < 0)
+ return 0;
+ index_search_item *item = new index_search_item(index_filename, fid);
+ delete[] index_filename;
+ if (!item->load(fd)) {
+ close(fd);
+ delete item;
+ return 0;
+ }
+ else if (do_verify && !item->is_valid()) {
+ delete item;
+ return 0;
+ }
+ else {
+ item->check_files();
+ return item;
+ }
+}
+
+
+index_search_item_iterator::index_search_item_iterator(index_search_item *ind,
+ const char *q)
+: indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0),
+ buf(0), buflen(0),
+ searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate),
+ query(strsave(q))
+{
+ found_list = indx->search(q, strlen(q), &temp_list);
+ if (!found_list) {
+ found_list = &minus_one;
+ warning("all keys would have been discarded in constructing index '%1'",
+ indx->name);
+ }
+}
+
+index_search_item_iterator::~index_search_item_iterator()
+{
+ delete[] temp_list;
+ delete[] buf;
+ delete[] query;
+ delete out_of_date_files_iter;
+}
+
+int index_search_item_iterator::next(const linear_searcher &,
+ const char **pp, int *lenp,
+ reference_id *ridp)
+{
+ if (found_list) {
+ for (;;) {
+ int tagno = *found_list;
+ if (tagno == -1)
+ break;
+ found_list++;
+ if (get_tag(tagno, searcher, pp, lenp, ridp))
+ return 1;
+ }
+ found_list = 0;
+ next_out_of_date_file = indx->out_of_date_files;
+ }
+ while (next_out_of_date_file) {
+ if (out_of_date_files_iter == 0)
+ out_of_date_files_iter
+ = next_out_of_date_file->make_search_item_iterator(query);
+ if (out_of_date_files_iter->next(searcher, pp, lenp, ridp))
+ return 1;
+ delete out_of_date_files_iter;
+ out_of_date_files_iter = 0;
+ next_out_of_date_file = next_out_of_date_file->next;
+ }
+ return 0;
+}
+
+int index_search_item_iterator::get_tag(int tagno,
+ const linear_searcher &searchr,
+ const char **pp, int *lenp,
+ reference_id *ridp)
+{
+ if (tagno < 0 || tagno >= indx->header.tags_size) {
+ error("bad tag number");
+ return 0;
+ }
+ tag *tp = indx->tags + tagno;
+ const char *filename = indx->munge_filename(indx->pool + tp->filename_index);
+ int fd = open(filename, O_RDONLY | O_BINARY);
+ if (fd < 0) {
+ error("can't open '%1': %2", filename, strerror(errno));
+ return 0;
+ }
+ struct stat sb;
+ if (fstat(fd, &sb) < 0) {
+ error("can't fstat: %1", strerror(errno));
+ close(fd);
+ return 0;
+ }
+ time_t mtime = sb.st_mtime;
+ if (mtime > indx->mtime) {
+ indx->add_out_of_date_file(fd, filename,
+ indx->filename_id + tp->filename_index);
+ return 0;
+ }
+ int res = 0;
+ FILE *fp = fdopen(fd, FOPEN_RB);
+ if (!fp) {
+ error("fdopen failed");
+ close(fd);
+ return 0;
+ }
+ if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0)
+ error("can't seek on '%1': %2", filename, strerror(errno));
+ else {
+ int length = tp->length;
+ int err = 0;
+ if (length == 0) {
+ if (fstat(fileno(fp), &sb) < 0) {
+ error("can't stat '%1': %2", filename, strerror(errno));
+ err = 1;
+ }
+ else if (!S_ISREG(sb.st_mode)) {
+ error("'%1' is not a regular file", filename);
+ err = 1;
+ }
+ else
+ length = int(sb.st_size);
+ }
+ if (!err) {
+ if (length + 2 > buflen) {
+ delete[] buf;
+ buflen = length + 2;
+ buf = new char[buflen];
+ }
+ if (fread(buf + 1, 1, length, fp) != (size_t)length)
+ error("fread on '%1' failed: %2", filename, strerror(errno));
+ else {
+ buf[0] = '\n';
+ // Remove the CR characters from CRLF pairs.
+ int sidx = 1, didx = 1;
+ for ( ; sidx < length + 1; sidx++, didx++)
+ {
+ if (buf[sidx] == '\r')
+ {
+ if (buf[++sidx] != '\n')
+ buf[didx++] = '\r';
+ else
+ length--;
+ }
+ if (sidx != didx)
+ buf[didx] = buf[sidx];
+ }
+ buf[length + 1] = '\n';
+ res = searchr.search(buf + 1, buf + 2 + length, pp, lenp);
+ if (res && ridp)
+ *ridp = reference_id(indx->filename_id + tp->filename_index,
+ tp->start);
+ }
+ }
+ }
+ fclose(fp);
+ return res;
+}
+
+const char *index_search_item::munge_filename(const char *filename)
+{
+ if (IS_ABSOLUTE(filename))
+ return filename;
+ const char *cwd = pool;
+ int need_slash = (cwd[0] != 0
+ && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0);
+ int len = strlen(cwd) + strlen(filename) + need_slash + 1;
+ if (len > filename_buflen) {
+ delete[] filename_buffer;
+ filename_buflen = len;
+ filename_buffer = new char[len];
+ }
+ strcpy(filename_buffer, cwd);
+ if (need_slash)
+ strcat(filename_buffer, "/");
+ strcat(filename_buffer, filename);
+ return filename_buffer;
+}
+
+const int *index_search_item::search1(const char **pp, const char *end)
+{
+ while (*pp < end && !csalnum(**pp))
+ *pp += 1;
+ if (*pp >= end)
+ return 0;
+ const char *start = *pp;
+ while (*pp < end && csalnum(**pp))
+ *pp += 1;
+ int len = *pp - start;
+ if (len < header.shortest)
+ return 0;
+ if (len > header.truncate)
+ len = header.truncate;
+ int is_number = 1;
+ for (int i = 0; i < len; i++)
+ if (csdigit(start[i]))
+ key_buffer[i] = start[i];
+ else {
+ key_buffer[i] = cmlower(start[i]);
+ is_number = 0;
+ }
+ if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9'))
+ return 0;
+ unsigned hc = hash(key_buffer, len);
+ if (common_words_table) {
+ for (int h = hc % common_words_table_size;
+ common_words_table[h];
+ --h) {
+ if (strlen(common_words_table[h]) == (size_t)len
+ && memcmp(common_words_table[h], key_buffer, len) == 0)
+ return 0;
+ if (h == 0)
+ h = common_words_table_size;
+ }
+ }
+ int li = table[int(hc % header.table_size)];
+ return li < 0 ? &minus_one : lists + li;
+}
+
+static void merge(int *result, const int *s1, const int *s2)
+{
+ for (; *s1 >= 0; s1++) {
+ while (*s2 >= 0 && *s2 < *s1)
+ s2++;
+ if (*s2 == *s1)
+ *result++ = *s2;
+ }
+ *result++ = -1;
+}
+
+const int *index_search_item::search(const char *ptr, int length,
+ int **temp_listp)
+{
+ const char *end = ptr + length;
+ if (*temp_listp) {
+ delete[] *temp_listp;
+ *temp_listp = 0;
+ }
+ const int *first_list = 0;
+ while (ptr < end && (first_list = search1(&ptr, end)) == 0)
+ ;
+ if (!first_list)
+ return 0;
+ if (*first_list < 0)
+ return first_list;
+ const int *second_list = 0;
+ while (ptr < end && (second_list = search1(&ptr, end)) == 0)
+ ;
+ if (!second_list)
+ return first_list;
+ if (*second_list < 0)
+ return second_list;
+ const int *p;
+ for (p = first_list; *p >= 0; p++)
+ ;
+ int len = p - first_list;
+ for (p = second_list; *p >= 0; p++)
+ ;
+ if (p - second_list < len)
+ len = p - second_list;
+ int *matches = new int[len + 1];
+ merge(matches, first_list, second_list);
+ while (ptr < end) {
+ const int *list = search1(&ptr, end);
+ if (list != 0) {
+ if (*list < 0) {
+ delete[] matches;
+ return list;
+ }
+ merge(matches, matches, list);
+ if (*matches < 0) {
+ delete[] matches;
+ return &minus_one;
+ }
+ }
+ }
+ *temp_listp = matches;
+ return matches;
+}
+
+void index_search_item::read_common_words_file()
+{
+ if (header.common <= 0)
+ return;
+ const char *common_words_file = munge_filename(strchr(pool, '\0') + 1);
+ errno = 0;
+ FILE *fp = fopen(common_words_file, "r");
+ if (!fp) {
+ error("can't open '%1': %2", common_words_file, strerror(errno));
+ return;
+ }
+ common_words_table_size = 2*header.common + 1;
+ while (!is_prime(common_words_table_size))
+ common_words_table_size += 2;
+ common_words_table = new char *[common_words_table_size];
+ for (int i = 0; i < common_words_table_size; i++)
+ common_words_table[i] = 0;
+ int count = 0;
+ int key_len = 0;
+ for (;;) {
+ int c = getc(fp);
+ while (c != EOF && !csalnum(c))
+ c = getc(fp);
+ if (c == EOF)
+ break;
+ do {
+ if (key_len < header.truncate)
+ key_buffer[key_len++] = cmlower(c);
+ c = getc(fp);
+ } while (c != EOF && csalnum(c));
+ if (key_len >= header.shortest) {
+ int h = hash(key_buffer, key_len) % common_words_table_size;
+ while (common_words_table[h]) {
+ if (h == 0)
+ h = common_words_table_size;
+ --h;
+ }
+ common_words_table[h] = new char[key_len + 1];
+ memcpy(common_words_table[h], key_buffer, key_len);
+ common_words_table[h][key_len] = '\0';
+ }
+ if (++count >= header.common)
+ break;
+ key_len = 0;
+ if (c == EOF)
+ break;
+ }
+ fclose(fp);
+}
+
+void index_search_item::add_out_of_date_file(int fd, const char *filename,
+ int fid)
+{
+ search_item **pp;
+ for (pp = &out_of_date_files; *pp; pp = &(*pp)->next)
+ if ((*pp)->is_named(filename))
+ return;
+ *pp = make_linear_search_item(fd, filename, fid);
+ warning("'%1' modified since index '%2' created", filename, name);
+}
+
+void index_search_item::check_files()
+{
+ const char *pool_end = pool + header.strings_size;
+ for (const char *ptr = strchr(ignore_fields, '\0') + 1;
+ ptr < pool_end;
+ ptr = strchr(ptr, '\0') + 1) {
+ const char *path = munge_filename(ptr);
+ struct stat sb;
+ if (stat(path, &sb) < 0)
+ error("can't stat '%1': %2", path, strerror(errno));
+ else if (sb.st_mtime > mtime) {
+ int fd = open(path, O_RDONLY | O_BINARY);
+ if (fd < 0)
+ error("can't open '%1': %2", path, strerror(errno));
+ else
+ add_out_of_date_file(fd, path, filename_id + (ptr - pool));
+ }
+ }
+}
+
+// Local Variables:
+// fill-column: 72
+// mode: C++
+// End:
+// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: