diff options
Diffstat (limited to 'src/libs/libbib')
-rw-r--r-- | src/libs/libbib/common.cpp | 37 | ||||
-rw-r--r-- | src/libs/libbib/index.cpp | 688 | ||||
-rw-r--r-- | src/libs/libbib/libbib.am | 35 | ||||
-rw-r--r-- | src/libs/libbib/linear.cpp | 506 | ||||
-rw-r--r-- | src/libs/libbib/map.c | 86 | ||||
-rw-r--r-- | src/libs/libbib/search.cpp | 136 |
6 files changed, 1488 insertions, 0 deletions
diff --git a/src/libs/libbib/common.cpp b/src/libs/libbib/common.cpp new file mode 100644 index 0000000..05accd8 --- /dev/null +++ b/src/libs/libbib/common.cpp @@ -0,0 +1,37 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +unsigned hash(const char *s, int len) +{ +#if 0 + unsigned h = 0, g; + while (*s != '\0') { + h <<= 4; + h += *s++; + if ((g = h & 0xf0000000) != 0) { + h ^= g >> 24; + h ^= g; + } + } +#endif + unsigned h = 0; + while (--len >= 0) + h = *s++ + 65587*h; + return h; +} + diff --git a/src/libs/libbib/index.cpp b/src/libs/libbib/index.cpp new file mode 100644 index 0000000..aebcbbf --- /dev/null +++ b/src/libs/libbib/index.cpp @@ -0,0 +1,688 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include "lib.h" + +#include <stdlib.h> +#include <errno.h> + +#include "posix.h" +#include "cset.h" +#include "cmap.h" +#include "errarg.h" +#include "error.h" + +#include "refid.h" +#include "search.h" +#include "index.h" +#include "defs.h" + +#include "nonposix.h" + +// Interface to mmap. +extern "C" { + void *mapread(int fd, int len); + int unmap(void *, int len); +} + +#if 0 +const +#endif +int minus_one = -1; + +bool do_verify = false; + +struct word_list; + +class index_search_item : public search_item { + search_item *out_of_date_files; + index_header header; + char *buffer; + void *map_addr; + int map_len; + tag *tags; + int *table; + int *lists; + char *pool; + char *key_buffer; + char *filename_buffer; + int filename_buflen; + char **common_words_table; + int common_words_table_size; + const char *ignore_fields; + time_t mtime; + + const char *get_invalidity_reason(); + const int *search1(const char **pp, const char *end); + const int *search(const char *ptr, int length, int **temp_listp); + const char *munge_filename(const char *); + void read_common_words_file(); + void add_out_of_date_file(int fd, const char *filename, int fid); +public: + index_search_item(const char *, int); + ~index_search_item(); + const char *check_header(index_header *, unsigned); + bool load(int fd); + search_item_iterator *make_search_item_iterator(const char *); + bool is_valid(); + void check_files(); + int next_filename_id() const; + friend class index_search_item_iterator; +}; + +class index_search_item_iterator : public search_item_iterator { + index_search_item *indx; + search_item_iterator *out_of_date_files_iter; + search_item *next_out_of_date_file; + const int *found_list; + int *temp_list; + char *buf; + int buflen; + linear_searcher searcher; + char *query; + int get_tag(int tagno, const linear_searcher &, const char **, int *, + reference_id *); +public: + index_search_item_iterator(index_search_item *, const char *); + ~index_search_item_iterator(); + int next(const linear_searcher &, const char **, int *, reference_id *); +}; + + +index_search_item::index_search_item(const char *filename, int fid) +: search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0), + map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0), + common_words_table(0) +{ +} + +index_search_item::~index_search_item() +{ + if (buffer) + free(buffer); + if (map_addr) { + if (unmap(map_addr, map_len) < 0) + error("unmap: %1", strerror(errno)); + } + while (out_of_date_files) { + search_item *tem = out_of_date_files; + out_of_date_files = out_of_date_files->next; + delete tem; + } + delete[] filename_buffer; + delete[] key_buffer; + if (common_words_table) { + for (int i = 0; i < common_words_table_size; i++) + delete[] common_words_table[i]; + delete[] common_words_table; + } +} + +class file_closer { + int *fdp; +public: + file_closer(int &fd) : fdp(&fd) { } + ~file_closer() { close(*fdp); } +}; + +// Tell the compiler that a variable is intentionally unused. +inline void unused(void *) { } + +// Validate the data reported in the header so that we don't overread on +// the heap in the load() member function. Return null pointer if no +// problems are detected. +const char *index_search_item::check_header(index_header *file_header, + unsigned file_size) +{ + if (file_header->tags_size < 0) + return "tag list length negative"; + if (file_header->lists_size < 0) + return "reference list length negative"; + // The table and string pool sizes will not be zero, even in an empty + // index. + if (file_header->table_size < 1) + return "table size nonpositive"; + if (file_header->strings_size < 1) + return "string pool size nonpositive"; + size_t sz = (file_header->tags_size * sizeof(tag) + + file_header->lists_size * sizeof(int) + + file_header->table_size * sizeof(int) + + file_header->strings_size + + sizeof(file_header)); + if (sz != file_size) + return("size mismatch between header and data"); + unsigned size_remaining = file_size; + unsigned chunk_size = file_header->tags_size * sizeof(tag); + if (chunk_size > size_remaining) + return "claimed tag list length exceeds file size"; + size_remaining -= chunk_size; + chunk_size = file_header->lists_size * sizeof(int); + if (chunk_size > size_remaining) + return "claimed reference list length exceeds file size"; + size_remaining -= chunk_size; + chunk_size = file_header->table_size * sizeof(int); + if (chunk_size > size_remaining) + return "claimed table size exceeds file size"; + size_remaining -= chunk_size; + chunk_size = file_header->strings_size; + if (chunk_size > size_remaining) + return "claimed string pool size exceeds file size"; + return 0; +} + +bool index_search_item::load(int fd) +{ + file_closer fd_closer(fd); // close fd on return + unused(&fd_closer); + struct stat sb; + if (fstat(fd, &sb) < 0) { + error("can't fstat index '%1': %2", name, strerror(errno)); + return false; + } + if (!S_ISREG(sb.st_mode)) { + error("index '%1' is not a regular file", name); + return false; + } + mtime = sb.st_mtime; + unsigned size = unsigned(sb.st_size); // widening conversion + if (size == 0) { + error("index '%1' is an empty file", name); + return false; + } + char *addr; + map_addr = mapread(fd, size); + if (map_addr) { + addr = (char *)map_addr; + map_len = size; + } + else { + addr = buffer = (char *)malloc(size); + if (buffer == 0) { + error("can't allocate memory to process index '%1'", name); + return false; + } + char *ptr = buffer; + int bytes_to_read = size; + while (bytes_to_read > 0) { + int nread = read(fd, ptr, bytes_to_read); + if (nread == 0) { + error("unexpected end-of-file while reading index '%1'", name); + return false; + } + if (nread < 0) { + error("read error on index '%1': %2", name, strerror(errno)); + return false; + } + bytes_to_read -= nread; + ptr += nread; + } + } + header = *(index_header *)addr; + if (header.magic != INDEX_MAGIC) { + error("'%1' is not an index file: wrong magic number", name); + return false; + } + if (header.version != INDEX_VERSION) { + error("version number in index '%1' is wrong: was %2, should be %3", + name, header.version, INDEX_VERSION); + return false; + } + const char *problem = check_header(&header, size); + if (problem != 0) { + if (do_verify) + error("corrupt header in index file '%1': %2", name, problem); + else + error("corrupt header in index file '%1'", name); + return false; + } + tags = (tag *)(addr + sizeof(header)); + lists = (int *)(tags + header.tags_size); + table = (int *)(lists + header.lists_size); + pool = (char *)(table + header.table_size); + ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1; + key_buffer = new char[header.truncate]; + read_common_words_file(); + return true; +} + +const char *index_search_item::get_invalidity_reason() +{ + if (tags == 0) + return "not loaded"; + if ((header.lists_size > 0) && (lists[header.lists_size - 1] >= 0)) + return "last list element not negative"; + int i; + for (i = 0; i < header.table_size; i++) { + int li = table[i]; + if (li >= header.lists_size) + return "bad list index"; + if (li >= 0) { + for (int *ptr = lists + li; *ptr >= 0; ptr++) { + if (*ptr >= header.tags_size) + return "bad tag index"; + if (*ptr >= ptr[1] && ptr[1] >= 0) + return "list not ordered"; + } + } + } + for (i = 0; i < header.tags_size; i++) { + if (tags[i].filename_index >= header.strings_size) + return "bad index in tags"; + if (tags[i].length < 0) + return "bad length in tags"; + if (tags[i].start < 0) + return "bad start in tags"; + } + if (pool[header.strings_size - 1] != '\0') + return "last character in string pool is not null"; + return 0; +} + +bool index_search_item::is_valid() +{ + const char *reason = get_invalidity_reason(); + if (!reason) + return true; + error("'%1' is bad: %2", name, reason); + return false; +} + +int index_search_item::next_filename_id() const +{ + return filename_id + header.strings_size + 1; +} + +search_item_iterator *index_search_item::make_search_item_iterator( + const char *query) +{ + return new index_search_item_iterator(this, query); +} + +search_item *make_index_search_item(const char *filename, int fid) +{ + char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)]; + strcpy(index_filename, filename); + strcat(index_filename, INDEX_SUFFIX); + int fd = open(index_filename, O_RDONLY | O_BINARY); + if (fd < 0) + return 0; + index_search_item *item = new index_search_item(index_filename, fid); + delete[] index_filename; + if (!item->load(fd)) { + close(fd); + delete item; + return 0; + } + else if (do_verify && !item->is_valid()) { + delete item; + return 0; + } + else { + item->check_files(); + return item; + } +} + + +index_search_item_iterator::index_search_item_iterator(index_search_item *ind, + const char *q) +: indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0), + buf(0), buflen(0), + searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate), + query(strsave(q)) +{ + found_list = indx->search(q, strlen(q), &temp_list); + if (!found_list) { + found_list = &minus_one; + warning("all keys would have been discarded in constructing index '%1'", + indx->name); + } +} + +index_search_item_iterator::~index_search_item_iterator() +{ + delete[] temp_list; + delete[] buf; + delete[] query; + delete out_of_date_files_iter; +} + +int index_search_item_iterator::next(const linear_searcher &, + const char **pp, int *lenp, + reference_id *ridp) +{ + if (found_list) { + for (;;) { + int tagno = *found_list; + if (tagno == -1) + break; + found_list++; + if (get_tag(tagno, searcher, pp, lenp, ridp)) + return 1; + } + found_list = 0; + next_out_of_date_file = indx->out_of_date_files; + } + while (next_out_of_date_file) { + if (out_of_date_files_iter == 0) + out_of_date_files_iter + = next_out_of_date_file->make_search_item_iterator(query); + if (out_of_date_files_iter->next(searcher, pp, lenp, ridp)) + return 1; + delete out_of_date_files_iter; + out_of_date_files_iter = 0; + next_out_of_date_file = next_out_of_date_file->next; + } + return 0; +} + +int index_search_item_iterator::get_tag(int tagno, + const linear_searcher &searchr, + const char **pp, int *lenp, + reference_id *ridp) +{ + if (tagno < 0 || tagno >= indx->header.tags_size) { + error("bad tag number"); + return 0; + } + tag *tp = indx->tags + tagno; + const char *filename = indx->munge_filename(indx->pool + tp->filename_index); + int fd = open(filename, O_RDONLY | O_BINARY); + if (fd < 0) { + error("can't open '%1': %2", filename, strerror(errno)); + return 0; + } + struct stat sb; + if (fstat(fd, &sb) < 0) { + error("can't fstat: %1", strerror(errno)); + close(fd); + return 0; + } + time_t mtime = sb.st_mtime; + if (mtime > indx->mtime) { + indx->add_out_of_date_file(fd, filename, + indx->filename_id + tp->filename_index); + return 0; + } + int res = 0; + FILE *fp = fdopen(fd, FOPEN_RB); + if (!fp) { + error("fdopen failed"); + close(fd); + return 0; + } + if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0) + error("can't seek on '%1': %2", filename, strerror(errno)); + else { + int length = tp->length; + int err = 0; + if (length == 0) { + if (fstat(fileno(fp), &sb) < 0) { + error("can't stat '%1': %2", filename, strerror(errno)); + err = 1; + } + else if (!S_ISREG(sb.st_mode)) { + error("'%1' is not a regular file", filename); + err = 1; + } + else + length = int(sb.st_size); + } + if (!err) { + if (length + 2 > buflen) { + delete[] buf; + buflen = length + 2; + buf = new char[buflen]; + } + if (fread(buf + 1, 1, length, fp) != (size_t)length) + error("fread on '%1' failed: %2", filename, strerror(errno)); + else { + buf[0] = '\n'; + // Remove the CR characters from CRLF pairs. + int sidx = 1, didx = 1; + for ( ; sidx < length + 1; sidx++, didx++) + { + if (buf[sidx] == '\r') + { + if (buf[++sidx] != '\n') + buf[didx++] = '\r'; + else + length--; + } + if (sidx != didx) + buf[didx] = buf[sidx]; + } + buf[length + 1] = '\n'; + res = searchr.search(buf + 1, buf + 2 + length, pp, lenp); + if (res && ridp) + *ridp = reference_id(indx->filename_id + tp->filename_index, + tp->start); + } + } + } + fclose(fp); + return res; +} + +const char *index_search_item::munge_filename(const char *filename) +{ + if (IS_ABSOLUTE(filename)) + return filename; + const char *cwd = pool; + int need_slash = (cwd[0] != 0 + && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0); + int len = strlen(cwd) + strlen(filename) + need_slash + 1; + if (len > filename_buflen) { + delete[] filename_buffer; + filename_buflen = len; + filename_buffer = new char[len]; + } + strcpy(filename_buffer, cwd); + if (need_slash) + strcat(filename_buffer, "/"); + strcat(filename_buffer, filename); + return filename_buffer; +} + +const int *index_search_item::search1(const char **pp, const char *end) +{ + while (*pp < end && !csalnum(**pp)) + *pp += 1; + if (*pp >= end) + return 0; + const char *start = *pp; + while (*pp < end && csalnum(**pp)) + *pp += 1; + int len = *pp - start; + if (len < header.shortest) + return 0; + if (len > header.truncate) + len = header.truncate; + int is_number = 1; + for (int i = 0; i < len; i++) + if (csdigit(start[i])) + key_buffer[i] = start[i]; + else { + key_buffer[i] = cmlower(start[i]); + is_number = 0; + } + if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9')) + return 0; + unsigned hc = hash(key_buffer, len); + if (common_words_table) { + for (int h = hc % common_words_table_size; + common_words_table[h]; + --h) { + if (strlen(common_words_table[h]) == (size_t)len + && memcmp(common_words_table[h], key_buffer, len) == 0) + return 0; + if (h == 0) + h = common_words_table_size; + } + } + int li = table[int(hc % header.table_size)]; + return li < 0 ? &minus_one : lists + li; +} + +static void merge(int *result, const int *s1, const int *s2) +{ + for (; *s1 >= 0; s1++) { + while (*s2 >= 0 && *s2 < *s1) + s2++; + if (*s2 == *s1) + *result++ = *s2; + } + *result++ = -1; +} + +const int *index_search_item::search(const char *ptr, int length, + int **temp_listp) +{ + const char *end = ptr + length; + if (*temp_listp) { + delete[] *temp_listp; + *temp_listp = 0; + } + const int *first_list = 0; + while (ptr < end && (first_list = search1(&ptr, end)) == 0) + ; + if (!first_list) + return 0; + if (*first_list < 0) + return first_list; + const int *second_list = 0; + while (ptr < end && (second_list = search1(&ptr, end)) == 0) + ; + if (!second_list) + return first_list; + if (*second_list < 0) + return second_list; + const int *p; + for (p = first_list; *p >= 0; p++) + ; + int len = p - first_list; + for (p = second_list; *p >= 0; p++) + ; + if (p - second_list < len) + len = p - second_list; + int *matches = new int[len + 1]; + merge(matches, first_list, second_list); + while (ptr < end) { + const int *list = search1(&ptr, end); + if (list != 0) { + if (*list < 0) { + delete[] matches; + return list; + } + merge(matches, matches, list); + if (*matches < 0) { + delete[] matches; + return &minus_one; + } + } + } + *temp_listp = matches; + return matches; +} + +void index_search_item::read_common_words_file() +{ + if (header.common <= 0) + return; + const char *common_words_file = munge_filename(strchr(pool, '\0') + 1); + errno = 0; + FILE *fp = fopen(common_words_file, "r"); + if (!fp) { + error("can't open '%1': %2", common_words_file, strerror(errno)); + return; + } + common_words_table_size = 2*header.common + 1; + while (!is_prime(common_words_table_size)) + common_words_table_size += 2; + common_words_table = new char *[common_words_table_size]; + for (int i = 0; i < common_words_table_size; i++) + common_words_table[i] = 0; + int count = 0; + int key_len = 0; + for (;;) { + int c = getc(fp); + while (c != EOF && !csalnum(c)) + c = getc(fp); + if (c == EOF) + break; + do { + if (key_len < header.truncate) + key_buffer[key_len++] = cmlower(c); + c = getc(fp); + } while (c != EOF && csalnum(c)); + if (key_len >= header.shortest) { + int h = hash(key_buffer, key_len) % common_words_table_size; + while (common_words_table[h]) { + if (h == 0) + h = common_words_table_size; + --h; + } + common_words_table[h] = new char[key_len + 1]; + memcpy(common_words_table[h], key_buffer, key_len); + common_words_table[h][key_len] = '\0'; + } + if (++count >= header.common) + break; + key_len = 0; + if (c == EOF) + break; + } + fclose(fp); +} + +void index_search_item::add_out_of_date_file(int fd, const char *filename, + int fid) +{ + search_item **pp; + for (pp = &out_of_date_files; *pp; pp = &(*pp)->next) + if ((*pp)->is_named(filename)) + return; + *pp = make_linear_search_item(fd, filename, fid); + warning("'%1' modified since index '%2' created", filename, name); +} + +void index_search_item::check_files() +{ + const char *pool_end = pool + header.strings_size; + for (const char *ptr = strchr(ignore_fields, '\0') + 1; + ptr < pool_end; + ptr = strchr(ptr, '\0') + 1) { + const char *path = munge_filename(ptr); + struct stat sb; + if (stat(path, &sb) < 0) + error("can't stat '%1': %2", path, strerror(errno)); + else if (sb.st_mtime > mtime) { + int fd = open(path, O_RDONLY | O_BINARY); + if (fd < 0) + error("can't open '%1': %2", path, strerror(errno)); + else + add_out_of_date_file(fd, path, filename_id + (ptr - pool)); + } + } +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: diff --git a/src/libs/libbib/libbib.am b/src/libs/libbib/libbib.am new file mode 100644 index 0000000..00e1d3b --- /dev/null +++ b/src/libs/libbib/libbib.am @@ -0,0 +1,35 @@ +# Automake rules for 'libbib' +# +# Copyright (C) 2014-2020 Free Software Foundation, Inc. +# +# 'groff' is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# 'groff' is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# <http://www.gnu.org/licenses/gpl-2.0.html>. +# +######################################################################## + +noinst_LIBRARIES += libbib.a +libbib_a_SOURCES = \ + src/libs/libbib/common.cpp \ + src/libs/libbib/index.cpp \ + src/libs/libbib/linear.cpp \ + src/libs/libbib/search.cpp \ + src/libs/libbib/map.c +src/libs/libbib/index.$(OBJEXT): defs.h + + +# Local Variables: +# mode: makefile-automake +# fill-column: 72 +# End: +# vim: set autoindent filetype=automake textwidth=72: diff --git a/src/libs/libbib/linear.cpp b/src/libs/libbib/linear.cpp new file mode 100644 index 0000000..22e11d8 --- /dev/null +++ b/src/libs/libbib/linear.cpp @@ -0,0 +1,506 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include "lib.h" + +#include <assert.h> +#include <stdlib.h> +#include <errno.h> + +#include "posix.h" +#include "errarg.h" +#include "error.h" +#include "cset.h" +#include "cmap.h" +#include "nonposix.h" + +#include "refid.h" +#include "search.h" + +class file_buffer { + char *buffer; + char *bufend; +public: + file_buffer(); + ~file_buffer(); + int load(int fd, const char *filename); + const char *get_start() const; + const char *get_end() const; +}; + +typedef unsigned char uchar; + +static uchar map[256]; +static uchar inv_map[256][3]; + +struct map_init { + map_init(); +}; + +static map_init the_map_init; + +map_init::map_init() +{ + int i; + for (i = 0; i < 256; i++) + map[i] = csalnum(i) ? cmlower(i) : '\0'; + for (i = 0; i < 256; i++) { + if (cslower(i)) { + inv_map[i][0] = i; + inv_map[i][1] = cmupper(i); + inv_map[i][2] = '\0'; + } + else if (csdigit(i)) { + inv_map[i][0] = i; + inv_map[i][1] = 0; + } + else + inv_map[i][0] = '\0'; + } +} + + +class bmpattern { + char *pat; + int len; + int delta[256]; +public: + bmpattern(const char *pattern, int pattern_length); + ~bmpattern(); + const char *search(const char *p, const char *end) const; + int length() const; +}; + +bmpattern::bmpattern(const char *pattern, int pattern_length) +: len(pattern_length) +{ + pat = new char[len]; + int i; + for (i = 0; i < len; i++) + pat[i] = map[uchar(pattern[i])]; + for (i = 0; i < 256; i++) + delta[i] = len; + for (i = 0; i < len; i++) + for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++) + delta[*inv] = len - i - 1; +} + +const char *bmpattern::search(const char *buf, const char *end) const +{ + int buflen = end - buf; + if (len > buflen) + return 0; + const char *strend; + if (buflen > len*4) + strend = end - len*4; + else + strend = buf; + const char *k = buf + len - 1; + const int *del = delta; + const char *pattern = pat; + for (;;) { + while (k < strend) { + int t = del[uchar(*k)]; + if (!t) + break; + k += t; + k += del[uchar(*k)]; + k += del[uchar(*k)]; + } + while (k < end && del[uchar(*k)] != 0) + k++; + if (k == end) + break; + int j = len - 1; + const char *s = k; + for (;;) { + if (j == 0) + return s; + if (map[uchar(*--s)] != uchar(pattern[--j])) + break; + } + k++; + } + return 0; +} + +bmpattern::~bmpattern() +{ + delete[] pat; +} + +inline int bmpattern::length() const +{ + return len; +} + + +static const char *find_end(const char *bufend, const char *p); + +const char *linear_searcher::search_and_check(const bmpattern *key, + const char *buf, const char *bufend, const char **start) const +{ + assert(buf[-1] == '\n'); + assert(bufend[-1] == '\n'); + const char *ptr = buf; + for (;;) { + const char *found = key->search(ptr, bufend); + if (!found) + break; + if (check_match(buf, bufend, found, key->length(), &ptr, start)) + return found; + } + return 0; +} + +static const char *skip_field(const char *end, const char *p) +{ + for (;;) + if (*p++ == '\n') { + if (p == end || *p == '%') + break; + const char *q; + for (q = p; *q == ' ' || *q == '\t'; q++) + ; + if (*q == '\n') + break; + p = q + 1; + } + return p; +} + +static const char *find_end(const char *bufend, const char *p) +{ + for (;;) + if (*p++ == '\n') { + if (p == bufend) + break; + const char *q; + for (q = p; *q == ' ' || *q == '\t'; q++) + ; + if (*q == '\n') + break; + p = q + 1; + } + return p; +} + + +int linear_searcher::check_match(const char *buf, const char *bufend, + const char *match, int matchlen, + const char **cont, const char **start) const +{ + *cont = match + 1; + // The user is required to supply only the first truncate_len characters + // of the key. If truncate_len <= 0, he must supply all the key. + if ((truncate_len <= 0 || matchlen < truncate_len) + && map[uchar(match[matchlen])] != '\0') + return 0; + + // The character before the match must not be an alphanumeric + // character (unless the alphanumeric character follows one or two + // percent characters at the beginning of the line), nor must it be + // a percent character at the beginning of a line, nor a percent + // character following a percent character at the beginning of a + // line. + + switch (match - buf) { + case 0: + break; + case 1: + if (match[-1] == '%' || map[uchar(match[-1])] != '\0') + return 0; + break; + case 2: + if (map[uchar(match[-1])] != '\0' && match[-2] != '%') + return 0; + if (match[-1] == '%' + && (match[-2] == '\n' || match[-2] == '%')) + return 0; + break; + default: + if (map[uchar(match[-1])] != '\0' + && !(match[-2] == '%' + && (match[-3] == '\n' + || (match[-3] == '%' && match[-4] == '\n')))) + return 0; + if (match[-1] == '%' + && (match[-2] == '\n' + || (match[-2] == '%' && match[-3] == '\n'))) + return 0; + } + + const char *p = match; + int had_percent = 0; + for (;;) { + if (*p == '\n') { + if (!had_percent && p[1] == '%') { + if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) { + *cont = skip_field(bufend, match + matchlen); + return 0; + } + if (!start) + break; + had_percent = 1; + } + if (p <= buf) { + if (start) + *start = p + 1; + return 1; + } + const char *q; + for (q = p - 1; *q == ' ' || *q == '\t'; q--) + ; + if (*q == '\n') { + if (start) + *start = p + 1; + break; + } + p = q; + } + p--; + } + return 1; +} + +file_buffer::file_buffer() +: buffer(0), bufend(0) +{ +} + +file_buffer::~file_buffer() +{ + delete[] buffer; +} + +const char *file_buffer::get_start() const +{ + return buffer ? buffer + 4 : 0; +} + +const char *file_buffer::get_end() const +{ + return bufend; +} + +int file_buffer::load(int fd, const char *filename) +{ + struct stat sb; + if (fstat(fd, &sb) < 0) + error("can't fstat '%1': %2", filename, strerror(errno)); + else if (!S_ISREG(sb.st_mode)) + error("'%1' is not a regular file", filename); + else { + // We need one character extra at the beginning for an additional newline + // used as a sentinel. We get 4 instead so that the read buffer will be + // word-aligned. This seems to make the read slightly faster. We also + // need one character at the end also for an additional newline used as a + // sentinel. + int size = int(sb.st_size); + buffer = new char[size + 4 + 1]; + int nread = read(fd, buffer + 4, size); + if (nread < 0) + error("error reading '%1': %2", filename, strerror(errno)); + else if (nread != size) + error("size of '%1' decreased", filename); + else { + char c; + nread = read(fd, &c, 1); + if (nread != 0) + error("size of '%1' increased", filename); + else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0) + error("database '%1' is a binary file", filename); + else { + close(fd); + buffer[3] = '\n'; + int sidx = 4, didx = 4; + for ( ; sidx < size + 4; sidx++, didx++) + { + if (buffer[sidx] == '\r') + { + if (buffer[++sidx] != '\n') + buffer[didx++] = '\r'; + else + size--; + } + if (sidx != didx) + buffer[didx] = buffer[sidx]; + } + bufend = buffer + 4 + size; + if (bufend[-1] != '\n') + *bufend++ = '\n'; + return 1; + } + } + delete[] buffer; + buffer = 0; + } + close(fd); + return 0; +} + +linear_searcher::linear_searcher(const char *query, int query_len, + const char *ign, int trunc) +: ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0) +{ + const char *query_end = query + query_len; + int nk = 0; + const char *p; + for (p = query; p < query_end; p++) + if (map[uchar(*p)] != '\0' + && (p[1] == '\0' || map[uchar(p[1])] == '\0')) + nk++; + if (nk == 0) + return; + keys = new bmpattern*[nk]; + p = query; + for (;;) { + while (p < query_end && map[uchar(*p)] == '\0') + p++; + if (p == query_end) + break; + const char *start = p; + while (p < query_end && map[uchar(*p)] != '\0') + p++; + keys[nkeys++] = new bmpattern(start, p - start); + } + assert(nkeys <= nk); + if (nkeys == 0) { + delete[] keys; + keys = 0; + } +} + +linear_searcher::~linear_searcher() +{ + for (int i = 0; i < nkeys; i++) + delete keys[i]; + delete[] keys; +} + +int linear_searcher::search(const char *buffer, const char *bufend, + const char **startp, int *lengthp) const +{ + assert(bufend - buffer > 0); + assert(buffer[-1] == '\n'); + assert(bufend[-1] == '\n'); + if (nkeys == 0) + return 0; + for (;;) { + const char *refstart; + const char *found = search_and_check(keys[0], buffer, bufend, &refstart); + if (!found) + break; + const char *refend = find_end(bufend, found + keys[0]->length()); + int i; + for (i = 1; i < nkeys; i++) + if (!search_and_check(keys[i], refstart, refend)) + break; + if (i >= nkeys) { + *startp = refstart; + *lengthp = refend - refstart; + return 1; + } + buffer = refend; + } + return 0; +} + +class linear_search_item : public search_item { + file_buffer fbuf; +public: + linear_search_item(const char *filename, int fid); + ~linear_search_item(); + int load(int fd); + search_item_iterator *make_search_item_iterator(const char *); + friend class linear_search_item_iterator; +}; + +class linear_search_item_iterator : public search_item_iterator { + linear_search_item *lsi; + int pos; +public: + linear_search_item_iterator(linear_search_item *, const char *query); + ~linear_search_item_iterator(); + int next(const linear_searcher &, const char **ptr, int *lenp, + reference_id *ridp); +}; + +search_item *make_linear_search_item(int fd, const char *filename, int fid) +{ + linear_search_item *item = new linear_search_item(filename, fid); + if (!item->load(fd)) { + delete item; + return 0; + } + else + return item; +} + +linear_search_item::linear_search_item(const char *filename, int fid) +: search_item(filename, fid) +{ +} + +linear_search_item::~linear_search_item() +{ +} + +int linear_search_item::load(int fd) +{ + return fbuf.load(fd, name); +} + +search_item_iterator *linear_search_item::make_search_item_iterator( + const char *query) +{ + return new linear_search_item_iterator(this, query); +} + +linear_search_item_iterator::linear_search_item_iterator( + linear_search_item *p, const char *) +: lsi(p), pos(0) +{ +} + +linear_search_item_iterator::~linear_search_item_iterator() +{ +} + +int linear_search_item_iterator::next(const linear_searcher &searcher, + const char **startp, int *lengthp, + reference_id *ridp) +{ + const char *bufstart = lsi->fbuf.get_start(); + const char *bufend = lsi->fbuf.get_end(); + const char *ptr = bufstart + pos; + if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) { + pos = *startp + *lengthp - bufstart; + if (ridp) + *ridp = reference_id(lsi->filename_id, *startp - bufstart); + return 1; + } + else + return 0; +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: diff --git a/src/libs/libbib/map.c b/src/libs/libbib/map.c new file mode 100644 index 0000000..9eae72b --- /dev/null +++ b/src/libs/libbib/map.c @@ -0,0 +1,86 @@ +/* Copyright (C) 1989- 2014 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <stdlib.h> + +#ifdef HAVE_MMAP + +#include <sys/types.h> +#include <sys/mman.h> + +/* The Net-2 man pages says that a MAP_FILE flag is required. */ +#ifndef MAP_FILE +#define MAP_FILE 0 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Prototypes */ +char *mapread(int, int); +int unmap(char *, int); + +char *mapread(int fd, int nbytes) +{ + char *p = (char *)mmap((void *)0, (size_t)nbytes, PROT_READ, + MAP_FILE|MAP_PRIVATE, fd, (off_t)0); + if (p == MAP_FAILED) + return 0; + /* mmap() shouldn't return 0 since MAP_FIXED wasn't specified. */ + if (p == 0) + abort(); + return p; +} + +int unmap(char *p, int len) +{ + return munmap((void *)p, len); +} + +#ifdef __cplusplus +} +#endif + +#else /* not HAVE_MMAP */ + +#include <errno.h> + +#ifdef __cplusplus +extern "C" { +#endif + +char *mapread(int fd, int nbytes) +{ + errno = ENODEV; + return 0; +} + +int unmap(char *p, int len) +{ + errno = EINVAL; + return -1; +} + +#ifdef __cplusplus +} +#endif + +#endif /* not HAVE_MMAP */ diff --git a/src/libs/libbib/search.cpp b/src/libs/libbib/search.cpp new file mode 100644 index 0000000..10867b6 --- /dev/null +++ b/src/libs/libbib/search.cpp @@ -0,0 +1,136 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include "lib.h" + +#include <assert.h> +#include <stdlib.h> +#include <errno.h> + +#include "posix.h" +#include "errarg.h" +#include "error.h" +#include "nonposix.h" + +#include "refid.h" +#include "search.h" + +int linear_truncate_len = 6; +const char *linear_ignore_fields = "XYZ"; + +search_list::search_list() +: list(0), niterators(0), next_fid(1) +{ +} + +search_list::~search_list() +{ + assert(niterators == 0); + while (list) { + search_item *tem = list->next; + delete list; + list = tem; + } +} + +void search_list::add_file(const char *filename, int silent) +{ + search_item *p = make_index_search_item(filename, next_fid); + if (!p) { + int fd = open(filename, O_RDONLY | O_BINARY); + if (fd < 0) { + if (!silent) + error("can't open '%1': %2", filename, strerror(errno)); + } + else + p = make_linear_search_item(fd, filename, next_fid); + } + if (p) { + search_item **pp; + for (pp = &list; *pp; pp = &(*pp)->next) + ; + *pp = p; + next_fid = p->next_filename_id(); + } +} + +int search_list::nfiles() const +{ + int n = 0; + for (search_item *ptr = list; ptr; ptr = ptr->next) + n++; + return n; +} + +search_list_iterator::search_list_iterator(search_list *p, const char *q) +: list(p), ptr(p->list), iter(0), query(strsave(q)), + searcher(q, strlen(q), linear_ignore_fields, linear_truncate_len) +{ + list->niterators += 1; +} + +search_list_iterator::~search_list_iterator() +{ + list->niterators -= 1; + delete[] query; + delete iter; +} + +int search_list_iterator::next(const char **pp, int *lenp, reference_id *ridp) +{ + while (ptr) { + if (iter == 0) + iter = ptr->make_search_item_iterator(query); + if (iter->next(searcher, pp, lenp, ridp)) + return 1; + delete iter; + iter = 0; + ptr = ptr->next; + } + return 0; +} + +search_item::search_item(const char *nm, int fid) +: name(strsave(nm)), filename_id(fid), next(0) +{ +} + +search_item::~search_item() +{ + delete[] name; +} + +int search_item::is_named(const char *nm) const +{ + return strcmp(name, nm) == 0; +} + +int search_item::next_filename_id() const +{ + return filename_id + 1; +} + +search_item_iterator::~search_item_iterator() +{ +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: |