diff options
Diffstat (limited to 'src/libs/libbib/linear.cpp')
-rw-r--r-- | src/libs/libbib/linear.cpp | 506 |
1 files changed, 506 insertions, 0 deletions
diff --git a/src/libs/libbib/linear.cpp b/src/libs/libbib/linear.cpp new file mode 100644 index 0000000..22e11d8 --- /dev/null +++ b/src/libs/libbib/linear.cpp @@ -0,0 +1,506 @@ +/* Copyright (C) 1989-2020 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or +(at your option) any later version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include "lib.h" + +#include <assert.h> +#include <stdlib.h> +#include <errno.h> + +#include "posix.h" +#include "errarg.h" +#include "error.h" +#include "cset.h" +#include "cmap.h" +#include "nonposix.h" + +#include "refid.h" +#include "search.h" + +class file_buffer { + char *buffer; + char *bufend; +public: + file_buffer(); + ~file_buffer(); + int load(int fd, const char *filename); + const char *get_start() const; + const char *get_end() const; +}; + +typedef unsigned char uchar; + +static uchar map[256]; +static uchar inv_map[256][3]; + +struct map_init { + map_init(); +}; + +static map_init the_map_init; + +map_init::map_init() +{ + int i; + for (i = 0; i < 256; i++) + map[i] = csalnum(i) ? cmlower(i) : '\0'; + for (i = 0; i < 256; i++) { + if (cslower(i)) { + inv_map[i][0] = i; + inv_map[i][1] = cmupper(i); + inv_map[i][2] = '\0'; + } + else if (csdigit(i)) { + inv_map[i][0] = i; + inv_map[i][1] = 0; + } + else + inv_map[i][0] = '\0'; + } +} + + +class bmpattern { + char *pat; + int len; + int delta[256]; +public: + bmpattern(const char *pattern, int pattern_length); + ~bmpattern(); + const char *search(const char *p, const char *end) const; + int length() const; +}; + +bmpattern::bmpattern(const char *pattern, int pattern_length) +: len(pattern_length) +{ + pat = new char[len]; + int i; + for (i = 0; i < len; i++) + pat[i] = map[uchar(pattern[i])]; + for (i = 0; i < 256; i++) + delta[i] = len; + for (i = 0; i < len; i++) + for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++) + delta[*inv] = len - i - 1; +} + +const char *bmpattern::search(const char *buf, const char *end) const +{ + int buflen = end - buf; + if (len > buflen) + return 0; + const char *strend; + if (buflen > len*4) + strend = end - len*4; + else + strend = buf; + const char *k = buf + len - 1; + const int *del = delta; + const char *pattern = pat; + for (;;) { + while (k < strend) { + int t = del[uchar(*k)]; + if (!t) + break; + k += t; + k += del[uchar(*k)]; + k += del[uchar(*k)]; + } + while (k < end && del[uchar(*k)] != 0) + k++; + if (k == end) + break; + int j = len - 1; + const char *s = k; + for (;;) { + if (j == 0) + return s; + if (map[uchar(*--s)] != uchar(pattern[--j])) + break; + } + k++; + } + return 0; +} + +bmpattern::~bmpattern() +{ + delete[] pat; +} + +inline int bmpattern::length() const +{ + return len; +} + + +static const char *find_end(const char *bufend, const char *p); + +const char *linear_searcher::search_and_check(const bmpattern *key, + const char *buf, const char *bufend, const char **start) const +{ + assert(buf[-1] == '\n'); + assert(bufend[-1] == '\n'); + const char *ptr = buf; + for (;;) { + const char *found = key->search(ptr, bufend); + if (!found) + break; + if (check_match(buf, bufend, found, key->length(), &ptr, start)) + return found; + } + return 0; +} + +static const char *skip_field(const char *end, const char *p) +{ + for (;;) + if (*p++ == '\n') { + if (p == end || *p == '%') + break; + const char *q; + for (q = p; *q == ' ' || *q == '\t'; q++) + ; + if (*q == '\n') + break; + p = q + 1; + } + return p; +} + +static const char *find_end(const char *bufend, const char *p) +{ + for (;;) + if (*p++ == '\n') { + if (p == bufend) + break; + const char *q; + for (q = p; *q == ' ' || *q == '\t'; q++) + ; + if (*q == '\n') + break; + p = q + 1; + } + return p; +} + + +int linear_searcher::check_match(const char *buf, const char *bufend, + const char *match, int matchlen, + const char **cont, const char **start) const +{ + *cont = match + 1; + // The user is required to supply only the first truncate_len characters + // of the key. If truncate_len <= 0, he must supply all the key. + if ((truncate_len <= 0 || matchlen < truncate_len) + && map[uchar(match[matchlen])] != '\0') + return 0; + + // The character before the match must not be an alphanumeric + // character (unless the alphanumeric character follows one or two + // percent characters at the beginning of the line), nor must it be + // a percent character at the beginning of a line, nor a percent + // character following a percent character at the beginning of a + // line. + + switch (match - buf) { + case 0: + break; + case 1: + if (match[-1] == '%' || map[uchar(match[-1])] != '\0') + return 0; + break; + case 2: + if (map[uchar(match[-1])] != '\0' && match[-2] != '%') + return 0; + if (match[-1] == '%' + && (match[-2] == '\n' || match[-2] == '%')) + return 0; + break; + default: + if (map[uchar(match[-1])] != '\0' + && !(match[-2] == '%' + && (match[-3] == '\n' + || (match[-3] == '%' && match[-4] == '\n')))) + return 0; + if (match[-1] == '%' + && (match[-2] == '\n' + || (match[-2] == '%' && match[-3] == '\n'))) + return 0; + } + + const char *p = match; + int had_percent = 0; + for (;;) { + if (*p == '\n') { + if (!had_percent && p[1] == '%') { + if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) { + *cont = skip_field(bufend, match + matchlen); + return 0; + } + if (!start) + break; + had_percent = 1; + } + if (p <= buf) { + if (start) + *start = p + 1; + return 1; + } + const char *q; + for (q = p - 1; *q == ' ' || *q == '\t'; q--) + ; + if (*q == '\n') { + if (start) + *start = p + 1; + break; + } + p = q; + } + p--; + } + return 1; +} + +file_buffer::file_buffer() +: buffer(0), bufend(0) +{ +} + +file_buffer::~file_buffer() +{ + delete[] buffer; +} + +const char *file_buffer::get_start() const +{ + return buffer ? buffer + 4 : 0; +} + +const char *file_buffer::get_end() const +{ + return bufend; +} + +int file_buffer::load(int fd, const char *filename) +{ + struct stat sb; + if (fstat(fd, &sb) < 0) + error("can't fstat '%1': %2", filename, strerror(errno)); + else if (!S_ISREG(sb.st_mode)) + error("'%1' is not a regular file", filename); + else { + // We need one character extra at the beginning for an additional newline + // used as a sentinel. We get 4 instead so that the read buffer will be + // word-aligned. This seems to make the read slightly faster. We also + // need one character at the end also for an additional newline used as a + // sentinel. + int size = int(sb.st_size); + buffer = new char[size + 4 + 1]; + int nread = read(fd, buffer + 4, size); + if (nread < 0) + error("error reading '%1': %2", filename, strerror(errno)); + else if (nread != size) + error("size of '%1' decreased", filename); + else { + char c; + nread = read(fd, &c, 1); + if (nread != 0) + error("size of '%1' increased", filename); + else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0) + error("database '%1' is a binary file", filename); + else { + close(fd); + buffer[3] = '\n'; + int sidx = 4, didx = 4; + for ( ; sidx < size + 4; sidx++, didx++) + { + if (buffer[sidx] == '\r') + { + if (buffer[++sidx] != '\n') + buffer[didx++] = '\r'; + else + size--; + } + if (sidx != didx) + buffer[didx] = buffer[sidx]; + } + bufend = buffer + 4 + size; + if (bufend[-1] != '\n') + *bufend++ = '\n'; + return 1; + } + } + delete[] buffer; + buffer = 0; + } + close(fd); + return 0; +} + +linear_searcher::linear_searcher(const char *query, int query_len, + const char *ign, int trunc) +: ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0) +{ + const char *query_end = query + query_len; + int nk = 0; + const char *p; + for (p = query; p < query_end; p++) + if (map[uchar(*p)] != '\0' + && (p[1] == '\0' || map[uchar(p[1])] == '\0')) + nk++; + if (nk == 0) + return; + keys = new bmpattern*[nk]; + p = query; + for (;;) { + while (p < query_end && map[uchar(*p)] == '\0') + p++; + if (p == query_end) + break; + const char *start = p; + while (p < query_end && map[uchar(*p)] != '\0') + p++; + keys[nkeys++] = new bmpattern(start, p - start); + } + assert(nkeys <= nk); + if (nkeys == 0) { + delete[] keys; + keys = 0; + } +} + +linear_searcher::~linear_searcher() +{ + for (int i = 0; i < nkeys; i++) + delete keys[i]; + delete[] keys; +} + +int linear_searcher::search(const char *buffer, const char *bufend, + const char **startp, int *lengthp) const +{ + assert(bufend - buffer > 0); + assert(buffer[-1] == '\n'); + assert(bufend[-1] == '\n'); + if (nkeys == 0) + return 0; + for (;;) { + const char *refstart; + const char *found = search_and_check(keys[0], buffer, bufend, &refstart); + if (!found) + break; + const char *refend = find_end(bufend, found + keys[0]->length()); + int i; + for (i = 1; i < nkeys; i++) + if (!search_and_check(keys[i], refstart, refend)) + break; + if (i >= nkeys) { + *startp = refstart; + *lengthp = refend - refstart; + return 1; + } + buffer = refend; + } + return 0; +} + +class linear_search_item : public search_item { + file_buffer fbuf; +public: + linear_search_item(const char *filename, int fid); + ~linear_search_item(); + int load(int fd); + search_item_iterator *make_search_item_iterator(const char *); + friend class linear_search_item_iterator; +}; + +class linear_search_item_iterator : public search_item_iterator { + linear_search_item *lsi; + int pos; +public: + linear_search_item_iterator(linear_search_item *, const char *query); + ~linear_search_item_iterator(); + int next(const linear_searcher &, const char **ptr, int *lenp, + reference_id *ridp); +}; + +search_item *make_linear_search_item(int fd, const char *filename, int fid) +{ + linear_search_item *item = new linear_search_item(filename, fid); + if (!item->load(fd)) { + delete item; + return 0; + } + else + return item; +} + +linear_search_item::linear_search_item(const char *filename, int fid) +: search_item(filename, fid) +{ +} + +linear_search_item::~linear_search_item() +{ +} + +int linear_search_item::load(int fd) +{ + return fbuf.load(fd, name); +} + +search_item_iterator *linear_search_item::make_search_item_iterator( + const char *query) +{ + return new linear_search_item_iterator(this, query); +} + +linear_search_item_iterator::linear_search_item_iterator( + linear_search_item *p, const char *) +: lsi(p), pos(0) +{ +} + +linear_search_item_iterator::~linear_search_item_iterator() +{ +} + +int linear_search_item_iterator::next(const linear_searcher &searcher, + const char **startp, int *lengthp, + reference_id *ridp) +{ + const char *bufstart = lsi->fbuf.get_start(); + const char *bufend = lsi->fbuf.get_end(); + const char *ptr = bufstart + pos; + if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) { + pos = *startp + *lengthp - bufstart; + if (ridp) + *ridp = reference_id(lsi->filename_id, *startp - bufstart); + return 1; + } + else + return 0; +} + +// Local Variables: +// fill-column: 72 +// mode: C++ +// End: +// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: |