summaryrefslogtreecommitdiffstats
path: root/src/libs/libbib/linear.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/libs/libbib/linear.cpp')
-rw-r--r--src/libs/libbib/linear.cpp506
1 files changed, 506 insertions, 0 deletions
diff --git a/src/libs/libbib/linear.cpp b/src/libs/libbib/linear.cpp
new file mode 100644
index 0000000..22e11d8
--- /dev/null
+++ b/src/libs/libbib/linear.cpp
@@ -0,0 +1,506 @@
+/* Copyright (C) 1989-2020 Free Software Foundation, Inc.
+ Written by James Clark (jjc@jclark.com)
+
+This file is part of groff.
+
+groff is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+groff is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "lib.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "posix.h"
+#include "errarg.h"
+#include "error.h"
+#include "cset.h"
+#include "cmap.h"
+#include "nonposix.h"
+
+#include "refid.h"
+#include "search.h"
+
+class file_buffer {
+ char *buffer;
+ char *bufend;
+public:
+ file_buffer();
+ ~file_buffer();
+ int load(int fd, const char *filename);
+ const char *get_start() const;
+ const char *get_end() const;
+};
+
+typedef unsigned char uchar;
+
+static uchar map[256];
+static uchar inv_map[256][3];
+
+struct map_init {
+ map_init();
+};
+
+static map_init the_map_init;
+
+map_init::map_init()
+{
+ int i;
+ for (i = 0; i < 256; i++)
+ map[i] = csalnum(i) ? cmlower(i) : '\0';
+ for (i = 0; i < 256; i++) {
+ if (cslower(i)) {
+ inv_map[i][0] = i;
+ inv_map[i][1] = cmupper(i);
+ inv_map[i][2] = '\0';
+ }
+ else if (csdigit(i)) {
+ inv_map[i][0] = i;
+ inv_map[i][1] = 0;
+ }
+ else
+ inv_map[i][0] = '\0';
+ }
+}
+
+
+class bmpattern {
+ char *pat;
+ int len;
+ int delta[256];
+public:
+ bmpattern(const char *pattern, int pattern_length);
+ ~bmpattern();
+ const char *search(const char *p, const char *end) const;
+ int length() const;
+};
+
+bmpattern::bmpattern(const char *pattern, int pattern_length)
+: len(pattern_length)
+{
+ pat = new char[len];
+ int i;
+ for (i = 0; i < len; i++)
+ pat[i] = map[uchar(pattern[i])];
+ for (i = 0; i < 256; i++)
+ delta[i] = len;
+ for (i = 0; i < len; i++)
+ for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
+ delta[*inv] = len - i - 1;
+}
+
+const char *bmpattern::search(const char *buf, const char *end) const
+{
+ int buflen = end - buf;
+ if (len > buflen)
+ return 0;
+ const char *strend;
+ if (buflen > len*4)
+ strend = end - len*4;
+ else
+ strend = buf;
+ const char *k = buf + len - 1;
+ const int *del = delta;
+ const char *pattern = pat;
+ for (;;) {
+ while (k < strend) {
+ int t = del[uchar(*k)];
+ if (!t)
+ break;
+ k += t;
+ k += del[uchar(*k)];
+ k += del[uchar(*k)];
+ }
+ while (k < end && del[uchar(*k)] != 0)
+ k++;
+ if (k == end)
+ break;
+ int j = len - 1;
+ const char *s = k;
+ for (;;) {
+ if (j == 0)
+ return s;
+ if (map[uchar(*--s)] != uchar(pattern[--j]))
+ break;
+ }
+ k++;
+ }
+ return 0;
+}
+
+bmpattern::~bmpattern()
+{
+ delete[] pat;
+}
+
+inline int bmpattern::length() const
+{
+ return len;
+}
+
+
+static const char *find_end(const char *bufend, const char *p);
+
+const char *linear_searcher::search_and_check(const bmpattern *key,
+ const char *buf, const char *bufend, const char **start) const
+{
+ assert(buf[-1] == '\n');
+ assert(bufend[-1] == '\n');
+ const char *ptr = buf;
+ for (;;) {
+ const char *found = key->search(ptr, bufend);
+ if (!found)
+ break;
+ if (check_match(buf, bufend, found, key->length(), &ptr, start))
+ return found;
+ }
+ return 0;
+}
+
+static const char *skip_field(const char *end, const char *p)
+{
+ for (;;)
+ if (*p++ == '\n') {
+ if (p == end || *p == '%')
+ break;
+ const char *q;
+ for (q = p; *q == ' ' || *q == '\t'; q++)
+ ;
+ if (*q == '\n')
+ break;
+ p = q + 1;
+ }
+ return p;
+}
+
+static const char *find_end(const char *bufend, const char *p)
+{
+ for (;;)
+ if (*p++ == '\n') {
+ if (p == bufend)
+ break;
+ const char *q;
+ for (q = p; *q == ' ' || *q == '\t'; q++)
+ ;
+ if (*q == '\n')
+ break;
+ p = q + 1;
+ }
+ return p;
+}
+
+
+int linear_searcher::check_match(const char *buf, const char *bufend,
+ const char *match, int matchlen,
+ const char **cont, const char **start) const
+{
+ *cont = match + 1;
+ // The user is required to supply only the first truncate_len characters
+ // of the key. If truncate_len <= 0, he must supply all the key.
+ if ((truncate_len <= 0 || matchlen < truncate_len)
+ && map[uchar(match[matchlen])] != '\0')
+ return 0;
+
+ // The character before the match must not be an alphanumeric
+ // character (unless the alphanumeric character follows one or two
+ // percent characters at the beginning of the line), nor must it be
+ // a percent character at the beginning of a line, nor a percent
+ // character following a percent character at the beginning of a
+ // line.
+
+ switch (match - buf) {
+ case 0:
+ break;
+ case 1:
+ if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
+ return 0;
+ break;
+ case 2:
+ if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
+ return 0;
+ if (match[-1] == '%'
+ && (match[-2] == '\n' || match[-2] == '%'))
+ return 0;
+ break;
+ default:
+ if (map[uchar(match[-1])] != '\0'
+ && !(match[-2] == '%'
+ && (match[-3] == '\n'
+ || (match[-3] == '%' && match[-4] == '\n'))))
+ return 0;
+ if (match[-1] == '%'
+ && (match[-2] == '\n'
+ || (match[-2] == '%' && match[-3] == '\n')))
+ return 0;
+ }
+
+ const char *p = match;
+ int had_percent = 0;
+ for (;;) {
+ if (*p == '\n') {
+ if (!had_percent && p[1] == '%') {
+ if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
+ *cont = skip_field(bufend, match + matchlen);
+ return 0;
+ }
+ if (!start)
+ break;
+ had_percent = 1;
+ }
+ if (p <= buf) {
+ if (start)
+ *start = p + 1;
+ return 1;
+ }
+ const char *q;
+ for (q = p - 1; *q == ' ' || *q == '\t'; q--)
+ ;
+ if (*q == '\n') {
+ if (start)
+ *start = p + 1;
+ break;
+ }
+ p = q;
+ }
+ p--;
+ }
+ return 1;
+}
+
+file_buffer::file_buffer()
+: buffer(0), bufend(0)
+{
+}
+
+file_buffer::~file_buffer()
+{
+ delete[] buffer;
+}
+
+const char *file_buffer::get_start() const
+{
+ return buffer ? buffer + 4 : 0;
+}
+
+const char *file_buffer::get_end() const
+{
+ return bufend;
+}
+
+int file_buffer::load(int fd, const char *filename)
+{
+ struct stat sb;
+ if (fstat(fd, &sb) < 0)
+ error("can't fstat '%1': %2", filename, strerror(errno));
+ else if (!S_ISREG(sb.st_mode))
+ error("'%1' is not a regular file", filename);
+ else {
+ // We need one character extra at the beginning for an additional newline
+ // used as a sentinel. We get 4 instead so that the read buffer will be
+ // word-aligned. This seems to make the read slightly faster. We also
+ // need one character at the end also for an additional newline used as a
+ // sentinel.
+ int size = int(sb.st_size);
+ buffer = new char[size + 4 + 1];
+ int nread = read(fd, buffer + 4, size);
+ if (nread < 0)
+ error("error reading '%1': %2", filename, strerror(errno));
+ else if (nread != size)
+ error("size of '%1' decreased", filename);
+ else {
+ char c;
+ nread = read(fd, &c, 1);
+ if (nread != 0)
+ error("size of '%1' increased", filename);
+ else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
+ error("database '%1' is a binary file", filename);
+ else {
+ close(fd);
+ buffer[3] = '\n';
+ int sidx = 4, didx = 4;
+ for ( ; sidx < size + 4; sidx++, didx++)
+ {
+ if (buffer[sidx] == '\r')
+ {
+ if (buffer[++sidx] != '\n')
+ buffer[didx++] = '\r';
+ else
+ size--;
+ }
+ if (sidx != didx)
+ buffer[didx] = buffer[sidx];
+ }
+ bufend = buffer + 4 + size;
+ if (bufend[-1] != '\n')
+ *bufend++ = '\n';
+ return 1;
+ }
+ }
+ delete[] buffer;
+ buffer = 0;
+ }
+ close(fd);
+ return 0;
+}
+
+linear_searcher::linear_searcher(const char *query, int query_len,
+ const char *ign, int trunc)
+: ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0)
+{
+ const char *query_end = query + query_len;
+ int nk = 0;
+ const char *p;
+ for (p = query; p < query_end; p++)
+ if (map[uchar(*p)] != '\0'
+ && (p[1] == '\0' || map[uchar(p[1])] == '\0'))
+ nk++;
+ if (nk == 0)
+ return;
+ keys = new bmpattern*[nk];
+ p = query;
+ for (;;) {
+ while (p < query_end && map[uchar(*p)] == '\0')
+ p++;
+ if (p == query_end)
+ break;
+ const char *start = p;
+ while (p < query_end && map[uchar(*p)] != '\0')
+ p++;
+ keys[nkeys++] = new bmpattern(start, p - start);
+ }
+ assert(nkeys <= nk);
+ if (nkeys == 0) {
+ delete[] keys;
+ keys = 0;
+ }
+}
+
+linear_searcher::~linear_searcher()
+{
+ for (int i = 0; i < nkeys; i++)
+ delete keys[i];
+ delete[] keys;
+}
+
+int linear_searcher::search(const char *buffer, const char *bufend,
+ const char **startp, int *lengthp) const
+{
+ assert(bufend - buffer > 0);
+ assert(buffer[-1] == '\n');
+ assert(bufend[-1] == '\n');
+ if (nkeys == 0)
+ return 0;
+ for (;;) {
+ const char *refstart;
+ const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
+ if (!found)
+ break;
+ const char *refend = find_end(bufend, found + keys[0]->length());
+ int i;
+ for (i = 1; i < nkeys; i++)
+ if (!search_and_check(keys[i], refstart, refend))
+ break;
+ if (i >= nkeys) {
+ *startp = refstart;
+ *lengthp = refend - refstart;
+ return 1;
+ }
+ buffer = refend;
+ }
+ return 0;
+}
+
+class linear_search_item : public search_item {
+ file_buffer fbuf;
+public:
+ linear_search_item(const char *filename, int fid);
+ ~linear_search_item();
+ int load(int fd);
+ search_item_iterator *make_search_item_iterator(const char *);
+ friend class linear_search_item_iterator;
+};
+
+class linear_search_item_iterator : public search_item_iterator {
+ linear_search_item *lsi;
+ int pos;
+public:
+ linear_search_item_iterator(linear_search_item *, const char *query);
+ ~linear_search_item_iterator();
+ int next(const linear_searcher &, const char **ptr, int *lenp,
+ reference_id *ridp);
+};
+
+search_item *make_linear_search_item(int fd, const char *filename, int fid)
+{
+ linear_search_item *item = new linear_search_item(filename, fid);
+ if (!item->load(fd)) {
+ delete item;
+ return 0;
+ }
+ else
+ return item;
+}
+
+linear_search_item::linear_search_item(const char *filename, int fid)
+: search_item(filename, fid)
+{
+}
+
+linear_search_item::~linear_search_item()
+{
+}
+
+int linear_search_item::load(int fd)
+{
+ return fbuf.load(fd, name);
+}
+
+search_item_iterator *linear_search_item::make_search_item_iterator(
+ const char *query)
+{
+ return new linear_search_item_iterator(this, query);
+}
+
+linear_search_item_iterator::linear_search_item_iterator(
+ linear_search_item *p, const char *)
+: lsi(p), pos(0)
+{
+}
+
+linear_search_item_iterator::~linear_search_item_iterator()
+{
+}
+
+int linear_search_item_iterator::next(const linear_searcher &searcher,
+ const char **startp, int *lengthp,
+ reference_id *ridp)
+{
+ const char *bufstart = lsi->fbuf.get_start();
+ const char *bufend = lsi->fbuf.get_end();
+ const char *ptr = bufstart + pos;
+ if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
+ pos = *startp + *lengthp - bufstart;
+ if (ridp)
+ *ridp = reference_id(lsi->filename_id, *startp - bufstart);
+ return 1;
+ }
+ else
+ return 0;
+}
+
+// Local Variables:
+// fill-column: 72
+// mode: C++
+// End:
+// vim: set cindent noexpandtab shiftwidth=2 textwidth=72: