summaryrefslogtreecommitdiffstats
path: root/src/plugins/fts-squat/squat-test.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/plugins/fts-squat/squat-test.c197
1 files changed, 197 insertions, 0 deletions
diff --git a/src/plugins/fts-squat/squat-test.c b/src/plugins/fts-squat/squat-test.c
new file mode 100644
index 0000000..b55646c
--- /dev/null
+++ b/src/plugins/fts-squat/squat-test.c
@@ -0,0 +1,197 @@
+/* Copyright (c) 2006-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "file-lock.h"
+#include "istream.h"
+#include "time-util.h"
+#include "unichar.h"
+#include "squat-trie.h"
+#include "squat-uidlist.h"
+
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/time.h>
+
+static void result_print(ARRAY_TYPE(seq_range) *result)
+{
+ const struct seq_range *range;
+ unsigned int i, count;
+
+ range = array_get(result, &count);
+ for (i = 0; i < count; i++) {
+ if (i != 0)
+ printf(",");
+ printf("%u", range[i].seq1);
+ if (range[i].seq1 != range[i].seq2)
+ printf("-%u", range[i].seq2);
+ }
+ printf("\n");
+}
+
+int main(int argc ATTR_UNUSED, char *argv[])
+{
+ const char *trie_path = "/tmp/squat-test-index.search";
+ const char *uidlist_path = "/tmp/squat-test-index.search.uids";
+ struct squat_trie *trie;
+ struct squat_trie_build_context *build_ctx;
+ struct istream *input;
+ struct stat trie_st, uidlist_st;
+ ARRAY_TYPE(seq_range) definite_uids, maybe_uids;
+ char *line, *str, buf[4096];
+ buffer_t *valid;
+ int ret, fd;
+ unsigned int last = 0, seq = 1, node_count, uidlist_count;
+ size_t len;
+ enum squat_index_type index_type;
+ bool data_header = TRUE, first = TRUE, skip_body = FALSE;
+ bool mime_header = TRUE;
+ size_t trie_mem, uidlist_mem;
+ clock_t clock_start, clock_end;
+ struct timeval tv_start, tv_end;
+ double cputime;
+
+ lib_init();
+ i_unlink_if_exists(trie_path);
+ i_unlink_if_exists(uidlist_path);
+ trie = squat_trie_init(trie_path, time(NULL),
+ FILE_LOCK_METHOD_FCNTL, 0, 0600, (gid_t)-1);
+
+ clock_start = clock();
+ i_gettimeofday(&tv_start);
+
+ fd = open(argv[1], O_RDONLY);
+ if (fd == -1)
+ return 1;
+
+ if (squat_trie_build_init(trie, &build_ctx) < 0)
+ return 1;
+
+ valid = buffer_create_dynamic(default_pool, 4096);
+ input = i_stream_create_fd(fd, SIZE_MAX);
+ ret = 0;
+ while (ret == 0 && (line = i_stream_read_next_line(input)) != NULL) {
+ if (last != input->v_offset/(1024*100)) {
+ fprintf(stderr, "\r%ukB", (unsigned)(input->v_offset/1024));
+ fflush(stderr);
+ last = input->v_offset/(1024*100);
+ }
+ if (str_begins(line, "From ")) {
+ if (!first)
+ seq++;
+ data_header = TRUE;
+ skip_body = FALSE;
+ mime_header = TRUE;
+ continue;
+ }
+ first = FALSE;
+
+ if (str_begins(line, "--")) {
+ skip_body = FALSE;
+ mime_header = TRUE;
+ }
+
+ if (mime_header) {
+ if (*line == '\0') {
+ data_header = FALSE;
+ mime_header = FALSE;
+ continue;
+ }
+
+ if (strncasecmp(line, "Content-Type:", 13) == 0 &&
+ strncasecmp(line, "Content-Type: text/", 19) != 0 &&
+ strncasecmp(line, "Content-Type: message/", 22) != 0)
+ skip_body = TRUE;
+ else if (strncasecmp(line, "Content-Transfer-Encoding: base64", 33) == 0)
+ skip_body = TRUE;
+ } else if (skip_body)
+ continue;
+ if (*line == '\0')
+ continue;
+
+ /* we're actually indexing here headers as bodies and bodies
+ as headers. it doesn't really matter in this test, and
+ fixing it would require storing headers temporarily
+ elsewhere and index them only after the body */
+ index_type = !data_header ? SQUAT_INDEX_TYPE_HEADER :
+ SQUAT_INDEX_TYPE_BODY;
+
+ buffer_set_used_size(valid, 0);
+ len = strlen(line);
+ if (uni_utf8_get_valid_data((const unsigned char *)line,
+ len, valid)) {
+ ret = squat_trie_build_more(build_ctx, seq, index_type,
+ (const void *)line, len);
+ } else if (valid->used > 0) {
+ ret = squat_trie_build_more(build_ctx, seq, index_type,
+ valid->data, valid->used);
+ }
+ }
+ buffer_free(&valid);
+ if (squat_trie_build_deinit(&build_ctx, NULL) < 0)
+ ret = -1;
+ if (ret < 0) {
+ printf("build broken\n");
+ return 1;
+ }
+
+ clock_end = clock();
+ i_gettimeofday(&tv_end);
+
+ cputime = (double)(clock_end - clock_start) / CLOCKS_PER_SEC;
+ fprintf(stderr, "\n - Index time: %.2f CPU seconds, "
+ "%.2f real seconds (%.02fMB/CPUs)\n", cputime,
+ timeval_diff_msecs(&tv_end, &tv_start)/1000.0,
+ input->v_offset / cputime / (1024*1024));
+
+ if (stat(trie_path, &trie_st) < 0)
+ i_error("stat(%s) failed: %m", trie_path);
+ if (stat(uidlist_path, &uidlist_st) < 0)
+ i_error("stat(%s) failed: %m", uidlist_path);
+
+ trie_mem = squat_trie_mem_used(trie, &node_count);
+ uidlist_mem = squat_uidlist_mem_used(squat_trie_get_uidlist(trie),
+ &uidlist_count);
+ fprintf(stderr, " - memory: %uk for trie, %uk for uidlist\n",
+ (unsigned)(trie_mem/1024), (unsigned)(uidlist_mem/1024));
+ fprintf(stderr, " - %"PRIuUOFF_T" bytes in %u nodes (%.02f%%)\n",
+ trie_st.st_size, node_count,
+ trie_st.st_size / (float)input->v_offset * 100.0);
+ fprintf(stderr, " - %"PRIuUOFF_T" bytes in %u UID lists (%.02f%%)\n",
+ uidlist_st.st_size, uidlist_count,
+ uidlist_st.st_size / (float)input->v_offset * 100.0);
+ fprintf(stderr, " - %"PRIuUOFF_T" bytes total of %"
+ PRIuUOFF_T" (%.02f%%)\n",
+ (trie_st.st_size + uidlist_st.st_size), input->v_offset,
+ (trie_st.st_size + uidlist_st.st_size) /
+ (float)input->v_offset * 100.0);
+
+ i_stream_unref(&input);
+ i_close_fd(&fd);
+
+ i_array_init(&definite_uids, 128);
+ i_array_init(&maybe_uids, 128);
+ while ((str = fgets(buf, sizeof(buf), stdin)) != NULL) {
+ ret = strlen(str)-1;
+ str[ret] = 0;
+
+ i_gettimeofday(&tv_start);
+ ret = squat_trie_lookup(trie, str, SQUAT_INDEX_TYPE_HEADER |
+ SQUAT_INDEX_TYPE_BODY,
+ &definite_uids, &maybe_uids);
+ if (ret < 0)
+ printf("error\n");
+ else {
+ i_gettimeofday(&tv_end);
+ printf(" - Search took %.05f CPU seconds\n",
+ timeval_diff_usecs(&tv_end, &tv_start)/1000000.0);
+ printf(" - definite uids: ");
+ result_print(&definite_uids);
+ printf(" - maybe uids: ");
+ result_print(&maybe_uids);
+ }
+ }
+ return 0;
+}