summaryrefslogtreecommitdiffstats
path: root/src/libutil/shingles.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/libutil/shingles.c412
1 files changed, 412 insertions, 0 deletions
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c
new file mode 100644
index 0000000..42d5168
--- /dev/null
+++ b/src/libutil/shingles.c
@@ -0,0 +1,412 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "shingles.h"
+#include "fstring.h"
+#include "cryptobox.h"
+#include "images.h"
+#include "libstat/stat_api.h"
+
+#define SHINGLES_WINDOW 3
+#define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES
+
+static guint
+rspamd_shingles_keys_hash(gconstpointer k)
+{
+ return rspamd_cryptobox_fast_hash(k, SHINGLES_KEY_SIZE,
+ rspamd_hash_seed());
+}
+
+static gboolean
+rspamd_shingles_keys_equal(gconstpointer k1, gconstpointer k2)
+{
+ return (memcmp(k1, k2, SHINGLES_KEY_SIZE) == 0);
+}
+
+static void
+rspamd_shingles_keys_free(gpointer p)
+{
+ guchar **k = p;
+ guint i;
+
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ g_free(k[i]);
+ }
+
+ g_free(k);
+}
+
+static guchar **
+rspamd_shingles_keys_new(void)
+{
+ guchar **k;
+ guint i;
+
+ k = g_malloc0(sizeof(guchar *) * RSPAMD_SHINGLE_SIZE);
+
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ k[i] = g_malloc0(sizeof(guchar) * SHINGLES_KEY_SIZE);
+ }
+
+ return k;
+}
+
+static guchar **
+rspamd_shingles_get_keys_cached(const guchar key[SHINGLES_KEY_SIZE])
+{
+ static GHashTable *ht = NULL;
+ guchar **keys = NULL, *key_cpy;
+ rspamd_cryptobox_hash_state_t bs;
+ const guchar *cur_key;
+ guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
+ guint i;
+
+ if (ht == NULL) {
+ ht = g_hash_table_new_full(rspamd_shingles_keys_hash,
+ rspamd_shingles_keys_equal, g_free, rspamd_shingles_keys_free);
+ }
+ else {
+ keys = g_hash_table_lookup(ht, key);
+ }
+
+ if (keys == NULL) {
+ keys = rspamd_shingles_keys_new();
+ key_cpy = g_malloc(SHINGLES_KEY_SIZE);
+ memcpy(key_cpy, key, SHINGLES_KEY_SIZE);
+
+ /* Generate keys */
+ rspamd_cryptobox_hash_init(&bs, NULL, 0);
+ cur_key = key;
+
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ /*
+ * To generate a set of hashes we just apply sha256 to the
+ * initial key as many times as many hashes are required and
+ * xor left and right parts of sha256 to get a single 16 bytes SIP key.
+ */
+ out_key = keys[i];
+ rspamd_cryptobox_hash_update(&bs, cur_key, 16);
+ rspamd_cryptobox_hash_final(&bs, shabuf);
+
+ memcpy(out_key, shabuf, 16);
+ rspamd_cryptobox_hash_init(&bs, NULL, 0);
+ cur_key = out_key;
+ }
+
+ g_hash_table_insert(ht, key_cpy, keys);
+ }
+
+ return keys;
+}
+
+struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
+ rspamd_shingles_from_text(GArray *input,
+ const guchar key[16],
+ rspamd_mempool_t *pool,
+ rspamd_shingles_filter filter,
+ gpointer filterd,
+ enum rspamd_shingle_alg alg)
+{
+ struct rspamd_shingle *res;
+ guint64 **hashes;
+ guchar **keys;
+ rspamd_fstring_t *row;
+ rspamd_stat_token_t *word;
+ guint64 val;
+ gint i, j, k;
+ gsize hlen, ilen = 0, beg = 0, widx = 0;
+ enum rspamd_cryptobox_fast_hash_type ht;
+
+ if (pool != NULL) {
+ res = rspamd_mempool_alloc(pool, sizeof(*res));
+ }
+ else {
+ res = g_malloc(sizeof(*res));
+ }
+
+ row = rspamd_fstring_sized_new(256);
+
+ for (i = 0; i < input->len; i++) {
+ word = &g_array_index(input, rspamd_stat_token_t, i);
+
+ if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) {
+ ilen++;
+ }
+ }
+
+ /* Init hashes pipes and keys */
+ hashes = g_malloc(sizeof(*hashes) * RSPAMD_SHINGLE_SIZE);
+ hlen = ilen > SHINGLES_WINDOW ? (ilen - SHINGLES_WINDOW + 1) : 1;
+ keys = rspamd_shingles_get_keys_cached(key);
+
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ hashes[i] = g_malloc(hlen * sizeof(guint64));
+ }
+
+ /* Now parse input words into a vector of hashes using rolling window */
+ if (alg == RSPAMD_SHINGLES_OLD) {
+ for (i = 0; i <= (gint) ilen; i++) {
+ if (i - beg >= SHINGLES_WINDOW || i == (gint) ilen) {
+ for (j = beg; j < i; j++) {
+
+ word = NULL;
+ while (widx < input->len) {
+ word = &g_array_index(input, rspamd_stat_token_t, widx);
+
+ if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
+ widx++;
+ }
+ else {
+ break;
+ }
+ }
+
+ if (word == NULL) {
+ /* Nothing but exceptions */
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ g_free(hashes[i]);
+ }
+
+ g_free(hashes);
+
+ if (pool == NULL) {
+ g_free(res);
+ }
+
+ rspamd_fstring_free(row);
+
+ return NULL;
+ }
+
+ row = rspamd_fstring_append(row, word->stemmed.begin,
+ word->stemmed.len);
+ }
+
+ /* Now we need to create a new row here */
+ for (j = 0; j < RSPAMD_SHINGLE_SIZE; j++) {
+ rspamd_cryptobox_siphash((guchar *) &val, row->str, row->len,
+ keys[j]);
+ g_assert(hlen > beg);
+ hashes[j][beg] = val;
+ }
+
+ beg++;
+ widx++;
+
+ row = rspamd_fstring_assign(row, "", 0);
+ }
+ }
+ }
+ else {
+ guint64 window[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
+
+ switch (alg) {
+ case RSPAMD_SHINGLES_XXHASH:
+ ht = RSPAMD_CRYPTOBOX_XXHASH64;
+ break;
+ case RSPAMD_SHINGLES_MUMHASH:
+ ht = RSPAMD_CRYPTOBOX_MUMHASH;
+ break;
+ default:
+ ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
+ break;
+ }
+
+ memset(window, 0, sizeof(window));
+ for (i = 0; i <= ilen; i++) {
+ if (i - beg >= SHINGLES_WINDOW || i == ilen) {
+
+ for (j = 0; j < RSPAMD_SHINGLE_SIZE; j++) {
+ /* Shift hashes window to right */
+ for (k = 0; k < SHINGLES_WINDOW - 1; k++) {
+ window[j * SHINGLES_WINDOW + k] =
+ window[j * SHINGLES_WINDOW + k + 1];
+ }
+
+ word = NULL;
+
+ while (widx < input->len) {
+ word = &g_array_index(input, rspamd_stat_token_t, widx);
+
+ if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
+ widx++;
+ }
+ else {
+ break;
+ }
+ }
+
+ if (word == NULL) {
+ /* Nothing but exceptions */
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ g_free(hashes[i]);
+ }
+
+ if (pool == NULL) {
+ g_free(res);
+ }
+
+ g_free(hashes);
+ rspamd_fstring_free(row);
+
+ return NULL;
+ }
+
+ /* Insert the last element to the pipe */
+ memcpy(&seed, keys[j], sizeof(seed));
+ window[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
+ rspamd_cryptobox_fast_hash_specific(ht,
+ word->stemmed.begin, word->stemmed.len,
+ seed);
+ val = 0;
+ for (k = 0; k < SHINGLES_WINDOW; k++) {
+ val ^= window[j * SHINGLES_WINDOW + k] >>
+ (8 * (SHINGLES_WINDOW - k - 1));
+ }
+
+ g_assert(hlen > beg);
+ hashes[j][beg] = val;
+ }
+
+ beg++;
+ widx++;
+ }
+ }
+ }
+
+ /* Now we need to filter all hashes and make a shingles result */
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ res->hashes[i] = filter(hashes[i], hlen,
+ i, key, filterd);
+ g_free(hashes[i]);
+ }
+
+ g_free(hashes);
+
+ rspamd_fstring_free(row);
+
+ return res;
+}
+
+struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
+ rspamd_shingles_from_image(guchar *dct,
+ const guchar key[16],
+ rspamd_mempool_t *pool,
+ rspamd_shingles_filter filter,
+ gpointer filterd,
+ enum rspamd_shingle_alg alg)
+{
+ struct rspamd_shingle *shingle;
+ guint64 **hashes;
+ guchar **keys;
+ guint64 d;
+ guint64 val;
+ gint i, j;
+ gsize hlen, beg = 0;
+ enum rspamd_cryptobox_fast_hash_type ht;
+ guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
+
+ if (pool != NULL) {
+ shingle = rspamd_mempool_alloc(pool, sizeof(*shingle));
+ }
+ else {
+ shingle = g_malloc(sizeof(*shingle));
+ }
+
+ /* Init hashes pipes and keys */
+ hashes = g_malloc(sizeof(*hashes) * RSPAMD_SHINGLE_SIZE);
+ hlen = RSPAMD_DCT_LEN / NBBY + 1;
+ keys = rspamd_shingles_get_keys_cached(key);
+
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ hashes[i] = g_malloc(hlen * sizeof(guint64));
+ }
+
+ switch (alg) {
+ case RSPAMD_SHINGLES_OLD:
+ ht = RSPAMD_CRYPTOBOX_MUMHASH;
+ break;
+ case RSPAMD_SHINGLES_XXHASH:
+ ht = RSPAMD_CRYPTOBOX_XXHASH64;
+ break;
+ case RSPAMD_SHINGLES_MUMHASH:
+ ht = RSPAMD_CRYPTOBOX_MUMHASH;
+ break;
+ default:
+ ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
+ break;
+ }
+
+ memset(res, 0, sizeof(res));
+#define INNER_CYCLE_SHINGLES(s, e) \
+ for (j = (s); j < (e); j++) { \
+ d = dct[beg]; \
+ memcpy(&seed, keys[j], sizeof(seed)); \
+ val = rspamd_cryptobox_fast_hash_specific(ht, \
+ &d, sizeof(d), \
+ seed); \
+ hashes[j][beg] = val; \
+ }
+ for (i = 0; i < RSPAMD_DCT_LEN / NBBY; i++) {
+ INNER_CYCLE_SHINGLES(0, RSPAMD_SHINGLE_SIZE / 4);
+ INNER_CYCLE_SHINGLES(RSPAMD_SHINGLE_SIZE / 4, RSPAMD_SHINGLE_SIZE / 2);
+ INNER_CYCLE_SHINGLES(RSPAMD_SHINGLE_SIZE / 2, 3 * RSPAMD_SHINGLE_SIZE / 4);
+ INNER_CYCLE_SHINGLES(3 * RSPAMD_SHINGLE_SIZE / 4, RSPAMD_SHINGLE_SIZE);
+
+ beg++;
+ }
+#undef INNER_CYCLE_SHINGLES
+ /* Now we need to filter all hashes and make a shingles result */
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ shingle->hashes[i] = filter(hashes[i], hlen,
+ i, key, filterd);
+ g_free(hashes[i]);
+ }
+
+ g_free(hashes);
+
+ return shingle;
+}
+
+guint64
+rspamd_shingles_default_filter(guint64 *input, gsize count,
+ gint shno, const guchar *key, gpointer ud)
+{
+ guint64 minimal = G_MAXUINT64;
+ gsize i;
+
+ for (i = 0; i < count; i++) {
+ if (minimal > input[i]) {
+ minimal = input[i];
+ }
+ }
+
+ return minimal;
+}
+
+
+gdouble rspamd_shingles_compare(const struct rspamd_shingle *a,
+ const struct rspamd_shingle *b)
+{
+ gint i, common = 0;
+
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
+ if (a->hashes[i] == b->hashes[i]) {
+ common++;
+ }
+ }
+
+ return (gdouble) common / (gdouble) RSPAMD_SHINGLE_SIZE;
+}