summaryrefslogtreecommitdiffstats
path: root/third_party/rust/rust_cascade/test_data/make-sample-data.py
blob: c9f117da5fdf04ca715cdd574d5c05cf2c8aef49 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import filtercascade
import hashlib
from pathlib import Path

import sys
import logging

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


def predictable_serial_gen(start, end):
    counter = start
    while counter < end:
        counter += 1
        m = hashlib.sha256()
        m.update(counter.to_bytes(4, byteorder="big"))
        yield m.hexdigest()


def store(fc, path):
    if path.exists():
        path.unlink()
    with open(path, "wb") as f:
        fc.tofile(f)


small_set = list(set(predictable_serial_gen(0, 500)))
large_set = set(predictable_serial_gen(500, 10_000))

# filter parameters
growth_factor = 1.0
min_filter_length = 177  # 177 * 1.44 ~ 256, so smallest filter will have 256 bits

print("--- v2_sha256l32_with_salt ---")
v2_sha256l32_with_salt = filtercascade.FilterCascade(
    [],
    defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
    salt=b"nacl",
    growth_factor=growth_factor,
    min_filter_length=min_filter_length,
)
v2_sha256l32_with_salt.initialize(
    include=[b"this", b"that"] + small_set, exclude=large_set | set([b"other"])
)
store(v2_sha256l32_with_salt, Path("test_v2_sha256l32_salt_mlbf"))

print("--- v2_sha256l32 ---")
v2_sha256l32 = filtercascade.FilterCascade(
    [],
    defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
    growth_factor=growth_factor,
    min_filter_length=min_filter_length,
)
v2_sha256l32.initialize(
    include=[b"this", b"that"] + small_set, exclude=large_set | set([b"other"])
)
store(v2_sha256l32, Path("test_v2_sha256l32_mlbf"))

print("--- v2_murmur ---")
v2_murmur = filtercascade.FilterCascade(
    [],
    defaultHashAlg=filtercascade.fileformats.HashAlgorithm.MURMUR3,
    growth_factor=growth_factor,
    min_filter_length=min_filter_length,
)
v2_murmur.initialize(
    include=[b"this", b"that"] + small_set, exclude=large_set | set([b"other"])
)
store(v2_murmur, Path("test_v2_murmur_mlbf"))

print("--- v2_murmur_inverted ---")
v2_murmur_inverted = filtercascade.FilterCascade(
    [],
    defaultHashAlg=filtercascade.fileformats.HashAlgorithm.MURMUR3,
    growth_factor=growth_factor,
    min_filter_length=min_filter_length,
)
v2_murmur_inverted.initialize(
    include=large_set | set([b"this", b"that"]), exclude=[b"other"] + small_set
)
store(v2_murmur_inverted, Path("test_v2_murmur_inverted_mlbf"))

print("--- v2_sha256l32_inverted ---")
v2_sha256l32_inverted = filtercascade.FilterCascade(
    [],
    defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
    growth_factor=growth_factor,
    min_filter_length=min_filter_length,
)
v2_sha256l32_inverted.initialize(
    include=large_set | set([b"this", b"that"]), exclude=[b"other"] + small_set
)
store(v2_sha256l32_inverted, Path("test_v2_sha256l32_inverted_mlbf"))

print("--- v2_sha256ctr_with_salt ---")
v2_sha256ctr_with_salt = filtercascade.FilterCascade(
    [],
    defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256CTR,
    salt=b"nacl",
    growth_factor=growth_factor,
    min_filter_length=min_filter_length,
)
v2_sha256ctr_with_salt.initialize(
    include=[b"this", b"that"] + small_set, exclude=large_set | set([b"other"])
)
store(v2_sha256ctr_with_salt, Path("test_v2_sha256ctr_salt_mlbf"))