summaryrefslogtreecommitdiffstats
path: root/script/attr_count_read
blob: 2f6a4d08f361959d3be8c7199b78bce1d08a7ac3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python3
#
# Copyright (C) Catalyst IT Ltd. 2019
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import sys
import argparse
import struct
import os
from collections import OrderedDict, Counter
from pprint import pprint

sys.path.insert(0, "bin/python")
import tdb


def unpack_uint(filename, casefold=True):
    """Read a tdb file whose values are single unsigned int counts.

    Every key is decoded as UTF-8 and every value is unpacked as one
    native unsigned int (struct format "I").

    If casefold is true the keys are lower-cased, and the counts of
    keys that differ only by case are added together.

    Returns a dict mapping the (possibly lower-cased) key to its count.
    """
    db = tdb.Tdb(filename)
    try:
        d = {}
        for k in db:
            v = struct.unpack("I", db[k])[0]
            k2 = k.decode('utf-8')
            if casefold:
                k2 = k2.lower()
            # Several case variants can collapse onto the same key, so
            # accumulate rather than overwrite.
            d[k2] = d.get(k2, 0) + v
    finally:
        # Release the database handle even if a record fails to unpack.
        db.close()
    return d


def unpack_ssize_t_pair(filename, casefold):
    """Read a tdb file whose keys are pairs of ssize_t values.

    Every key is unpacked as two native ssize_t integers (struct
    format "nn") and every value as one native unsigned int ("I").

    casefold is not used here; it is accepted so that this function
    can be called with the same arguments as unpack_uint.

    Returns a list of ((first, second), count) tuples, sorted by
    descending count and, for equal counts, by descending key pair.
    """
    db = tdb.Tdb(filename)
    try:
        pairs = []
        for k in db:
            key = struct.unpack("nn", k)
            v = struct.unpack("I", db[k])[0]
            # Count first, so that a plain tuple sort orders by count.
            pairs.append((v, key))
    finally:
        # Release the database handle even if a record fails to unpack.
        db.close()

    pairs.sort(reverse=True)
    return [(k, v) for (v, k) in pairs]


# The count databases this script knows how to read.  Each entry is
#
#   (name, path relative to LDB_PRIVATE_DIR, unpacker, description)
#
# main() joins the path onto the directory given on the command line
# and calls the unpacker on the result.  Data read with
# unpack_ssize_t_pair is reported as a requested-vs-returned table;
# everything else becomes a column (headed by the name) in the
# per-attribute table.
DATABASES = [
    ('requested', "debug/attr_counts_requested.tdb", unpack_uint,
     "The attribute was specifically requested."),
    ('duplicates', "debug/attr_counts_duplicates.tdb", unpack_uint,
     "Requested more than once in the same request."),
    ('empty request', "debug/attr_counts_empty_req.tdb", unpack_uint,
     "No attributes were requested, but these were returned"),
    ('null request', "debug/attr_counts_null_req.tdb", unpack_uint,
     "The attribute list was NULL and these were returned."),
    ('found', "debug/attr_counts_found.tdb", unpack_uint,
     "The attribute was specifically requested and it was found."),
    ('not found', "debug/attr_counts_not_found.tdb", unpack_uint,
     "The attribute was specifically requested but was not found."),
    ('unwanted', "debug/attr_counts_unwanted.tdb", unpack_uint,
     "The attribute was not requested and it was found."),
    ('star match', "debug/attr_counts_star_match.tdb", unpack_uint,
     'The attribute was not specifically requested but "*" was.'),
    ('req vs found', "debug/attr_counts_req_vs_found.tdb", unpack_ssize_t_pair,
     "How many attributes were requested versus how many were returned."),
]


def plot_pair_data(name, data, doc, lim=90):
    """Show a scatter plot of value pairs, coloured by their count.

    name is used as the plot title.
    data is a sequence of ((x, y), count) tuples.
    doc is not used; it is accepted so the function can be called with
    the same arguments as print_pair_data.
    lim, if true, is an upper bound: pairs in which either value
    exceeds it are reported on stdout and left out of the plot, and
    the title notes how many were excluded.

    If nothing is left to plot, a message is printed and no figure is
    created (matplotlib is not even imported in that case).
    """
    if lim:
        data2 = []
        for p, c in data:
            if p[0] > lim or p[1] > lim:
                print("not plotting %s: %s" % (p, c))
                continue
            data2.append((p, c))
        skipped = len(data) - len(data2)
        if skipped:
            name += " (excluding %d out of range values)" % skipped
            data = data2

    if not data:
        # zip(*data) below cannot be unpacked when data is empty.
        print("nothing to plot for %s" % name)
        return

    # Note we keep the matplotlib import internal to this function for
    # two reasons:
    # 1. Some people won't have matplotlib, but might want to run the
    #    script.
    # 2. The import takes hundreds of milliseconds, which is a
    #    nuisance if you don't want graphs.
    #
    # This plot could be improved!
    import matplotlib.pyplot as plt
    _, ax = plt.subplots()
    xy, counts = zip(*data)
    x, y = zip(*xy)
    ax.set_title(name)
    ax.scatter(x, y, c=counts)
    plt.show()


def print_pair_data(name, data, doc):
    """Print value pairs and their counts as a three column table.

    name and doc are printed on their own lines above the table.
    data is a sequence of ((requested, returned), count) tuples.

    In the "requested" column -2 is shown as NULL and -4 as '*'; any
    other value is printed as it is.
    """
    row_fmt = "%14s | %14s | %14s"
    rule = '-' * 14

    for heading in (name, doc):
        print(heading)
    print(row_fmt % ("requested", "returned", "count"))
    print(row_fmt % (rule, rule, rule))

    for (requested, returned), count in data:
        # Translate the two sentinel values into readable labels.
        shown = ('NULL' if requested == -2
                 else '*' if requested == -4
                 else requested)
        print(row_fmt % (shown, returned, count))


def print_counts(count_data):
    """Print a table of per-attribute counts, one column per database.

    count_data is a sequence of (column name, counts, description)
    tuples, where counts is a dict mapping attribute name to count.

    The output is the number of distinct attributes, then one line
    describing each column, a blank line, and finally the table.  Rows
    are ordered by the attribute's total across all columns, largest
    first.  A column with no count for an attribute is left empty.

    An empty count_data, or one with no attributes at all, prints only
    the headings.
    """
    all_attrs = Counter()
    for _, counts, _ in count_data:
        all_attrs.update(counts)

    print("found %d attrs" % len(all_attrs))
    # The first column has to fit its own heading as well as the
    # longest attribute name; default=0 covers the no-attributes case.
    longest = max(len("attribute"),
                  max((len(a) for a in all_attrs), default=0))

    rows = OrderedDict((a, [a]) for a, _ in all_attrs.most_common())

    for col_name, counts, doc in count_data:
        for attr, row in rows.items():
            row.append(counts.get(attr, ''))

        print("%15s: %s" % (col_name, doc))
    print()

    # Count columns are at least 7 wide, or as wide as their heading.
    widths = [longest] + [max(len(c[0]), 7) for c in count_data]
    t = " | ".join("%{}s".format(w) for w in widths)

    h = t % (("attribute",) + tuple(c[0] for c in count_data))
    print(h)
    print("-" * len(h))

    for row in rows.values():
        print(t % tuple(row))


def main():
    """Read every known count database and print what was found.

    The directory holding the databases is taken from the command
    line.  If it is not a directory, the usage message is printed and
    the script exits with status 1.  A database that cannot be read is
    reported and skipped.

    Pair data is printed first (and plotted before printing if --plot
    was given), followed by a blank line and the per-attribute table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('LDB_PRIVATE_DIR',
                        help="read attr counts in this directory")
    parser.add_argument('--plot', action="store_true",
                        help='attempt to draw graphs')
    parser.add_argument('--no-casefold', action="store_false",
                        default=True, dest="casefold",
                        help='See all the encountered case variants')
    args = parser.parse_args()

    private_dir = args.LDB_PRIVATE_DIR
    if not os.path.isdir(private_dir):
        parser.print_usage()
        sys.exit(1)

    count_data, pair_data = [], []
    for label, relpath, unpacker, doc in DATABASES:
        path = os.path.join(private_dir, relpath)
        try:
            unpacked = unpacker(path, casefold=args.casefold)
        except (RuntimeError, IOError) as e:
            print("could not parse %s: %s" % (path, e))
            continue
        # Pair databases get their own table; the rest share one.
        if unpacker is unpack_ssize_t_pair:
            bucket = pair_data
        else:
            bucket = count_data
        bucket.append((label, unpacked, doc))

    for entry in pair_data:
        if args.plot:
            plot_pair_data(*entry)
        print_pair_data(*entry)

    print()
    print_counts(count_data)

# Only run when executed as a script, so that importing this file (for
# example to reuse the unpack functions) does not parse sys.argv.
if __name__ == "__main__":
    main()