source3/script/samba-log-parser


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382

#!/usr/bin/env python3
#
#######################################################################
#
# A script to parse samba (especially winbind) logfiles.
# Trace files should be in a non-syslog format (debug syslog format = no).
#
#    --traceid      ...  Specify the traceid of the request to parse
#    --pid          ...  Specify the pid
#    --breakdown    ...  Break to separate files per each traceid
#    --merge-by-timestamp    ...  Merge logs by timestamp
#    --flow         ...  Show the request/sub-request call flow
#    --flow-compact ...  Show the request/sub-request call flow without dcerpc
#
#
# Copyright (c) 2023      Andreas Schneider <asn@samba.org>
# Copyright (c) 2023      Pavel Filipenský <pfilipen@redhat.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################
#
# Requires: ???

import sys
import os
import re
from argparse import ArgumentParser
from collections import defaultdict

# Trace record consists of a trace header followed by one or more text lines.
#
# This tool expects trace header format based on these smb.conf parameters:
#
# debug syslog format = no
# debug hires timestamp = yes
# winbind debug traceid = yes
#
# If 'winbind debug traceid = no' is set, then the option --merge-by-timestamp
# still can be used.
#
# Each trace header contains a traceid, which is the main identifier for this
# tool. A single traceid is either provided via command line option --traceid
# or a list of traceids is derived from the PID specified via option --pid.
# Creating and evaluating list of traceids from PID can be tricky:
# The traceid can appear in a trace record before trace record containing the
# PID is processed. So when we see a new traceid we are not sure if it belongs
# to the traced PID.
# The PID appears only in the main winbind process (log.winbind). If a
# directory with many log files should be processed, we process the files in
# random order.
# It might happen that e.g. log.wb-ADDOMAIN is processed before log.winbind so
# we do not know the list of traceids yet.
# To make all this easy we put into memory all trace records and do the final
# traceid filtering only after all files are read. This can require lot of
# memory if files are big.


def process_file_no_traceid(record_list, fname):
    with open(fname, "r") as infile:
        data = infile.readlines()
    date = ""
    record_lines = []

    RE_HEADER_NO_TRACEID = re.compile(
        r"^\[(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}\.\d{6}).*")
    for line in data:
        header = RE_HEADER_NO_TRACEID.search(line)
        if header:
            # Append all previous trace lines of a record
            if record_lines:
                record_list.append((date, None, record_lines, fname))
                record_lines = []
            # Remember the new date
            date = header.group(1)
        record_lines.append(line)


def process_file(record_list, traceid_set, fname, opid, otraceid):
    with open(fname, "r") as infile:
        data = infile.readlines()
    pid = None
    traceid = 0
    traceid_prev = None
    undecided_traceid = False
    date = ""
    record_lines = []

    # If traceid option was provided the traceid_set will contain just it
    if otraceid:
        traceid_set.add(otraceid)

    RE_HEADER = re.compile(
        r"^\[(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}\.\d{6}).*?, .*, "
        r"traceid=([0-9]+).*\]")
    RE_INTERFACE_VERSION = re.compile(
        r"^\s+winbindd_interface_version: \[\S* \((\d+)\)\]")
    RE_ASYNC_REQUEST = re.compile(
        r"^\s+process_request_send: "
        r"\[\S* \((\d+)\)\] Handling async request:")
    # Example of a header line
    # [2023/05/01 07:40:45.439049,  3, pid=418844, effective(0, 0), real(0, 0), class=winbind, traceid=37] ../../source3/winbindd/winbindd_misc.c:355(winbindd_interface_version)
    for line in data:
        header = RE_HEADER.search(line)
        if header:
            # Append all previous trace lines of a record if the traceid is in
            # the list.
            if record_lines:
                record_list.append((date, traceid, record_lines, fname))
                record_lines = []
            # Remember the new date and the new traceid
            date = header.group(1)
            traceid = header.group(2)
            if traceid != traceid_prev:
                traceid_prev = traceid
                undecided_traceid = True
        if opid:
            # Search for lines that identify a new winbind client and the
            # client PID

            # winbindd_interface_version: [nss_winbind (500725)]: request interface version (version = 32)
            # process_request_send: [nss_winbind (500725)] Handling async request: SETPWENT
            interface_version = RE_INTERFACE_VERSION.search(line)
            async_request = RE_ASYNC_REQUEST.search(line)
            if interface_version:
                pid = interface_version.group(1)
                if undecided_traceid:
                    if pid == opid:
                        traceid_set.add(traceid)
                    undecided_traceid = False
            if async_request:
                pid = async_request.group(1)
                if undecided_traceid:
                    if pid == opid:
                        traceid_set.add(traceid)
                    undecided_traceid = False
        # For --breakdown add every traceid
        if not opid and not otraceid:
            traceid_set.add(traceid)

        record_lines.append(line)


def filter_traceids(record_list, traceid_set):
    llist = []
    for (d, t, li, f) in record_list:
        if t in traceid_set:
            llist.append((d, t, li, f))
    return llist


def filter_flow(record_list):
    local_list = []
    for (date, traceid, lines, filename) in record_list:
        for line in lines:
            isflow = re.search(r"^(\s+)flow: (.*)", line)
            if isflow:
                local_list.append(isflow.group(1) + isflow.group(2))
    return local_list


def filter_flowcompact(flist):
    local_list = []
    end_marker = None
    for fl in flist:
        if not end_marker:
            local_list.append(fl)
            dcerpc_start = re.search(r"^(\s+)-> dcerpc_", fl)
            if dcerpc_start:
                end_marker = dcerpc_start.group(1)
        else:
            dcerpc_end = re.search(r"^" + end_marker + "<- dcerpc_", fl)
            if dcerpc_end:
                end_marker = None
                local_list.append(fl)
    return local_list


def print_record_list(record_list, file):
    f_prev = None
    for (date, traceid, lines, filename) in record_list:
        # Inform about filename change
        if filename != f_prev:
            print("-" * 72, file=file)
            print("FILE: ", filename, file=file)
            print("-" * 72, file=file)
        for line in lines:
            print(line, end='', file=file)
        f_prev = filename

# record_list  ... list of quadruplets <date, traceid, [trace lines], filename>
# flow_list    ... lines from record_list with 'flow' traces
# traceid_set  ... list of traceids we want to trace
#                  with --traceid   ... there is a single traceids
#                  with --pid       ... there are all traceids for the PID
#                  with --breakdown ... there are all traceids


def setup_parser():
    parser = ArgumentParser()

    parser.add_argument(
        "path",
        type=str,
        help="logfile or directory"
    )
    parser.add_argument(
        "--traceid",
        dest="traceid",
        help="specify the traceid of the trace records",
        metavar="ID"
    )
    parser.add_argument(
        "--pid",
        dest="pid",
        help="specify the pid of winbind client",
        metavar="PID"
    )
    parser.add_argument(
        "--breakdown",
        action="store_true",
        dest="breakdown",
        default=False,
        help="breakdown the traces into per traceid files"
    )
    parser.add_argument(
        "--merge-by-timestamp",
        action="store_true",
        dest="merge",
        default=False,
        help="merge logs by timestamp"
    )
    parser.add_argument(
        "--flow",
        action="store_true",
        dest="flow",
        default=False,
        help="show the request/sub-request flow traces"
    )
    parser.add_argument(
        "--flow-compact",
        action="store_true",
        dest="flowcompact",
        default=False,
        help="show the request/sub-request flow traces without dcerpc details"
    )
    return parser


def main(): # noqa
    record_list = []
    flow_list = []
    traceid_set = set()

    parser = setup_parser()
    options = parser.parse_args()

    if (not options.traceid and not options.pid and not options.breakdown
            and not options.merge):
        print("One of --traceid or --pid is needed"
              " or --breakdown or --merge-by-timestamp.")
        sys.exit(1)
    elif options.traceid and options.pid:
        print("Only one of --traceid or --pid is allowed.")
        sys.exit(1)
    elif options.breakdown and (options.traceid or options.pid):
        print("--breakdown cannot be combined with --traceid and --pid.")
        sys.exit(1)

    if options.flow and not options.traceid:
        print("Option --flow can be used only together with --traceid.")
        sys.exit(1)

    if options.flowcompact and not options.traceid:
        print("Option --flow-compact can be used only together with "
              "--traceid.")
        sys.exit(1)

    if options.flow and options.flowcompact:
        print("Only one of --flow or --flow-compact is allowed.")
        sys.exit(1)

    if not options.path:
        print("Path to logfile or directory with logs is needed.")
        sys.exit(1)

    merge_with_no_traceid = (not options.traceid and not options.pid
                             and not options.breakdown) and options.merge

    path = options.path
    if os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            for name in files:
                if merge_with_no_traceid:
                    process_file_no_traceid(
                        record_list,
                        os.path.join(root, name)
                    )
                else:
                    process_file(
                        record_list,
                        traceid_set,
                        os.path.join(root, name),
                        options.pid,
                        options.traceid,
                    )
    elif os.path.isfile(path):
        if merge_with_no_traceid:
            process_file_no_traceid(
                record_list,
                path
            )
        else:
            process_file(
                record_list,
                traceid_set,
                path,
                options.pid,
                options.traceid
            )
    else:
        print(path, "Path is neither file or directory.")
        sys.exit(1)

    # Sort only using timestamps, no use of traceid
    if merge_with_no_traceid:
        record_list.sort()
        print_record_list(record_list, sys.stdout)
        sys.exit(0)

    # Keep only records with matching traceids
    if not options.breakdown:
        record_list = filter_traceids(record_list, traceid_set)

    if options.breakdown:
        for traceid in traceid_set:
            # Full
            with open("%s.full" % traceid, "w") as full_f:
                full_l = filter_traceids(record_list, {traceid})
                if options.merge:
                    full_l.sort()
                print_record_list(full_l, full_f)
            # Flow
            with open("%s.flow" % traceid, "w") as flow_f:
                flow_l = filter_flow(full_l)
                for fl in flow_l:
                    print(fl, file=flow_f)
            # Flow compact
            with open("%s.flowcompact" % traceid, "w") as flowcompact_f:
                flowcompact_l = filter_flowcompact(flow_l)
                for fl in flowcompact_l:
                    print(fl, file=flowcompact_f)
    elif options.flow:
        flow_list = filter_flow(record_list)
        for fl in flow_list:
            print(fl)
    elif options.flowcompact:
        flow_list = filter_flow(record_list)
        flow_list = filter_flowcompact(flow_list)
        for fl in flow_list:
            print(fl)
    else:
        if options.merge:
            record_list.sort()
        print_record_list(record_list, sys.stdout)

    sys.exit(0)


if __name__ == "__main__":
    main()