summaryrefslogtreecommitdiffstats
path: root/tools/generate_authors.py
blob: a74ef1c40b1afafe0af505c52c3befe3678eb1f9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3

#
# Generate the AUTHORS file combining existing AUTHORS file with
# git commit log.
#
# Usage: generate_authors.py AUTHORS.src

# Copyright 2022 Moshe Kaplan
# Based on generate_authors.pl by Michael Mann
#
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later

import argparse
import io
import re
import subprocess
import sys


def get_git_authors():
    '''
    Return the authors found in the git history of HEAD.

    Runs "git shortlog --email --summary HEAD" in the current working
    directory and parses every line of its output.  Sample line:
    #  4321	Navin R. Johnson <nrjohnson@example.com>

    Returns a list of (name, email) tuples in the order git printed them.
    Every '@' in an address is replaced by '[AT]'.

    Raises subprocess.CalledProcessError if git exits with a non-zero
    status, and OSError if the git executable cannot be started.
    '''
    git_line_regex = re.compile(r"^\s*\d+\s+([^<]*)\s*<([^>]*)>")
    cmd = ["git", "--no-pager", "shortlog", "--email", "--summary", "HEAD"]
    # check_output (rather than subprocess.run) is kept for compatibility
    # with older Python 3 releases; note that the 'encoding' keyword itself
    # requires Python 3.6 or newer.
    git_cmd_output = subprocess.check_output(cmd, universal_newlines=True, encoding='utf-8')

    git_authors = []
    # Split on '\n' only: str.splitlines() also breaks on characters such as
    # form feed, U+0085 or U+2028, which would cut an author name containing
    # one of them into fragments that no longer match the pattern.
    for line in git_cmd_output.split('\n'):
        if not line.strip():
            # The trailing newline of the output leaves an empty last element.
            continue
        # The pattern itself tolerates leading whitespace, so the line does
        # not need to be stripped first.
        match = git_line_regex.match(line)
        if match is None:
            # Report and skip instead of failing with AttributeError on None.
            print("Warning: ignoring unparsable git shortlog line: {!r}".format(line),
                  file=sys.stderr)
            continue
        name = match.group(1).strip()
        email = match.group(2).strip()
        # Try to lower how much spam people get:
        email = email.replace('@', '[AT]')
        git_authors.append((name, email))
    return git_authors


def extract_contributors(authors_content):
    '''
    Extract names and email addresses from the Contributors section of the
    AUTHORS file content.

    Only the text after the first "= Contributors =" marker is examined.
    A line yields an entry when it starts with a name followed by
    whitespace and an address in angle brackets.  A line containing '{'
    is still examined itself, but every following line up to and including
    the next line containing '}' is skipped.  Lines without a "<...>"
    address (for example "e-mail address removed at contributor's request")
    yield no entry.

    Returns a list of (name, email) tuples in file order; the list is empty
    when authors_content has no "= Contributors =" marker.
    '''
    _, marker, contributors_content = authors_content.partition("= Contributors =")
    if not marker:
        # No Contributors section (or empty input): nothing is listed.
        return []

    contributor_line_regex = re.compile(
        r"^([\w\.\-\'\x80-\xff]+(\s*[\w+\.\-\'\x80-\xff])*)\s+<([^>]*)>")
    contributors = []
    in_bracket = False
    for line in contributors_content.splitlines():
        if '{' in line:
            # The line opening a bracketed block may itself name a contributor.
            in_bracket = True
        elif in_bracket:
            # Inside a bracketed block nothing is collected; just look for its end.
            if '}' in line:
                in_bracket = False
            continue

        contributor_match = contributor_line_regex.match(line)
        if contributor_match:
            # Group 1 is the full name, group 3 the address between '<' and '>'.
            contributors.append((contributor_match.group(1), contributor_match.group(3)))
    return contributors


def generate_git_contributors_text(contributors_emails, git_authors_emails):
    '''
    Build the text listing git authors that are not already listed.

    contributors_emails and git_authors_emails are iterables of
    (name, email) pairs.  A git author is left out when the address,
    compared case-insensitively, belongs to a listed contributor or was
    already emitted for an earlier git author, and also when the address
    is exactly "gerald[AT]wireshark.org".

    Each remaining author becomes one "name<TAB(s)><email>" line; names
    of 24 characters or more are followed by a single space instead.
    The lines are returned joined by newlines, without a trailing newline.
    '''
    # Number of 8-character tab stops reserved for the name column.
    name_tab_stops = 3

    # Lower-cased addresses that must not be listed (again).
    known_addresses = {address.lower() for _, address in contributors_emails}

    rendered = []
    for author, address in git_authors_emails:
        folded = address.lower()
        # Gerald is part of the header, so he is never listed here.
        if folded in known_addresses or address == "gerald[AT]wireshark.org":
            continue
        known_addresses.add(folded)

        full_stops_used = len(author) // 8
        if full_stops_used >= name_tab_stops:
            # The name fills the whole column; fall back to a single space.
            gap = " "
        else:
            # One tab per tab stop the name has not reached yet.
            gap = '\t' * (name_tab_stops - full_stops_used)
        rendered.append("{}{}<{}>".format(author, gap, address))
    return "\n".join(rendered)


# Read the authors file until we find the git log entries, then stop
def read_authors(parsed_args):
    '''
    Return the leading part of the AUTHORS file as one string.

    The file named by parsed_args.authors[0] is read as UTF-8.  The result
    holds every line that comes before the first line containing
    "= From git log ="; if no line contains it, the whole file is returned.
    '''
    with open(parsed_args.authors[0], 'r', encoding='utf-8') as authors_file:
        all_lines = authors_file.readlines()

    # Index of the first line carrying the marker, or the line count if absent.
    cut = next(
        (index for index, text in enumerate(all_lines) if '= From git log =' in text),
        len(all_lines),
    )
    return ''.join(all_lines[:cut])


def main():
    '''
    Command-line entry point: regenerate the AUTHORS file in place.

    Takes the path of the AUTHORS file as its single positional argument,
    keeps the part of the file that precedes the "= From git log =" section
    and rewrites the file with that section rebuilt from the git history.
    '''
    arg_parser = argparse.ArgumentParser(
        description="Generate the AUTHORS file combining existing AUTHORS file with git commit log.")
    arg_parser.add_argument("authors", metavar='authors', nargs=1, help="path to AUTHORS file")
    options = arg_parser.parse_args()

    kept_text = read_authors(options)

    # The contributors already listed in the file are collected first so
    # that the git section does not repeat them.
    git_section = generate_git_contributors_text(
        extract_contributors(kept_text),
        get_git_authors(),
    )

    # Assemble the complete new file content before touching the file.
    new_content = ''.join([kept_text, '= From git log =\n\n', git_section, '\n'])

    with open(options.authors[0], 'w', encoding='utf-8') as out_file:
        out_file.write(new_content)


# Run the generator only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()