# summary | refs | log | tree | commit | diff | stats
# path: root/tools/make-manuf.py
# blob: 22f3aa0320a9e6034d03b21fd641cb6f5c681902 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
#!/usr/bin/env python3
#
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later
'''Update the "manuf" file.

Make-manuf creates a file containing ethernet OUIs and their company
IDs from the databases at IEEE.
'''

import csv
import html
import io
import os
import re
import sys
import urllib.request, urllib.error, urllib.parse

# PyICU is optional: have_icu records whether the import succeeded so that
# callers can fall back to plain code-point handling without it.
# Use the grapheme or segments module instead?
try:
    import icu
except ImportError:
    have_icu = False
else:
    have_icu = True

def exit_msg(msg=None, status=1):
    '''Print an optional message plus the module usage text, then exit.

    msg: text written to stderr ahead of the usage text; skipped when None.
    status: exit status handed to sys.exit() (defaults to 1).

    Never returns: sys.exit() raises SystemExit.
    '''
    if msg is not None:
        sys.stderr.write(msg + '\n\n')
    # __doc__ is None when docstrings are stripped (python -OO). Fall back to
    # an empty string so the caller's exit status is not masked by a TypeError.
    sys.stderr.write((__doc__ or '') + '\n')
    sys.exit(status)

def open_url(url):
    '''Fetch one registry file and return its contents as text.

    url: two-element sequence [base URL, file name], for example
        ["https://standards-oui.ieee.org/oui/", "oui.csv"].

    When a command-line argument is present it is used as a directory and
    url[1] is read from that directory instead of the network. Otherwise the
    two parts are joined with '/' and downloaded.

    Returns the body as a str, decoded as UTF-8 with undecodable bytes
    replaced, ready to be fed to csv.reader. If the download fails the
    script exits through exit_msg().
    '''

    if len(sys.argv) > 1:
        url_path = os.path.join(sys.argv[1], url[1])
        # Decode exactly like the network branch below instead of relying on
        # the locale's default encoding; "with" closes the file on any exit.
        with open(url_path, encoding='UTF-8', errors='replace') as url_fd:
            body = url_fd.read()
    else:
        url_path = '/'.join(url)

        req_headers = { 'User-Agent': 'Wireshark make-manuf' }
        try:
            req = urllib.request.Request(url_path, headers=req_headers)
            # The response object is a context manager; close it once read.
            with urllib.request.urlopen(req) as response:
                body = response.read().decode('UTF-8', 'replace')
        except Exception:
            # Script-level boundary: any failure here is fatal.
            exit_msg('Error opening ' + url_path)

    return body

# Regex alternation of business-type suffixes and filler words that
# shorten() strips from a name. The fragments assume punctuation has already
# been turned into spaces, which is why "S.A.R.L." is matched as " s ?a ?r ?l".
# Order matters: alternatives are tried left to right.
# More examples at https://en.wikipedia.org/wiki/Incorporation_(business)
general_terms = '|'.join((
    r' a +s\b',  # A/S and A.S. but not "As" as in "Connect As".
    r' ab\b',  # Also follows "Oy", which is covered below.
    r' ag\b',
    r' b ?v\b',
    r' closed joint stock company\b',
    r' co\b',
    r' company\b',
    r' corp\b',
    r' corporation\b',
    r' corporate\b',
    r' de c ?v\b',  # Follows "S.A.", which is covered separately below.
    r' gmbh\b',
    r' holding\b',
    r' inc\b',
    r' incorporated\b',
    r' jsc\b',
    r' kg\b',
    r' k k\b',  # "K.K." as in "kabushiki kaisha", but not "K+K" as in "K+K Messtechnik".
    r' limited\b',
    r' llc\b',
    r' ltd\b',
    r' n ?v\b',
    r' oao\b',
    r' of\b',
    r' open joint stock company\b',
    r' ooo\b',
    r' oü\b',
    r' oy\b',
    r' oyj\b',
    r' plc\b',
    r' pty\b',
    r' pvt\b',
    r' s ?a ?r ?l\b',
    r' s ?a\b',
    r' s ?p ?a\b',
    r' sp ?k\b',
    r' s ?r ?l\b',
    r' systems\b',
    r'\bthe\b',
    r' zao\b',
    r' z ?o ?o\b',
))

# Lower-case location names that shorten() drops when one is the first word
# of a multi-word name. Chinese company names tend to start with the
# location; this list is not exhaustive.
skip_start = [
    # 'shengzen' is presumably a misspelling of Shenzhen seen in the data.
    'shengzen', 'shenzhen',
    'beijing', 'shanghai', 'wuhan',
    'hangzhou', 'guangxi', 'guangdong',
    'chengdu',
]

# Hand-picked short names. shorten() looks a name up here after parenthesized
# text, punctuation and business terms have been stripped from it.
special_case = {
    'Advanced Micro Devices': 'AMD',
    # 杭州德澜科技有限公司(HangZhou Delan Technology Co.,Ltd)
    '杭州德澜科技有限公司': 'DelanTech',
}

def shorten(manuf):
    '''Convert a long manufacturer name to abbreviated and short names.

    manuf: organization name (str) as read from the registry.

    Returns a two-element list [short, long]:
      short - the name with parenthesized text, punctuation, business terms
              (general_terms), a leading location from skip_start and all
              spaces removed, then truncated to 12 code points (12 grapheme
              clusters when PyICU is available).
      long  - the whitespace-normalized original, title-cased if it was all
              caps; None when the short form already equals the original
              apart from case.

    Exits the script with status 1 if nothing is left of the name.
    '''
    # Normalize whitespace.
    manuf = ' '.join(manuf.split())
    orig_manuf = manuf
    # Convert all caps to title case
    if manuf.isupper():
        manuf = manuf.title()
    # Remove the contents of parenthesis as ancillary data
    manuf = re.sub(r"\(.*\)", '', manuf)
    # Remove the contents of fullwidth parenthesis (mostly in Asian names).
    # The delimiters are written as \uFF08 / \uFF09 escapes on purpose: plain
    # ASCII "(.*)" is a capture group that matches, and so deletes, the whole
    # name.
    manuf = re.sub(r"\uFF08.*\uFF09", '', manuf)
    # Remove "a" before removing punctuation ("Aruba, a Hewlett [...]" etc.)
    manuf = manuf.replace(" a ", " ")
    # Remove any punctuation
    # XXX Use string.punctuation? Note that it includes '-' and '*'.
    manuf = re.sub(r"[\"',./:()+-]", ' ', manuf)
    # XXX For some reason including the double angle brackets in the above
    # regex makes it bomb
    manuf = re.sub(r"[«»“”]", ' ', manuf)
    # & isn't needed when Standalone
    manuf = manuf.replace(" & ", " ")
    # Remove business types and other general terms ("the", "inc", "plc", etc.)
    plain_manuf = re.sub(general_terms, '', manuf, flags=re.IGNORECASE)
    # ...but make sure we don't remove everything.
    if not all(s == ' ' for s in plain_manuf):
        manuf = plain_manuf

    manuf = manuf.strip()

    # Check for special case
    manuf = special_case.get(manuf, manuf)

    # XXX: Some of the entries have Chinese city or other location
    # names written with spaces between each character, like
    # Bei jing, Wu Han, Shen Zhen, etc. We should remove that too.
    split = manuf.split()
    if len(split) > 1 and split[0].lower() in skip_start:
        manuf = ' '.join(split[1:])

    # Remove all spaces
    manuf = re.sub(r'\s+', '', manuf)

    if len(manuf) < 1:
        sys.stderr.write('Manufacturer "{}" shortened to nothing.\n'.format(orig_manuf))
        sys.exit(1)

    # Truncate names to a reasonable length, say, 12 characters. If
    # the string contains UTF-8, this may be substantially more than
    # 12 bytes. It might also be less than 12 visible characters. Plain
    # Python slices Unicode strings by code point, which is better
    # than raw bytes but not as good as grapheme clusters. PyICU
    # supports grapheme clusters. https://bugs.python.org/issue30717
    #

    # Truncate by code points
    trunc_len = 12

    if have_icu:
        # Truncate by grapheme clusters: keep the first 12 boundaries and cut
        # at the last one. manuf is non-empty here, so bounds is too.
        # NOTE(review): confirm ICU's boundary offsets line up with Python
        # code-point indices for characters outside the BMP.
        bi_ci = icu.BreakIterator.createCharacterInstance(icu.Locale('en_US'))
        bi_ci.setText(manuf)
        bounds = list(bi_ci)
        bounds = bounds[0:trunc_len]
        trunc_len = bounds[-1]

    manuf = manuf[:trunc_len]

    if manuf.lower() == orig_manuf.lower():
        # Original manufacturer name was short and simple.
        return [manuf, None]

    mixed_manuf = orig_manuf
    # At least one entry has whitespace in front of a period.
    mixed_manuf = re.sub(r'\s+\.', '.', mixed_manuf)
    # If company is all caps, convert to mixed case (so it doesn't look like
    # we're screaming the company name)
    if mixed_manuf.upper() == mixed_manuf:
        mixed_manuf = mixed_manuf.title()

    return [manuf, mixed_manuf]

# Registry kinds. The string values are also written verbatim into the
# generated registry table by main().
MA_L = 'MA_L'
MA_M = 'MA_M'
MA_S = 'MA_S'

def prefix_to_oui(prefix, prefix_map):
    '''Format a hex assignment prefix as a colon-separated OUI string.

    prefix: hex digits of the assignment, 6, 7 or 9 characters long
        (24, 28 or 36 bits).
    prefix_map: dict updated in place; for 28- and 36-bit prefixes the
        leading 24-bit OUI is mapped to the registry kind.

    Returns a tuple (oui, kind):
      24 bits -> ('AA:BB:CC', MA_L), prefix_map untouched
      28 bits -> ('AA:BB:CC:D0:00:00/28', MA_M)
      36 bits -> ('AA:BB:CC:DD:E0:00/36', MA_S)

    Raises ValueError for any other prefix length.
    '''
    # One hex digit carries four bits.
    pfx_len = len(prefix) * 4
    prefix24 = prefix[:6]
    oui24 = ':'.join(hi + lo for hi, lo in zip(prefix24[0::2], prefix24[1::2]))

    if pfx_len == 24:
        # 24-bit OUI assignment, no mask
        return oui24, MA_L

    # Other lengths which require a mask.
    if pfx_len == 28:
        kind = MA_M
    elif pfx_len == 36:
        kind = MA_S
    else:
        # Fail with a clear message rather than an UnboundLocalError on "kind".
        raise ValueError(
            'Unsupported prefix length {:d} bits for "{}"'.format(pfx_len, prefix))

    # Pad to a full 48-bit address before inserting the colons.
    oui = prefix.ljust(12, '0')
    oui = ':'.join(hi + lo for hi, lo in zip(oui[0::2], oui[1::2]))
    prefix_map[oui24] = kind

    return '{}/{:d}'.format(oui, pfx_len), kind

def _write_oui_table(manuf_fd, c_decl, entries, octets, name_col):
    '''Write one sorted C array of manufacturer entries to manuf_fd.

    manuf_fd: writable text file object.
    c_decl: C element type and array name, e.g.
        "manuf_oui24_t global_manuf_oui24_table".
    entries: dict mapping an OUI string to [short name, long name or None].
    octets: number of leading address octets placed in the initializer.
    name_col: column the long name is padded out to when the line is shorter.
    '''
    manuf_fd.write("static const {}[] = {{\n".format(c_decl))
    for oui in sorted(entries):
        short = entries[oui][0]
        # Entries without a distinct long name reuse the short one.
        long_name = entries[oui][1] or short
        # Octets sit at offsets 0, 3, 6, ... of the colon-separated string.
        octet_init = ', '.join('0x' + oui[i:i + 2] for i in range(0, 3 * octets, 3))
        line = "    {{ {{ {} }}, \"{}\", ".format(octet_init, short)
        line += max(name_col - len(line), 0) * ' '
        line += "\"{}\" }},\n".format(long_name.replace('"', '\\"'))
        manuf_fd.write(line)
    manuf_fd.write("};\n")

def main():
    '''Download the IEEE registries and regenerate epan/manuf-data.c.

    Each registry is read through open_url(), its rows are merged into
    per-kind tables (the first entry seen for an OUI wins), and the result is
    written as C arrays. The output path is relative to the current working
    directory. The script exits through exit_msg() if a registry or the
    overall result has suspiciously few entries, or if the output file
    cannot be opened.
    '''
    manuf_path = os.path.join('epan', 'manuf-data.c')

    ieee_d = {
        'OUI':   { 'url': ["https://standards-oui.ieee.org/oui/", "oui.csv"], 'min_entries': 1000 },
        'CID':   { 'url': ["https://standards-oui.ieee.org/cid/", "cid.csv"], 'min_entries': 75 },
        'IAB':   { 'url': ["https://standards-oui.ieee.org/iab/", "iab.csv"], 'min_entries': 1000 },
        'OUI28': { 'url': ["https://standards-oui.ieee.org/oui28/", "mam.csv"], 'min_entries': 1000 },
        'OUI36': { 'url': ["https://standards-oui.ieee.org/oui36/", "oui36.csv"], 'min_entries': 1000 },
    }
    oui_d = {
        MA_L: { '00:00:00' : ['00:00:00', 'Officially Xerox, but 0:0:0:0:0:0 is more common'] },
        MA_M: {},
        MA_S: {},
    }

    min_total = 35000  # 35830 as of 2018-09-05
    # Despite the name this sums each registry's 'total' (added + skipped).
    total_added = 0

    # Add IEEE entries from each of their databases
    ieee_db_l = ['OUI', 'OUI28', 'OUI36', 'CID', 'IAB']

    # map a 24-bit prefix to MA-M/MA-S or none (MA-L by default)
    prefix_map = {}

    for db in ieee_db_l:
        db_info = ieee_d[db]
        db_url = db_info['url']
        db_info['skipped'] = 0
        db_info['added'] = 0
        db_info['total'] = 0
        print('Merging {} data from {}'.format(db, db_url))
        body = open_url(db_url)
        ieee_csv = csv.reader(body.splitlines())

        # Pop the title row. An empty body falls through to the
        # "Too few entries" check below instead of raising StopIteration.
        next(ieee_csv, None)
        for ieee_row in ieee_csv:
            #Registry,Assignment,Organization Name,Organization Address
            #IAB,0050C2DD6,Transas Marine Limited,Datavagen 37 Askim Vastra Gotaland SE 436 32
            if len(ieee_row) < 3:
                # Blank or truncated line: nothing to merge.
                continue
            oui, kind = prefix_to_oui(ieee_row[1].upper(), prefix_map)
            manuf = ieee_row[2].strip()
            # The Organization Name field occasionally contains HTML entities. Undo them.
            manuf = html.unescape(manuf)
            # "Watts A\S"
            manuf = manuf.replace('\\', '/')
            if manuf == 'IEEE Registration Authority':
                continue
            if manuf == 'Private':
                continue
            if oui in oui_d[kind]:
                action = 'Skipping'
                print('{} - {} IEEE "{}" in favor of "{}"'.format(oui, action, manuf, oui_d[kind][oui]))
                db_info['skipped'] += 1
            else:
                oui_d[kind][oui] = shorten(manuf)
                db_info['added'] += 1
            db_info['total'] += 1

        if db_info['total'] < db_info['min_entries']:
            exit_msg("Too few {} entries. Got {}, wanted {}".format(db, db_info['total'], db_info['min_entries']))
        total_added += db_info['total']

    if total_added < min_total:
        exit_msg("Too few total entries ({})".format(total_added))

    try:
        manuf_fd = io.open(manuf_path, 'w', encoding='UTF-8')
    except OSError:
        exit_msg("Couldn't open manuf file for writing ({}) ".format(manuf_path))

    # "with" closes the file even if a write fails part-way through.
    with manuf_fd:
        manuf_fd.write('''/*
 * This file was generated by running ./tools/make-manuf.py.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * The data below has been assembled from the following sources:
 *
 * The IEEE public OUI listings available from:
 * <http://standards-oui.ieee.org/oui/oui.csv>
 * <http://standards-oui.ieee.org/cid/cid.csv>
 * <http://standards-oui.ieee.org/iab/iab.csv>
 * <http://standards-oui.ieee.org/oui28/mam.csv>
 * <http://standards-oui.ieee.org/oui36/oui36.csv>
 *
 */

''')

        # Write the prefix map
        manuf_fd.write("static const manuf_registry_t ieee_registry_table[] = {\n")
        for oui in sorted(prefix_map):
            manuf_fd.write("    {{ {{ 0x{}, 0x{}, 0x{} }}, {} }},\n".format(oui[0:2], oui[3:5], oui[6:8], prefix_map[oui]))
        manuf_fd.write("};\n\n")

        # write the MA-L table
        _write_oui_table(manuf_fd, 'manuf_oui24_t global_manuf_oui24_table', oui_d[MA_L], 3, 44)
        manuf_fd.write("\n")

        # write the MA-M table
        _write_oui_table(manuf_fd, 'manuf_oui28_t global_manuf_oui28_table', oui_d[MA_M], 4, 50)
        manuf_fd.write("\n")

        # write the MA-S table
        _write_oui_table(manuf_fd, 'manuf_oui36_t global_manuf_oui36_table', oui_d[MA_S], 5, 56)

    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' added', ieee_d[db]['added']))
    print('{:<20}: {}'.format('Total added', total_added))

    print()
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' total', ieee_d[db]['total']))

    print()
    for db in ieee_d:
        print('{:<20}: {}'.format('IEEE ' + db + ' skipped', ieee_d[db]['skipped']))

# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()