summaryrefslogtreecommitdiffstats
path: root/ndiff/test-scans/anonymize.py
diff options
context:
space:
mode:
Diffstat (limited to 'ndiff/test-scans/anonymize.py')
-rwxr-xr-xndiff/test-scans/anonymize.py122
1 files changed, 122 insertions, 0 deletions
diff --git a/ndiff/test-scans/anonymize.py b/ndiff/test-scans/anonymize.py
new file mode 100755
index 0000000..7b35237
--- /dev/null
+++ b/ndiff/test-scans/anonymize.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+
+# Anonymize an Nmap XML file, replacing host name and IP addresses with random
+# anonymous ones. Anonymized names will be consistent between runs of the
+# program. Any servicefp attributes are removed. Give a file name as an
+# argument. The anonymized file is written to stdout.
+#
+# The anonymization is not rigorous. This program just matches regular
+# expressions against things that look like address and host names. It is
+# possible that it will leave some identifying information.
+
+import hashlib
+import random
+import re
+import sys
+
+VERBOSE = True
+
+r = random.Random()
+
+
+def hash(s):
+ digest = hashlib.sha512(s.encode()).hexdigest()
+ return int(digest, 16)
+
+
+def anonymize_mac_address(addr):
+ r.seed(hash(addr))
+ nums = (0, 0, 0) + tuple(r.randrange(256) for i in range(3))
+ return ":".join("%02X" % x for x in nums)
+
+
+def anonymize_ipv4_address(addr):
+ r.seed(hash(addr))
+ nums = (10,) + tuple(r.randrange(256) for i in range(3))
+ return ".".join(str(x) for x in nums)
+
+
+def anonymize_ipv6_address(addr):
+ r.seed(hash(addr))
+ # RFC 4193.
+ nums = (0xFD00 + r.randrange(256),)
+ nums = nums + tuple(r.randrange(65536) for i in range(7))
+ return ":".join("%04X" % x for x in nums)
+
+# Maps to memoize address and host name conversions.
+hostname_map = {}
+address_map = {}
+
+
+def anonymize_hostname(name):
+ if name in hostname_map:
+ return hostname_map[name]
+ LETTERS = "acbdefghijklmnopqrstuvwxyz"
+ r.seed(hash(name))
+ length = r.randrange(5, 10)
+ prefix = "".join(r.sample(LETTERS, length))
+ num = r.randrange(1000)
+ hostname_map[name] = "%s-%d.example.com" % (prefix, num)
+ if VERBOSE:
+ print("Replace %s with %s" % (name, hostname_map[name]), file=sys.stderr)
+ return hostname_map[name]
+
+mac_re = re.compile(r'\b([0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}\b')
+ipv4_re = re.compile(r'\b([0-9]{1,3}\.){3}[0-9]{1,3}\b')
+ipv6_re = re.compile(r'\b([0-9a-fA-F]{1,4}::?){3,}[0-9a-fA-F]{1,4}\b')
+
+
+def anonymize_address(addr):
+ if addr in address_map:
+ return address_map[addr]
+ if mac_re.match(addr):
+ address_map[addr] = anonymize_mac_address(addr)
+ elif ipv4_re.match(addr):
+ address_map[addr] = anonymize_ipv4_address(addr)
+ elif ipv6_re.match(addr):
+ address_map[addr] = anonymize_ipv6_address(addr)
+ else:
+ assert False
+ if VERBOSE:
+ print("Replace %s with %s" % (addr, address_map[addr]), file=sys.stderr)
+ return address_map[addr]
+
+
+def repl_addr(match):
+ addr = match.group(0)
+ anon_addr = anonymize_address(addr)
+ return anon_addr
+
+
+def repl_hostname_name(match):
+ name = match.group(1)
+ anon_name = anonymize_hostname(name)
+ return r'<hostname name="%s"' % anon_name
+
+
+def repl_hostname(match):
+ name = match.group(1)
+ anon_name = anonymize_hostname(name)
+ return r'hostname="%s"' % anon_name
+
+
+def anonymize_file(f):
+ for line in f:
+ line = re.sub(mac_re, repl_addr, line)
+ line = re.sub(ipv4_re, repl_addr, line)
+ line = re.sub(ipv6_re, repl_addr, line)
+ line = re.sub(r'<hostname name="([^"]*)"', repl_hostname_name, line)
+ line = re.sub(r'\bhostname="([^"]*)"', repl_hostname, line)
+ line = re.sub(r' *\bservicefp="([^"]*)"', r'', line)
+ yield line
+
+
+def main():
+ filename = sys.argv[1]
+ f = open(filename, "r")
+ for line in anonymize_file(f):
+ sys.stdout.write(line)
+ f.close()
+
+if __name__ == "__main__":
+ main()