diff options
Diffstat (limited to 'tools/json2pcap/json2pcap.py')
-rwxr-xr-x | tools/json2pcap/json2pcap.py | 686 |
1 files changed, 686 insertions, 0 deletions
# Field anonymization class
class AnonymizedField:
    '''
    Describes how one raw field is masked or anonymized.

    :param field: raw field name, optionally followed by a slice, e.g.
                  "ip.src_raw[2:]", "ip.dst_raw[:-2]" or "ip.src_raw[2:8]".
                  The slice is stripped from the stored field name.
    :param type: anonymization type (0 = masking with 'f' characters,
                 1 = anonymization with shake_256)

    The slice bounds are stored in ``start`` and ``end`` (None when omitted).
    They index the characters of the hex string handed to anonymize_field()
    and follow the normal Python slicing rules.
    '''

    # Field type codes whose anonymized value has to stay ASCII text
    _STRING_TYPES = (26, 27, 28)

    def __init__(self, field, type):
        self.field = field
        self.type = type
        self.start = None
        self.end = None

        match = re.search(r'(\S+)\[(-?\d+)?:(-?\d+)?\]', field)
        if match:
            name, start, end = match.groups()
            self.field = name
            self.start = int(start) if start is not None else None
            self.end = int(end) if end is not None else None

    def anonymize_field_shake256(self, field, type, salt):
        '''
        Returns a replacement for the hex string ``field`` derived from
        shake_256("<field>:<salt>"), with exactly the same length as ``field``.

        :param field: hex string to replace
        :param type: field type code; for the string types the result is the
                     hex encoding of ASCII characters
        :param salt: salt string mixed into the hash
        '''
        shake = hashlib.shake_256(str(field + ':' + salt).encode('utf-8'))

        if type in self._STRING_TYPES:
            # String type, output should be ASCII: the hex digest characters
            # are themselves hex encoded, so every output byte is one of
            # the characters 0-9 / a-f.
            digest = shake.hexdigest(math.ceil(len(field) / 4))
            ret_string = ''.join('{:02x}'.format(x) for x in digest.encode('ascii'))
        else:
            # Other types, output could be HEX
            ret_string = shake.hexdigest(math.ceil(len(field) / 2))

        # Correct the string length. Padding uses '0' so that the result is
        # always valid hex (the previous space padding was not).
        return ret_string.ljust(len(field), '0')[:len(field)]

    def anonymize_field(self, _h, _t, salt):
        '''
        Applies the masking / anonymization to the selected part of ``_h``.

        :param _h: hex string of the field
        :param _t: field type code, forwarded to anonymize_field_shake256()
        :param salt: salt string used for type 1
        :returns: [new_hex, mask]; both have the length of ``_h`` and the mask
                  holds 'f' for every replaced character and '0' elsewhere
        '''
        # slice.indices() resolves None, negative and out-of-range bounds the
        # same way Python slicing does; an explicit bound of 0 is honoured.
        s, e, _ = slice(self.start, self.end).indices(len(_h))
        # An inverted range selects nothing instead of duplicating data.
        e = max(s, e)

        h = _h[s:e]
        if self.type == 0:
            h = 'f' * len(h)
        elif self.type == 1 and h:
            h = self.anonymize_field_shake256(h, _t, salt)

        h_mask = '0' * s + 'f' * len(h) + '0' * (len(_h) - e)
        return [_h[:s] + h + _h[e:], h_mask]
def raw_flat_collector(dict):
    """Yield ``(key, value)`` for every key ending in ``_raw`` in a decoded tree.

    The tree is walked depth first. A mapping entry whose key ends with
    ``_raw`` is yielded as is and not descended into; every other value is
    searched recursively. Lists and tuples are searched element by element,
    so raw fields stored below list-valued subtrees are found as well.
    Anything else (strings, numbers, None) yields nothing.

    :param dict: parsed JSON node (mapping, list/tuple or scalar)
    """
    # The parameter name is kept for existing callers; use an alias so the
    # builtin of the same name is not confused with the argument below.
    node = dict

    if hasattr(node, 'items'):
        for key, value in node.items():
            if key.endswith("_raw"):
                yield key, value
            else:
                yield from raw_flat_collector(value)
    elif isinstance(node, (list, tuple)):
        for element in node:
            yield from raw_flat_collector(element)
# NOTE: the functions below are also copied as text into generated scripts,
# so they only rely on the modules imported by the generated header
# (binascii, array) and contain no unindented lines inside their bodies.

# To emulate Python 3.2
def to_bytes(n, length, endianess='big'):
    """Return the non-negative integer ``n`` as a bytearray.

    The value is left padded with zero bytes up to ``length`` bytes; a value
    that needs more bytes is not truncated. Any ``endianess`` other than
    'big' returns the bytes in reversed order.
    """
    h = '%x' % n
    s = bytearray.fromhex(('0' * (len(h) % 2) + h).zfill(length * 2))
    return s if endianess == 'big' else s[::-1]

# Returns the index, counting from 0, of the least significant set bit in x
def lsb(x):
    """Return the bit index of the lowest set bit of ``x`` (-1 for 0)."""
    return (x & -x).bit_length() - 1

# Replace parts of original_string by new_string, only if mask in the byte is not ff
def multiply_strings(original_string, new_string, mask):
    """Return ``new_string`` with the bytes flagged 'ff' in ``mask`` restored.

    All three arguments are hex strings compared two characters at a time.
    Only the common prefix of the three strings is examined. With a mask of
    None, ``new_string`` is returned unchanged.
    """
    ret_string = new_string
    if mask is None:
        return ret_string

    common = min(len(original_string), len(new_string), len(mask))
    for i in range(0, common, 2):
        if mask[i:i + 2] == 'ff':
            ret_string = ret_string[:i] + original_string[i:i + 2] + ret_string[i + 2:]

    return ret_string

# Rewrite frame
# h - hex bytes
# p - position
# l - length
# b - bitmask
# t - type
# frame_amask - optional, anonymization mask (00 - not anonymized byte, ff - anonymized byte)
def rewrite_frame(frame_raw, h, p, l, b, t, frame_amask=None):
    """Return ``frame_raw`` with the field value ``h`` written at offset ``p``.

    ``p`` and ``l`` are offsets / lengths in hex characters. With ``b == 0``
    the characters starting at ``p`` are replaced by ``h``. Otherwise only
    the bits selected by the bitmask ``b`` are taken from ``h``. Bytes that
    are flagged 'ff' in ``frame_amask`` keep their value from ``frame_raw``.
    ``t`` is accepted for symmetry with the field tuples and is not used.
    The frame is returned unchanged when ``p`` or ``l`` is negative or ``h``
    is None.
    """
    if p < 0 or l < 0 or h is None:
        return frame_raw

    # no bitmask
    if b == 0:
        # the replaced span always has the length of the new value
        l = len(h)
        frame_raw_new = frame_raw[:p] + h + frame_raw[p + l:]
        return multiply_strings(frame_raw, frame_raw_new, frame_amask)

    # bitmask
    # get hex string from frame which will be replaced
    _h = frame_raw[p:p + l]

    # add 0 padding to have correct length
    if len(_h) % 2 == 1:
        _h = '0' + _h
    if len(h) % 2 == 1:
        h = '0' + h

    # Only replace bits defined by mask
    # new_hex = (old_hex & !mask) | (new_hex & mask)
    _H = array.array('B', bytearray.fromhex(_h))
    M = array.array('B', to_bytes(b, len(_H)))

    # shift mask aligned to position
    # NOTE(review): for p > 0 this moves the mask by p // 2 bytes inside the
    # field itself, which clears it for short fields - confirm the intent.
    shift = p // 2
    for i in range(len(M)):
        M[i] = M[i + shift] if i + shift < len(M) else 0x00

    H = array.array('B', bytearray.fromhex(h))

    j = 0
    for i in range(len(_H)):
        if M[i] != 0:
            if j >= len(H):
                # the new value has no more bytes, keep the remaining ones
                break
            v = H[j] << lsb(M[i])
            _H[i] = (_H[i] & ~M[i]) | (v & M[i])
            j = j + 1

    masked_h = binascii.hexlify(_H).decode('ascii')

    frame_raw_new = frame_raw[:p] + str(masked_h) + frame_raw[p + l:]
    return multiply_strings(frame_raw, frame_raw_new, frame_amask)
def generate_pcap(d):
    """Assemble the frame described by ``d`` and write it as a pcap file.

    ``d`` maps field names to field lists; ``d['frame_raw'][1]`` holds the
    hex string of the original frame. The original and the assembled frame
    are printed, followed by the differing character indexes when both have
    the same length. The pcap is written to ``sys.argv[0] + ".pcap"``.

    This function is also copied as text into generated scripts, so it only
    uses names available there (sys, scapy and the other copied functions).
    """
    # 1. Assemble frame
    original = d['frame_raw'][1]
    output = assemble_frame(d, None)
    print(original)
    print(output)
    # 2. Testing: compare input and output for not modified json
    if original != output:
        print("Modified frames: ")
        print(original)
        print(output)
        if len(original) == len(output):
            print([i for i in range(len(original)) if original[i] != output[i]])
    # 3. Generate pcap
    outfile = sys.argv[0] + ".pcap"
    pcap_out = scapy.PcapWriter(outfile, append=False, sync=False)
    try:
        pcap_out.write(scapy.Packet(bytearray.fromhex(output)))
    finally:
        # sync=False buffers the output; close explicitly so it is flushed
        pcap_out.close()
    print("Generated " + outfile)
#
# ************ MAIN **************
#
VERSION = "1.1"

# The backslashes in the examples are written as "\\" so the help output is
# unchanged while the source no longer contains invalid escape sequences.
parser = argparse.ArgumentParser(description="""
json2pcap {version}

Utility to generate pcap from json format.

Packet modification:
In input json it is possible to modify the raw values of decoded fields.
The output pcap will include the modified values. The algorithm of
generating the output pcap is to get all raw hex fields from input json and
then assembling them by layering from longest (less decoded fields) to
shortest (more decoded fields). It means if the modified raw field is
shorter field (more decoded field) it takes precedence against modification
in longer field (less decoded field). If the json includes duplicated raw
fields with same position and length, the behavior is not deterministic.
For manual packet editing it is always possible to remove any not required
raw fields from json, only frame_raw is field mandatory for reconstruction.

Packet modification with -p switch:
The python script is generated instead of pcap. This python script when
executed will generate the pcap of 1st packet from input json. The
generated code includes the decoded fields and the function to assembly the
packet. This enables to modify the script and programmatically edit or
encode the packet variables. The assembling algorithm is different, because
the decoded packet fields are relative and points to parent node with their
position (compared to input json which has absolute positions).

Pcap masking and anonymization with -m and -a switch:
The script allows to mask or anonymize the selected json raw fields. If the
The fields are selected and located on lower protocol layers, they are not
The overwritten by upper fields which are not marked by these switches.
The pcap masking and anonymization can be performed in the following way:

tshark -r orig.pcap -T json -x | \\ python json2pcap.py -m "ip.src_raw"
-a "ip.dst_raw" -o anonymized.pcap
In this example the ip.src_raw field is masked with ffffffff by byte values
and ip.dst_raw is hashed by randomly generated salt.

Additionally the following syntax is valid to anonymize portion of field
tshark -r orig.pcap -T json -x | \\ python json2pcap.py -m "ip.src_raw[2:]"
-a "ip.dst_raw[:-2]" -o anonymized.pcap
Where the src_ip first byte is preserved and dst_ip last byte is preserved.
And the same can be achieved by
tshark -r orig.pcap -T json -x | \\ python json2pcap.py -m "ip.src_raw[2:8]"
-a "ip.dst_raw[0:6]" -o anonymized.pcap

Masking and anonymization limitations are mainly the following:
- In case the tshark is performing reassembling from multiple frames, the
backward pcap reconstruction is not properly performed and can result in
malformed frames.
- The new values in the fields could violate the field format, as the
json2pcap is no performing correct protocol encoding with respect to
allowed values of the target field and field encoding.

""".format(version=VERSION), formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
parser.add_argument('-i', '--infile', nargs='?', help='json generated by tshark -T json -x\nor by tshark -T jsonraw (not preserving frame timestamps).\nIf no inpout file is specified script reads from stdin.')
parser.add_argument('-o', '--outfile', required=True, help='output pcap filename')
parser.add_argument('-p', '--python', help='generate python payload instead of pcap (only 1st packet)', default=False, action='store_true')
parser.add_argument('-m', '--mask', help='mask the specific raw field (e.g. -m "ip.src_raw" -m "ip.dst_raw[2:6]")', action='append', metavar='MASKED_FIELD')
parser.add_argument('-a', '--anonymize', help='anonymize the specific raw field (e.g. -a "ip.src_raw[2:]" -a "ip.dst_raw[:-2]")', action='append', metavar='ANONYMIZED_FIELD')
parser.add_argument('-s', '--salt', help='salt use for anonymization. If no value is provided it is randomized.', default=None)
parser.add_argument('-v', '--verbose', help='verbose output', default=False, action='store_true')
args = parser.parse_args()

# read JSON
infile = args.infile
outfile = args.outfile

# Parse anonymization fields: -m gives type 0 (masking), -a gives type 1
# (anonymization). A field given to both switches ends up anonymized,
# because -a is processed last.
anonymize = {}
for selected, anonymization_type, switch in ((args.mask, 0, '-m'), (args.anonymize, 1, '-a')):
    for name in selected or []:
        if '_raw' not in name:
            print("Error: The specified fields by " + switch + " switch should be raw fields. " + name + " does not have _raw suffix")
            # report the failure to the calling shell
            sys.exit(1)
        af = AnonymizedField(name, anonymization_type)
        anonymize[af.field] = af

input_frame_raw = ''
frame_raw = ''
frame_time = None

salt = args.salt
if salt is None:
    # generate random salt if no salt was provided
    salt = ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(10))

# Read from input file
if infile:
    data_file = open(infile)
# Read from pipe
else:
    data_file = sys.stdin

try:
    # Generate pcap
    if args.python is False:
        pcap_out = scapy.PcapWriter(outfile, append=False, sync=False)
        try:
            # Iterate over packets in JSON
            for packet in ijson.items(data_file, "item", buf_size=200000):
                layers = packet['_source']['layers']
                _list = []
                linux_cooked_header = False
                has_frame = False

                # get flat raw fields into _list
                for name, value in raw_flat_collector(layers):
                    if name == "frame_raw":
                        frame_raw = value[0]
                        frame_amask = "0" * len(frame_raw)  # initialize anonymization mask
                        input_frame_raw = frame_raw
                        frame_layer = layers.get('frame')
                        frame_time = frame_layer.get('frame.time_epoch') if hasattr(frame_layer, 'get') else None
                        has_frame = True
                        continue

                    if isinstance(value, list):
                        # A raw value is one [hex, position, length, bitmask, type]
                        # list or a list of such lists; store every entry on
                        # its own with the field name appended as last item.
                        nested = bool(value) and isinstance(value[0], (list, tuple))
                        for entry in (value if nested else [value]):
                            if len(entry) >= 5:
                                _list.append(list(entry) + [name])
                    if name == "sll_raw":
                        linux_cooked_header = True

                if not has_frame:
                    # frame_raw is mandatory for the reconstruction
                    sys.stderr.write("Warning: packet without frame_raw skipped\n")
                    continue

                # sort _list: longest fields first, equal lengths by position
                # (two stable passes, the second one decides)
                sorted_list = sorted(_list, key=operator.itemgetter(1), reverse=False)
                sorted_list = sorted(sorted_list, key=operator.itemgetter(2), reverse=True)

                # rewrite frame
                for raw in sorted_list:
                    h = str(raw[0])  # hex
                    p = raw[1] * 2   # position
                    l = raw[2] * 2   # length
                    b = raw[3]       # bitmask
                    t = raw[4]       # type
                    field_name = raw[-1]  # added by script
                    h_mask = h       # hex for anonymization mask

                    # anonymize fields
                    anonymized = anonymize.get(field_name)
                    if anonymized is not None:
                        [h, h_mask] = anonymized.anonymize_field(h, t, salt)

                    frame_raw = rewrite_frame(frame_raw, h, p, l, b, t, frame_amask)

                    # update anonymization mask
                    if anonymized is not None:
                        frame_amask = rewrite_frame(frame_amask, h_mask, p, l, b, t)

                # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap
                if linux_cooked_header:
                    frame_raw = "000000000000" + frame_raw[6 * 2:]  # replace dest MAC
                    frame_raw = frame_raw[:12 * 2] + "" + frame_raw[14 * 2:]  # remove two bytes before Protocol

                # Testing: compare input and output for not modified json
                if args.verbose and input_frame_raw != frame_raw:
                    print("Modified frames: ")
                    s1 = input_frame_raw
                    s2 = frame_raw
                    print(s1)
                    print(s2)
                    if len(s1) == len(s2):
                        print([i for i in range(len(s1)) if s1[i] != s2[i]])

                new_packet = scapy.Packet(bytearray.fromhex(frame_raw))
                if frame_time:
                    new_packet.time = float(frame_time)
                pcap_out.write(new_packet)
        finally:
            # sync=False buffers the output; close explicitly so it is flushed
            pcap_out.close()

    # Generate python payload only for first packet
    else:
        py_outfile = outfile + '.py'
        generated = False
        with open(py_outfile, 'w') as f:
            for packet in ijson.items(data_file, "item", buf_size=200000):
                f.write(py_header)

                r = OrderedDict()
                py_generator(packet['_source']['layers'], r)

                # the assignments belong to the body of the generated main()
                for key, value in r.items():
                    f.write("    d['" + key + "'] =")
                    f.write(" " + str(value) + "\n")

                f.write(py_footer)
                generated = True

                # Currently only first packet is used from pcap
                break

        if generated:
            print("Generated " + py_outfile)
finally:
    # only close what was opened here, never stdin
    if infile:
        data_file.close()