summaryrefslogtreecommitdiffstats
path: root/tools/json2pcap/json2pcap.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/json2pcap/json2pcap.py')
-rwxr-xr-xtools/json2pcap/json2pcap.py686
1 files changed, 686 insertions, 0 deletions
diff --git a/tools/json2pcap/json2pcap.py b/tools/json2pcap/json2pcap.py
new file mode 100755
index 00000000..2a059ad0
--- /dev/null
+++ b/tools/json2pcap/json2pcap.py
@@ -0,0 +1,686 @@
+#!/usr/bin/env python3
+
+#
+# Copyright 2020, Martin Kacer <kacer.martin[AT]gmail.com> and contributors
+#
+# Wireshark - Network traffic analyzer
+# By Gerald Combs <gerald@wireshark.org>
+# Copyright 1998 Gerald Combs
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+import sys
+import ijson
+import operator
+import copy
+import binascii
+import array
+import argparse
+import string
+import random
+import math
+import hashlib
+import re
+from collections import OrderedDict
+from scapy import all as scapy
+
+# Field anonymization class
+class AnonymizedField:
+ '''
+ The Anonymization field object specifying anonymization
+ :filed arg: field name
+ :type arg: anonymization type [0 masking 0xff, 1 anonymization shake_256]
+ :start arg: If specified, the anonymization starts at given byte number
+ :end arg: If specified, the anonymization ends at given byte number
+ '''
+ def __init__(self, field, type):
+ self.field = field
+ self.type = type
+ self.start = None
+ self.end = None
+
+ match = re.search(r'(\S+)\[(-?\d+)?:(-?\d+)?\]', field)
+ if match:
+ self.field = match.group(1)
+ self.start = match.group(2)
+ if self.start is not None:
+ self.start = int(self.start)
+ self.end = match.group(3)
+ if self.end is not None:
+ self.end = int(self.end)
+
+ # Returns the new field value after anonymization
+ def anonymize_field_shake256(self, field, type, salt):
+ shake = hashlib.shake_256(str(field + ':' + salt).encode('utf-8'))
+
+ # String type, output should be ASCII
+ if type in [26, 27, 28]:
+ length = math.ceil(len(field)/4)
+ shake_hash = shake.hexdigest(length)
+ ret_string = array.array('B', str.encode(shake_hash))
+ ret_string = ''.join('{:02x}'.format(x) for x in ret_string)
+ # Other types, output could be HEX
+ else:
+ length = math.ceil(len(field)/2)
+ shake_hash = shake.hexdigest(length)
+ ret_string = shake_hash
+
+ # Correct the string length
+ if (len(ret_string) < len(field)):
+ ret_string = ret_string.ljust(len(field))
+ if (len(ret_string) > len(field)):
+ ret_string = ret_string[:len(field)]
+
+ return ret_string
+
+ def anonymize_field(self, _h, _t, salt):
+ s = 0
+ e = None
+ if self.start:
+ s = self.start
+ if self.end:
+ e = self.end
+ if e < 0:
+ e = len(_h) + e
+ else:
+ e = len(_h)
+ h = _h[s:e]
+ if self.type == 0:
+ h = 'f' * len(h)
+ elif self.type == 1:
+ h = self.anonymize_field_shake256(h, _t, salt)
+
+ h_mask = '0' * len(_h[0:s]) + 'f' * len(h) + '0' * len(_h[e:])
+ h = _h[0:s] + h + _h[e:]
+ return [h, h_mask]
+
+def make_unique(key, dct):
+ counter = 0
+ unique_key = key
+
+ while unique_key in dct:
+ counter += 1
+ unique_key = '{}_{}'.format(key, counter)
+ return unique_key
+
+
+def parse_object_pairs(pairs):
+ dct = OrderedDict()
+ for key, value in pairs:
+ if key in dct:
+ key = make_unique(key, dct)
+ dct[key] = value
+
+ return dct
+
+#
+# ********* PY TEMPLATES *********
+#
+def read_py_function(name):
+ s = ''
+ record = False
+ indent = 0
+
+ file = open(__file__)
+ for line in file:
+
+ ind = len(line) - len(line.lstrip())
+
+ if line.find("def " + name) != -1:
+ record = True
+ indent = ind
+ elif record and indent == ind and len(line) > 1:
+ record = False
+
+ if record:
+ s = s + line
+
+ file.close()
+ return s
+
+py_header = """#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# File generated by json2pcap.py
+# json2pcap.py created by Martin Kacer, 2020
+
+import os
+import binascii
+import array
+import sys
+import subprocess
+from collections import OrderedDict
+from scapy import all as scapy
+
+# *****************************************************
+# * PACKET PAYLOAD GENERATED FROM INPUT PCAP *
+# * Modify this function to edit the packet *
+# *****************************************************
+def main():
+ d = OrderedDict()
+"""
+
+py_footer = """ generate_pcap(d)
+
+# *****************************************************
+# * FUNCTIONS from TEMPLATE *
+# * Do not edit these functions if not required *
+# *****************************************************
+
+"""
+py_footer = py_footer + read_py_function("to_bytes")
+py_footer = py_footer + read_py_function("lsb")
+py_footer = py_footer + read_py_function("multiply_strings")
+py_footer = py_footer + read_py_function("rewrite_frame")
+py_footer = py_footer + read_py_function("assemble_frame")
+py_footer = py_footer + read_py_function("generate_pcap")
+
+py_footer = py_footer + """
+
+if __name__ == '__main__':
+ main()
+"""
+#
+# ***** End of PY TEMPLATES ******
+#
+
+
+
+#
+# ********** FUNCTIONS ***********
+#
+
+def raw_flat_collector(dict):
+ if hasattr(dict, 'items'):
+ for k, v in dict.items():
+ if k.endswith("_raw"):
+ yield k, v
+ else:
+ for val in raw_flat_collector(v):
+ yield val
+
+
+# d - input dictionary, parsed from json
+# r - result dictionary
+# frame_name - parent protocol name
+# frame_position - parent protocol position
+def py_generator(d, r, frame_name='frame_raw', frame_position=0):
+ if (d is None or d is None):
+ return
+
+ if hasattr(d, 'items'):
+ for k, v in d.items():
+
+ # no recursion
+ if k.endswith("_raw") or "_raw_" in k:
+ if isinstance(v[1], (list, tuple)) or isinstance(v[2], (list, tuple)):
+ #i = 1;
+ for _v in v:
+ h = _v[0]
+ p = _v[1]
+ l = _v[2] * 2
+ b = _v[3]
+ t = _v[4]
+ if (len(h) != l):
+ l = len(h)
+
+ p = p - frame_position
+
+ # Add into result dictionary
+ key = str(k).replace('.', '_')
+ key = make_unique(key, r)
+
+ fn = frame_name.replace('.', '_')
+ if (fn == key):
+ fn = None
+ value = [fn, h, p, l, b, t]
+
+ r[key] = value
+
+ else:
+ h = v[0]
+ p = v[1]
+ l = v[2] * 2
+ b = v[3]
+ t = v[4]
+ if (len(h) != l):
+ l = len(h)
+
+ p = p - frame_position
+
+ # Add into result dictionary
+ key = str(k).replace('.', '_')
+ key = make_unique(key, r)
+
+ fn = frame_name.replace('.', '_')
+ if (fn == key):
+ fn = None
+ value = [fn , h, p, l, b, t]
+
+ r[key] = value
+
+ # recursion
+ else:
+ if isinstance(v, dict):
+ fn = frame_name
+ fp = frame_position
+
+ # if there is also preceding raw protocol frame use it
+ # remove tree suffix
+ key = k
+ if (key.endswith("_tree") or ("_tree_" in key)):
+ key = key.replace('_tree', '')
+
+ raw_key = key + "_raw"
+ if (raw_key in d):
+ # f = d[raw_key][0]
+ fn = raw_key
+ fp = d[raw_key][1]
+
+
+ py_generator(v, r, fn, fp)
+
+ elif isinstance(v, (list, tuple)):
+
+ fn = frame_name
+ fp = frame_position
+
+ # if there is also preceding raw protocol frame use it
+ # remove tree suffix
+ key = k
+ if (key.endswith("_tree") or ("_tree_" in key)):
+ key = key.replace('_tree', '')
+
+ raw_key = key + "_raw"
+ if (raw_key in d):
+ fn = raw_key
+ fp = d[raw_key][1]
+
+ for _v in v:
+ py_generator(_v, r, frame_name, frame_position)
+
+# To emulate Python 3.2
+def to_bytes(n, length, endianess='big'):
+ h = '%x' % n
+ s = bytearray.fromhex(('0' * (len(h) % 2) + h).zfill(length * 2))
+ return s if endianess == 'big' else s[::-1]
+
+# Returns the index, counting from 0, of the least significant set bit in x
+def lsb(x):
+ return (x & -x).bit_length() - 1
+
+# Replace parts of original_string by new_string, only if mask in the byte is not ff
+def multiply_strings(original_string, new_string, mask):
+
+ ret_string = new_string
+ if mask is None:
+ return ret_string
+ for i in range(0, min(len(original_string), len(new_string), len(mask)), 2):
+ if mask[i:i + 2] == 'ff':
+ #print("ff")
+ ret_string = ret_string[:i] + original_string[i:i + 2] + ret_string[i + 2:]
+
+ return ret_string
+
+# Rewrite frame
+# h - hex bytes
+# p - position
+# l - length
+# b - bitmask
+# t - type
+# frame_amask - optional, anonymization mask (00 - not anonymized byte, ff - anonymized byte)
+def rewrite_frame(frame_raw, h, p, l, b, t, frame_amask=None):
+ if p < 0 or l < 0 or h is None:
+ return frame_raw
+
+ # no bitmask
+ if(b == 0):
+ if (len(h) != l):
+ l = len(h)
+ frame_raw_new = frame_raw[:p] + h + frame_raw[p + l:]
+ return multiply_strings(frame_raw, frame_raw_new, frame_amask)
+ # bitmask
+ else:
+ # get hex string from frame which will be replaced
+ _h = frame_raw[p:p + l]
+
+ # add 0 padding to have correct length
+ if (len(_h) % 2 == 1):
+ _h = '0' + _h
+ if (len(h) % 2 == 1):
+ h = '0' + h
+
+ # Only replace bits defined by mask
+ # new_hex = (old_hex & !mask) | (new_hex & mask)
+ _H = bytearray.fromhex(_h)
+ _H = array.array('B', _H)
+
+ M = to_bytes(b, len(_H))
+ M = array.array('B', M)
+ # shift mask aligned to position
+ for i in range(len(M)):
+ if (i + p / 2) < len(M):
+ M[i] = M[i + int(p / 2)]
+ else:
+ M[i] = 0x00
+
+ H = bytearray.fromhex(h)
+ H = array.array('B', H)
+
+ # for i in range(len(_H)):
+ # print "{0:08b}".format(_H[i]),
+ # print
+ # for i in range(len(M)):
+ # print "{0:08b}".format(M[i]),
+ # print
+
+ j = 0
+ for i in range(len(_H)):
+ if (M[i] != 0):
+ v = H[j] << lsb(M[i])
+ # print "Debug: {0:08b}".format(v),
+ _H[i] = (_H[i] & ~M[i]) | (v & M[i])
+ # print "Debug: " + str(_H[i]),
+ j = j + 1
+
+ # for i in range(len(_H)):
+ # print "{0:08b}".format(_H[i]),
+ # print
+
+ masked_h = binascii.hexlify(_H)
+ masked_h = masked_h.decode('ascii')
+
+ frame_raw_new = frame_raw[:p] + str(masked_h) + frame_raw[p + l:]
+ return multiply_strings(frame_raw, frame_raw_new, frame_amask)
+
+
+def assemble_frame(d, frame_time):
+ input = d['frame_raw'][1]
+ isFlat = False
+ linux_cooked_header = False
+ while not isFlat:
+ isFlat = True
+ _d = d.copy()
+ for key, val in _d.items():
+ h = str(val[1]) # hex
+ p = val[2] * 2 # position
+ l = val[3] * 2 # length
+ b = val[4] # bitmask
+ t = val[5] # type
+
+ if (key == "sll_raw"):
+ linux_cooked_header = True
+
+ # only if the node is not parent
+ isParent = False
+ for k, v in d.items():
+ if (v[0] == key):
+ isParent = True
+ isFlat = False
+ break
+
+ if not isParent and val[0] is not None:
+ d[val[0]][1] = rewrite_frame(d[val[0]][1], h, p, l, b, t)
+ del d[key]
+
+ output = d['frame_raw'][1]
+
+ # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame
+ if (linux_cooked_header):
+ output = "000000000000" + output[6*2:] # replce dest MAC
+ output = output[:12*2] + "" + output[14*2:] # remove two bytes before Protocol
+
+ return output
+
+def generate_pcap(d):
+ # 1. Assemble frame
+ input = d['frame_raw'][1]
+ output = assemble_frame(d, None)
+ print(input)
+ print(output)
+ # 2. Testing: compare input and output for not modified json
+ if (input != output):
+ print("Modified frames: ")
+ s1 = input
+ s2 = output
+ print(s1)
+ print(s2)
+ if (len(s1) == len(s2)):
+ d = [i for i in range(len(s1)) if s1[i] != s2[i]]
+ print(d)
+ # 3. Generate pcap
+ outfile = sys.argv[0] + ".pcap"
+ pcap_out = scapy.PcapWriter(outfile, append=False, sync=False)
+ new_packet = scapy.Packet(bytearray.fromhex(output))
+ pcap_out.write(new_packet)
+ print("Generated " + outfile)
+
+#
+# ************ MAIN **************
+#
+VERSION = "1.1"
+
+parser = argparse.ArgumentParser(description="""
+json2pcap {version}
+
+Utility to generate pcap from json format.
+
+Packet modification:
+In input json it is possible to modify the raw values of decoded fields.
+The output pcap will include the modified values. The algorithm of
+generating the output pcap is to get all raw hex fields from input json and
+then assembling them by layering from longest (less decoded fields) to
+shortest (more decoded fields). It means if the modified raw field is
+shorter field (more decoded field) it takes precedence against modification
+in longer field (less decoded field). If the json includes duplicated raw
+fields with same position and length, the behavior is not deterministic.
+For manual packet editing it is always possible to remove any not required
+raw fields from json, only frame_raw is field mandatory for reconstruction.
+
+Packet modification with -p switch:
+The python script is generated instead of pcap. This python script when
+executed will generate the pcap of 1st packet from input json. The
+generated code includes the decoded fields and the function to assembly the
+packet. This enables to modify the script and programmatically edit or
+encode the packet variables. The assembling algorithm is different, because
+the decoded packet fields are relative and points to parent node with their
+position (compared to input json which has absolute positions).
+
+Pcap masking and anonymization with -m and -a switch:
+The script allows to mask or anonymize the selected json raw fields. If the
+The fields are selected and located on lower protocol layers, they are not
+The overwritten by upper fields which are not marked by these switches.
+The pcap masking and anonymization can be performed in the following way:
+
+tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw"
+-a "ip.dst_raw" -o anonymized.pcap
+In this example the ip.src_raw field is masked with ffffffff by byte values
+and ip.dst_raw is hashed by randomly generated salt.
+
+Additionally the following syntax is valid to anonymize portion of field
+tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:]"
+-a "ip.dst_raw[:-2]" -o anonymized.pcap
+Where the src_ip first byte is preserved and dst_ip last byte is preserved.
+And the same can be achieved by
+tshark -r orig.pcap -T json -x | \ python json2pcap.py -m "ip.src_raw[2:8]"
+-a "ip.dst_raw[0:6]" -o anonymized.pcap
+
+Masking and anonymization limitations are mainly the following:
+- In case the tshark is performing reassembling from multiple frames, the
+backward pcap reconstruction is not properly performed and can result in
+malformed frames.
+- The new values in the fields could violate the field format, as the
+json2pcap is no performing correct protocol encoding with respect to
+allowed values of the target field and field encoding.
+
+""".format(version=VERSION), formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
+parser.add_argument('-i', '--infile', nargs='?', help='json generated by tshark -T json -x\nor by tshark -T jsonraw (not preserving frame timestamps).\nIf no inpout file is specified script reads from stdin.')
+parser.add_argument('-o', '--outfile', required=True, help='output pcap filename')
+parser.add_argument('-p', '--python', help='generate python payload instead of pcap (only 1st packet)', default=False, action='store_true')
+parser.add_argument('-m', '--mask', help='mask the specific raw field (e.g. -m "ip.src_raw" -m "ip.dst_raw[2:6]")', action='append', metavar='MASKED_FIELD')
+parser.add_argument('-a', '--anonymize', help='anonymize the specific raw field (e.g. -a "ip.src_raw[2:]" -a "ip.dst_raw[:-2]")', action='append', metavar='ANONYMIZED_FIELD')
+parser.add_argument('-s', '--salt', help='salt use for anonymization. If no value is provided it is randomized.', default=None)
+parser.add_argument('-v', '--verbose', help='verbose output', default=False, action='store_true')
+args = parser.parse_args()
+
+# read JSON
+infile = args.infile
+outfile = args.outfile
+
+# Read from input file
+if infile:
+ data_file = open(infile)
+# Read from pipe
+else:
+ data_file = sys.stdin
+
+# Parse anonymization fields
+anonymize = {}
+if args.mask:
+ for m in args.mask:
+ if '_raw' not in m:
+ print("Error: The specified fields by -m switch should be raw fields. " + m + " does not have _raw suffix")
+ sys.exit()
+ af = AnonymizedField(m, 0)
+ anonymize[af.field] = af
+if args.anonymize:
+ for a in args.anonymize:
+ if '_raw' not in a:
+ print("Error: The specified fields by -a switch should be raw fields. " + a + " does not have _raw suffix")
+ sys.exit()
+ af = AnonymizedField(a, 1)
+ anonymize[af.field] = af
+
+input_frame_raw = ''
+frame_raw = ''
+frame_time = None
+
+salt = args.salt
+if salt is None:
+ # generate random salt if no salt was provided
+ salt = ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(10))
+
+# Generate pcap
+if args.python is False:
+ pcap_out = scapy.PcapWriter(outfile, append=False, sync=False)
+
+ # Iterate over packets in JSON
+ for packet in ijson.items(data_file, "item", buf_size=200000):
+ _list = []
+ linux_cooked_header = False
+
+ # get flat raw fields into _list
+ for raw in raw_flat_collector(packet['_source']['layers']):
+ if len(raw) >= 2:
+ if (raw[0] == "frame_raw"):
+ frame_raw = raw[1][0]
+ frame_amask = "0"*len(frame_raw) # initialize anonymization mask
+ input_frame_raw = copy.copy(frame_raw)
+ frame_time = None
+ if 'frame.time_epoch' in packet['_source']['layers']['frame']:
+ frame_time = packet['_source']['layers']['frame']['frame.time_epoch']
+ else:
+ # add into value list into raw[5] the field name
+ if isinstance(raw[1], list):
+ raw[1].append(raw[0])
+ _list.append(raw[1])
+ if (raw[0] == "sll_raw"):
+ linux_cooked_header = True
+
+ # sort _list
+ sorted_list = sorted(_list, key=operator.itemgetter(1), reverse=False)
+ sorted_list = sorted(sorted_list, key=operator.itemgetter(2), reverse=True)
+ # print("Debug: " + str(sorted_list))
+
+ # rewrite frame
+ for raw in sorted_list:
+ if len(raw) >= 6:
+ h = str(raw[0]) # hex
+ p = raw[1] * 2 # position
+ l = raw[2] * 2 # length
+ b = raw[3] # bitmask
+ t = raw[4] # type
+ # raw[5] # field_name (added by script)
+ h_mask = h # hex for anonymization mask
+
+ # anonymize fields
+ if (raw[5] in anonymize):
+ [h, h_mask] = anonymize[raw[5]].anonymize_field(h, t, salt)
+
+ if (isinstance(p, (list, tuple)) or isinstance(l, (list, tuple))):
+ for r in raw:
+ _h = str(r[0]) # hex
+ _p = r[1] * 2 # position
+ _l = r[2] * 2 # length
+ _b = r[3] # bitmask
+ _t = r[4] # type
+ # raw[5] # field_name (added by script)
+ _h_mask = _h # hex for anonymization mask
+
+ # anonymize fields
+ if (raw[5] in anonymize):
+ [_h, _h_mask] = anonymize[raw[5]].anonymize_field(_h, _t, salt)
+
+ # print("Debug: " + str(raw))
+ frame_raw = rewrite_frame(frame_raw, _h, _p, _l, _b, _t, frame_amask)
+
+ # update anonymization mask
+ if (raw[5] in anonymize):
+ frame_amask = rewrite_frame(frame_amask, _h_mask, _p, _l, _b, _t)
+
+ else:
+ # print("Debug: " + str(raw))
+ frame_raw = rewrite_frame(frame_raw, h, p, l, b, t, frame_amask)
+
+ # update anonymization mask
+ if (raw[5] in anonymize):
+ frame_amask = rewrite_frame(frame_amask, h_mask, p, l, b, t)
+
+ # for Linux cooked header replace dest MAC and remove two bytes to reconstruct normal frame using text2pcap
+ if (linux_cooked_header):
+ frame_raw = "000000000000" + frame_raw[6 * 2:] # replce dest MAC
+ frame_raw = frame_raw[:12 * 2] + "" + frame_raw[14 * 2:] # remove two bytes before Protocol
+
+ # Testing: remove comment to compare input and output for not modified json
+ if (args.verbose and input_frame_raw != frame_raw):
+ print("Modified frames: ")
+ s1 = input_frame_raw
+ s2 = frame_raw
+ print(s1)
+ print(s2)
+ if (len(s1) == len(s2)):
+ d = [i for i in range(len(s1)) if s1[i] != s2[i]]
+ print(d)
+
+ new_packet = scapy.Packet(bytearray.fromhex(frame_raw))
+ if frame_time:
+ new_packet.time = float(frame_time)
+ pcap_out.write(new_packet)
+
+# Generate python payload only for first packet
+else:
+ py_outfile = outfile + '.py'
+ f = open(py_outfile, 'w')
+
+ #for packet in json:
+ for packet in ijson.items(data_file, "item", buf_size=200000):
+ f.write(py_header)
+
+ r = OrderedDict({})
+
+ #print "packet = " + str(packet['_source']['layers'])
+ py_generator(packet['_source']['layers'], r)
+
+ for key, value in r.items():
+ f.write(" d['" + key + "'] =",)
+ f.write(" " + str(value) + "\n")
+
+ f.write(py_footer)
+
+ # Currently only first packet is used from pcap
+ f.close
+
+ print("Generated " + py_outfile)
+
+ break