diff options
Diffstat (limited to '')
-rw-r--r-- | xpcom/ds/tools/make_dafsa.py | 372 |
1 files changed, 372 insertions, 0 deletions
diff --git a/xpcom/ds/tools/make_dafsa.py b/xpcom/ds/tools/make_dafsa.py new file mode 100644 index 0000000000..a9cedf78a8 --- /dev/null +++ b/xpcom/ds/tools/make_dafsa.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python +# Copyright 2014 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +""" +A Deterministic acyclic finite state automaton (DAFSA) is a compact +representation of an unordered word list (dictionary). + +http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton + +This python program converts a list of strings to a byte array in C++. +This python program fetches strings and return values from a gperf file +and generates a C++ file with a byte array representing graph that can be +used as a memory efficient replacement for the perfect hash table. + +The input strings are assumed to consist of printable 7-bit ASCII characters +and the return values are assumed to be one digit integers. + +In this program a DAFSA is a diamond shaped graph starting at a common +root node and ending at a common end node. All internal nodes contain +a character and each word is represented by the characters in one path from +the root node to the end node. + +The order of the operations is crucial since lookups will be performed +starting from the source with no backtracking. Thus a node must have at +most one child with a label starting by the same character. The output +is also arranged so that all jumps are to increasing addresses, thus forward +in memory. + +The generated output has suffix free decoding so that the sign of leading +bits in a link (a reference to a child node) indicate if it has a size of one, +two or three bytes and if it is the last outgoing link from the actual node. +A node label is terminated by a byte with the leading bit set. + +The generated byte array can described by the following BNF: + +<byte> ::= < 8-bit value in range [0x00-0xFF] > + +<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > +<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] > +<return value> ::= < value + 0x80, byte in range [0x80-0x8F] > + +<offset1> ::= < byte in range [0x00-0x3F] > +<offset2> ::= < byte in range [0x40-0x5F] > +<offset3> ::= < byte in range [0x60-0x7F] > + +<end_offset1> ::= < byte in range [0x80-0xBF] > +<end_offset2> ::= < byte in range [0xC0-0xDF] > +<end_offset3> ::= < byte in range [0xE0-0xFF] > + +<prefix> ::= <char> + +<label> ::= <end_char> + | <char> <label> + +<end_label> ::= <return_value> + | <char> <end_label> + +<offset> ::= <offset1> + | <offset2> <byte> + | <offset3> <byte> <byte> + +<end_offset> ::= <end_offset1> + | <end_offset2> <byte> + | <end_offset3> <byte> <byte> + +<offsets> ::= <end_offset> + | <offset> <offsets> + +<source> ::= <offsets> + +<node> ::= <label> <offsets> + | <prefix> <node> + | <end_label> + +<dafsa> ::= <source> + | <dafsa> <node> + +Decoding: + +<char> -> printable 7-bit ASCII character +<end_char> & 0x7F -> printable 7-bit ASCII character +<return value> & 0x0F -> integer +<offset1 & 0x3F> -> integer +((<offset2> & 0x1F>) << 8) + <byte> -> integer +((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer + +end_offset1, end_offset2 and and_offset3 are decoded same as offset1, +offset2 and offset3 respectively. + +The first offset in a list of offsets is the distance in bytes between the +offset itself and the first child node. Subsequent offsets are the distance +between previous child node and next child node. Thus each offset links a node +to a child node. The distance is always counted between start addresses, i.e. +first byte in decoded offset or first byte in child node. + +Example 1: + +%% +aa, 1 +a, 2 +%% + +The input is first parsed to a list of words: +["aa1", "a2"] + +This produces the following graph: +[root] --- a --- 0x02 --- [end] + | / + | / + - a --- 0x01 + +A C++ representation of the compressed graph is generated: + +const unsigned char dafsa[7] = { + 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81, +}; + +The bytes in the generated array has the following meaning: + + 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1 + + 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a" + 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4 + + 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5 + 4: 0x82 <return_value> 0x82 & 0x0F -> return 2 + + 5: 0x61 <char> label character 0x61 -> match "a" + 6: 0x81 <return_value> 0x81 & 0x0F -> return 1 + +Example 2: + +%% +aa, 1 +bbb, 2 +baa, 1 +%% + +The input is first parsed to a list of words: +["aa1", "bbb2", "baa1"] + +This produces the following graph: +[root] --- a --- a --- 0x01 --- [end] + | / / / + | / / / + - b --- b --- b --- 0x02 + +A C++ representation of the compressed graph is generated: + +const unsigned char dafsa[11] = { + 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82, +}; + +The bytes in the generated array has the following meaning: + + 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2 + 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5 + + 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b" + 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5 + 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8 + + 5: 0x61 <char> label character 0x61 -> match "a" + 6: 0x61 <char> label character 0x61 -> match "a" + 7: 0x81 <return_value> 0x81 & 0x0F -> return 1 + + 8: 0x62 <char> label character 0x62 -> match "b" + 9: 0x62 <char> label character 0x62 -> match "b" +10: 0x82 <return_value> 0x82 & 0x0F -> return 2 +""" +import struct +import sys + +from incremental_dafsa import Dafsa, Node + + +class InputError(Exception): + """Exception raised for errors in the input file.""" + + +def top_sort(dafsa: Dafsa): + """Generates list of nodes in topological sort order.""" + incoming = {} + + def count_incoming(node: Node): + """Counts incoming references.""" + if not node.is_end_node: + if id(node) not in incoming: + incoming[id(node)] = 1 + for child in node.children.values(): + count_incoming(child) + else: + incoming[id(node)] += 1 + + for node in dafsa.root_node.children.values(): + count_incoming(node) + + for node in dafsa.root_node.children.values(): + incoming[id(node)] -= 1 + + waiting = [ + node for node in dafsa.root_node.children.values() if incoming[id(node)] == 0 + ] + nodes = [] + + while waiting: + node = waiting.pop() + assert incoming[id(node)] == 0 + nodes.append(node) + for child in node.children.values(): + if not child.is_end_node: + incoming[id(child)] -= 1 + if incoming[id(child)] == 0: + waiting.append(child) + return nodes + + +def encode_links(node: Node, offsets, current): + """Encodes a list of children as one, two or three byte offsets.""" + if next(iter(node.children.values())).is_end_node: + # This is an <end_label> node and no links follow such nodes + return [] + guess = 3 * len(node.children) + assert node.children + + children = sorted(node.children.values(), key=lambda x: -offsets[id(x)]) + while True: + offset = current + guess + buf = [] + for child in children: + last = len(buf) + distance = offset - offsets[id(child)] + assert distance > 0 and distance < (1 << 21) + + if distance < (1 << 6): + # A 6-bit offset: "s0xxxxxx" + buf.append(distance) + elif distance < (1 << 13): + # A 13-bit offset: "s10xxxxxxxxxxxxx" + buf.append(0x40 | (distance >> 8)) + buf.append(distance & 0xFF) + else: + # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx" + buf.append(0x60 | (distance >> 16)) + buf.append((distance >> 8) & 0xFF) + buf.append(distance & 0xFF) + # Distance in first link is relative to following record. + # Distance in other links are relative to previous link. + offset -= distance + if len(buf) == guess: + break + guess = len(buf) + # Set most significant bit to mark end of links in this node. + buf[last] |= 1 << 7 + buf.reverse() + return buf + + +def encode_prefix(label): + """Encodes a node label as a list of bytes without a trailing high byte. + + This method encodes a node if there is exactly one child and the + child follows immediately after so that no jump is needed. This label + will then be a prefix to the label in the child node. + """ + assert label + return [ord(c) for c in reversed(label)] + + +def encode_label(label): + """Encodes a node label as a list of bytes with a trailing high byte >0x80.""" + buf = encode_prefix(label) + # Set most significant bit to mark end of label in this node. + buf[0] |= 1 << 7 + return buf + + +def encode(dafsa: Dafsa): + """Encodes a DAFSA to a list of bytes""" + output = [] + offsets = {} + + for node in reversed(top_sort(dafsa)): + if ( + len(node.children) == 1 + and not next(iter(node.children.values())).is_end_node + and (offsets[id(next(iter(node.children.values())))] == len(output)) + ): + output.extend(encode_prefix(node.character)) + else: + output.extend(encode_links(node, offsets, len(output))) + output.extend(encode_label(node.character)) + offsets[id(node)] = len(output) + + output.extend(encode_links(dafsa.root_node, offsets, len(output))) + output.reverse() + return output + + +def to_cxx(data, preamble=None): + """Generates C++ code from a list of encoded bytes.""" + text = "/* This file is generated. DO NOT EDIT!\n\n" + text += "The byte array encodes a dictionary of strings and values. See " + text += "make_dafsa.py for documentation." + text += "*/\n\n" + + if preamble: + text += preamble + text += "\n\n" + + text += "const unsigned char kDafsa[%s] = {\n" % len(data) + for i in range(0, len(data), 12): + text += " " + text += ", ".join("0x%02x" % byte for byte in data[i : i + 12]) + text += ",\n" + text += "};\n" + return text + + +def words_to_cxx(words, preamble=None): + """Generates C++ code from a word list""" + dafsa = Dafsa.from_tld_data(words) + return to_cxx(encode(dafsa), preamble) + + +def words_to_bin(words): + """Generates bytes from a word list""" + dafsa = Dafsa.from_tld_data(words) + data = encode(dafsa) + return struct.pack("%dB" % len(data), *data) + + +def parse_gperf(infile): + """Parses gperf file and extract strings and return code""" + lines = [line.strip() for line in infile] + + # Extract the preamble. + first_delimeter = lines.index("%%") + preamble = "\n".join(lines[0:first_delimeter]) + + # Extract strings after the first '%%' and before the second '%%'. + begin = first_delimeter + 1 + end = lines.index("%%", begin) + lines = lines[begin:end] + for line in lines: + if line[-3:-1] != ", ": + raise InputError('Expected "domainname, <digit>", found "%s"' % line) + # Technically the DAFSA format could support return values in range [0-31], + # but the values below are the only with a defined meaning. + if line[-1] not in "0124": + raise InputError( + 'Expected value to be one of {0,1,2,4}, found "%s"' % line[-1] + ) + return (preamble, [line[:-3] + line[-1] for line in lines]) + + +def main(outfile, infile): + with open(infile, "r") as infile: + preamble, words = parse_gperf(infile) + outfile.write(words_to_cxx(words, preamble)) + return 0 + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("usage: %s infile outfile" % sys.argv[0]) + sys.exit(1) + + with open(sys.argv[2], "w") as outfile: + sys.exit(main(outfile, sys.argv[1])) |