1 files changed, 372 insertions, 0 deletions
diff --git a/xpcom/ds/tools/make_dafsa.py b/xpcom/ds/tools/make_dafsa.py
new file mode 100644
index 0000000000..7b1e7cb9f4
--- /dev/null
+++ b/xpcom/ds/tools/make_dafsa.py
@@ -0,0 +1,372 @@
+#!/usr/bin/env python
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+A Deterministic acyclic finite state automaton (DAFSA) is a compact
+representation of an unordered word list (dictionary).
+
+http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
+
+This python program converts a list of strings to a byte array in C++.
+This python program fetches strings and return values from a gperf file
+and generates a C++ file with a byte array representing graph that can be
+used as a memory efficient replacement for the perfect hash table.
+
+The input strings are assumed to consist of printable 7-bit ASCII characters
+and the return values are assumed to be one digit integers.
+
+In this program a DAFSA is a diamond shaped graph starting at a common
+root node and ending at a common end node. All internal nodes contain
+a character and each word is represented by the characters in one path from
+the root node to the end node.
+
+The order of the operations is crucial since lookups will be performed
+starting from the source with no backtracking. Thus a node must have at
+most one child with a label starting by the same character. The output
+is also arranged so that all jumps are to increasing addresses, thus forward
+in memory.
+
+The generated output has suffix free decoding so that the sign of leading
+bits in a link (a reference to a child node) indicate if it has a size of one,
+two or three bytes and if it is the last outgoing link from the actual node.
+A node label is terminated by a byte with the leading bit set.
+
+The generated byte array can described by the following BNF:
+
+<byte> ::= < 8-bit value in range [0x00-0xFF] >
+
+<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
+<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
+<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
+
+<offset1> ::= < byte in range [0x00-0x3F] >
+<offset2> ::= < byte in range [0x40-0x5F] >
+<offset3> ::= < byte in range [0x60-0x7F] >
+
+<end_offset1> ::= < byte in range [0x80-0xBF] >
+<end_offset2> ::= < byte in range [0xC0-0xDF] >
+<end_offset3> ::= < byte in range [0xE0-0xFF] >
+
+<prefix> ::= <char>
+
+<label> ::= <end_char>
+          | <char> <label>
+
+<end_label> ::= <return_value>
+          | <char> <end_label>
+
+<offset> ::= <offset1>
+           | <offset2> <byte>
+           | <offset3> <byte> <byte>
+
+<end_offset> ::= <end_offset1>
+               | <end_offset2> <byte>
+               | <end_offset3> <byte> <byte>
+
+<offsets> ::= <end_offset>
+            | <offset> <offsets>
+
+<source> ::= <offsets>
+
+<node> ::= <label> <offsets>
+         | <prefix> <node>
+         | <end_label>
+
+<dafsa> ::= <source>
+          | <dafsa> <node>
+
+Decoding:
+
+<char> -> printable 7-bit ASCII character
+<end_char> & 0x7F -> printable 7-bit ASCII character
+<return value> & 0x0F -> integer
+<offset1 & 0x3F> -> integer
+((<offset2> & 0x1F>) << 8) + <byte> -> integer
+((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
+
+end_offset1, end_offset2 and and_offset3 are decoded same as offset1,
+offset2 and offset3 respectively.
+
+The first offset in a list of offsets is the distance in bytes between the
+offset itself and the first child node. Subsequent offsets are the distance
+between previous child node and next child node. Thus each offset links a node
+to a child node. The distance is always counted between start addresses, i.e.
+first byte in decoded offset or first byte in child node.
+
+Example 1:
+
+%%
+aa, 1
+a, 2
+%%
+
+The input is first parsed to a list of words:
+["aa1", "a2"]
+
+This produces the following graph:
+[root] --- a --- 0x02 --- [end]
+            |              /
+             |           /
+              - a --- 0x01
+
+A C++ representation of the compressed graph is generated:
+
+const unsigned char dafsa[7] = {
+  0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
+};
+
+The bytes in the generated array has the following meaning:
+
+ 0: 0x81 <end_offset1>  child at position 0 + (0x81 & 0x3F) -> jump to 1
+
+ 1: 0xE1 <end_char>     label character (0xE1 & 0x7F) -> match "a"
+ 2: 0x02 <offset1>      child at position 2 + (0x02 & 0x3F) -> jump to 4
+
+ 3: 0x81 <end_offset1>  child at position 4 + (0x81 & 0x3F) -> jump to 5
+ 4: 0x82 <return_value> 0x82 & 0x0F -> return 2
+
+ 5: 0x61 <char>         label character 0x61 -> match "a"
+ 6: 0x81 <return_value> 0x81 & 0x0F -> return 1
+
+Example 2:
+
+%%
+aa, 1
+bbb, 2
+baa, 1
+%%
+
+The input is first parsed to a list of words:
+["aa1", "bbb2", "baa1"]
+
+This produces the following graph:
+[root] --- a --- a --- 0x01 --- [end]
+ |       /           /         /
+  |    /           /         /
+   - b --- b --- b --- 0x02
+
+A C++ representation of the compressed graph is generated:
+
+const unsigned char dafsa[11] = {
+  0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
+};
+
+The bytes in the generated array has the following meaning:
+
+ 0: 0x02 <offset1>      child at position 0 + (0x02 & 0x3F) -> jump to 2
+ 1: 0x83 <end_offset1>  child at position 2 + (0x83 & 0x3F) -> jump to 5
+
+ 2: 0xE2 <end_char>     label character (0xE2 & 0x7F) -> match "b"
+ 3: 0x02 <offset1>      child at position 3 + (0x02 & 0x3F) -> jump to 5
+ 4: 0x83 <end_offset1>  child at position 5 + (0x83 & 0x3F) -> jump to 8
+
+ 5: 0x61 <char>         label character 0x61 -> match "a"
+ 6: 0x61 <char>         label character 0x61 -> match "a"
+ 7: 0x81 <return_value> 0x81 & 0x0F -> return 1
+
+ 8: 0x62 <char>         label character 0x62 -> match "b"
+ 9: 0x62 <char>         label character 0x62 -> match "b"
+10: 0x82 <return_value> 0x82 & 0x0F -> return 2
+"""
+import sys
+import struct
+
+from incremental_dafsa import Dafsa, Node
+
+
+class InputError(Exception):
+    """Exception raised for errors in the input file."""
+
+
+def top_sort(dafsa: Dafsa):
+    """Generates list of nodes in topological sort order."""
+    incoming = {}
+
+    def count_incoming(node: Node):
+        """Counts incoming references."""
+        if not node.is_end_node:
+            if id(node) not in incoming:
+                incoming[id(node)] = 1
+                for child in node.children.values():
+                    count_incoming(child)
+            else:
+                incoming[id(node)] += 1
+
+    for node in dafsa.root_node.children.values():
+        count_incoming(node)
+
+    for node in dafsa.root_node.children.values():
+        incoming[id(node)] -= 1
+
+    waiting = [
+        node for node in dafsa.root_node.children.values() if incoming[id(node)] == 0
+    ]
+    nodes = []
+
+    while waiting:
+        node = waiting.pop()
+        assert incoming[id(node)] == 0
+        nodes.append(node)
+        for child in node.children.values():
+            if not child.is_end_node:
+                incoming[id(child)] -= 1
+                if incoming[id(child)] == 0:
+                    waiting.append(child)
+    return nodes
+
+
+def encode_links(node: Node, offsets, current):
+    """Encodes a list of children as one, two or three byte offsets."""
+    if next(iter(node.children.values())).is_end_node:
+        # This is an <end_label> node and no links follow such nodes
+        return []
+    guess = 3 * len(node.children)
+    assert node.children
+
+    children = sorted(node.children.values(), key=lambda x: -offsets[id(x)])
+    while True:
+        offset = current + guess
+        buf = []
+        for child in children:
+            last = len(buf)
+            distance = offset - offsets[id(child)]
+            assert distance > 0 and distance < (1 << 21)
+
+            if distance < (1 << 6):
+                # A 6-bit offset: "s0xxxxxx"
+                buf.append(distance)
+            elif distance < (1 << 13):
+                # A 13-bit offset: "s10xxxxxxxxxxxxx"
+                buf.append(0x40 | (distance >> 8))
+                buf.append(distance & 0xFF)
+            else:
+                # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
+                buf.append(0x60 | (distance >> 16))
+                buf.append((distance >> 8) & 0xFF)
+                buf.append(distance & 0xFF)
+            # Distance in first link is relative to following record.
+            # Distance in other links are relative to previous link.
+            offset -= distance
+        if len(buf) == guess:
+            break
+        guess = len(buf)
+    # Set most significant bit to mark end of links in this node.
+    buf[last] |= 1 << 7
+    buf.reverse()
+    return buf
+
+
+def encode_prefix(label):
+    """Encodes a node label as a list of bytes without a trailing high byte.
+
+    This method encodes a node if there is exactly one child  and the
+    child follows immediately after so that no jump is needed. This label
+    will then be a prefix to the label in the child node.
+    """
+    assert label
+    return [ord(c) for c in reversed(label)]
+
+
+def encode_label(label):
+    """Encodes a node label as a list of bytes with a trailing high byte >0x80."""
+    buf = encode_prefix(label)
+    # Set most significant bit to mark end of label in this node.
+    buf[0] |= 1 << 7
+    return buf
+
+
+def encode(dafsa: Dafsa):
+    """Encodes a DAFSA to a list of bytes"""
+    output = []
+    offsets = {}
+
+    for node in reversed(top_sort(dafsa)):
+        if (
+            len(node.children) == 1
+            and not next(iter(node.children.values())).is_end_node
+            and (offsets[id(next(iter(node.children.values())))] == len(output))
+        ):
+            output.extend(encode_prefix(node.character))
+        else:
+            output.extend(encode_links(node, offsets, len(output)))
+            output.extend(encode_label(node.character))
+        offsets[id(node)] = len(output)
+
+    output.extend(encode_links(dafsa.root_node, offsets, len(output)))
+    output.reverse()
+    return output
+
+
+def to_cxx(data, preamble=None):
+    """Generates C++ code from a list of encoded bytes."""
+    text = "/* This file is generated. DO NOT EDIT!\n\n"
+    text += "The byte array encodes a dictionary of strings and values. See "
+    text += "make_dafsa.py for documentation."
+    text += "*/\n\n"
+
+    if preamble:
+        text += preamble
+        text += "\n\n"
+
+    text += "const unsigned char kDafsa[%s] = {\n" % len(data)
+    for i in range(0, len(data), 12):
+        text += "  "
+        text += ", ".join("0x%02x" % byte for byte in data[i : i + 12])
+        text += ",\n"
+    text += "};\n"
+    return text
+
+
+def words_to_cxx(words, preamble=None):
+    """Generates C++ code from a word list"""
+    dafsa = Dafsa.from_tld_data(words)
+    return to_cxx(encode(dafsa), preamble)
+
+
+def words_to_bin(words):
+    """Generates bytes from a word list"""
+    dafsa = Dafsa.from_tld_data(words)
+    data = encode(dafsa)
+    return struct.pack("%dB" % len(data), *data)
+
+
+def parse_gperf(infile):
+    """Parses gperf file and extract strings and return code"""
+    lines = [line.strip() for line in infile]
+
+    # Extract the preamble.
+    first_delimeter = lines.index("%%")
+    preamble = "\n".join(lines[0:first_delimeter])
+
+    # Extract strings after the first '%%' and before the second '%%'.
+    begin = first_delimeter + 1
+    end = lines.index("%%", begin)
+    lines = lines[begin:end]
+    for line in lines:
+        if line[-3:-1] != ", ":
+            raise InputError('Expected "domainname, <digit>", found "%s"' % line)
+        # Technically the DAFSA format could support return values in range [0-31],
+        # but the values below are the only with a defined meaning.
+        if line[-1] not in "0124":
+            raise InputError(
+                'Expected value to be one of {0,1,2,4}, found "%s"' % line[-1]
+            )
+    return (preamble, [line[:-3] + line[-1] for line in lines])
+
+
+def main(outfile, infile):
+    with open(infile, "r") as infile:
+        preamble, words = parse_gperf(infile)
+        outfile.write(words_to_cxx(words, preamble))
+    return 0
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("usage: %s infile outfile" % sys.argv[0])
+        sys.exit(1)
+
+    with open(sys.argv[2], "w") as outfile:
+        sys.exit(main(outfile, sys.argv[1]))