Diffstat (limited to 'xpcom/ds/tools')
-rw-r--r--  xpcom/ds/tools/incremental_dafsa.py  509
-rw-r--r--  xpcom/ds/tools/make_dafsa.py         372
-rw-r--r--  xpcom/ds/tools/perfecthash.py        377
3 files changed, 1258 insertions, 0 deletions
diff --git a/xpcom/ds/tools/incremental_dafsa.py b/xpcom/ds/tools/incremental_dafsa.py
new file mode 100644
index 0000000000..1157c26850
--- /dev/null
+++ b/xpcom/ds/tools/incremental_dafsa.py
@@ -0,0 +1,509 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+Incremental algorithm for creating a "deterministic acyclic finite state
+automaton" (DAFSA). When this algorithm was written, existing logic depended
+on a different format for the DAFSA, so this module also contains convenience
+functions for converting to a compatible structure. That legacy format is
+defined in make_dafsa.py.
+"""
+
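+# A minimal usage sketch (hypothetical words, with the one-byte "return value"
+# already appended to each, as Dafsa.from_tld_data would do):
+#
+#     dafsa = Dafsa()
+#     dafsa.append("aa\x01")
+#     dafsa.append("a\x02")
+#
+# The resulting graph can then be encoded with make_dafsa.encode(dafsa).
+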
+from typing import Callable, Dict, List, Optional
+
+
+class Node:
+ children: Dict[str, "Node"]
+ parents: Dict[str, List["Node"]]
+ character: str
+ is_root_node: bool
+ is_end_node: bool
+
+ def __init__(self, character, is_root_node=False, is_end_node=False):
+ self.children = {}
+ self.parents = {}
+ self.character = character
+ self.is_root_node = is_root_node
+ self.is_end_node = is_end_node
+
+ def __str__(self):
+ """Produce a helpful string representation of this node.
+
+ This is expected to only be used for debugging.
+ The produced output is:
+
+ "c[def.] <123>"
+ ^ ^ ^
+ | | Internal python ID of the node (used for de-duping)
+ | |
+ | One possible path through the tree to the end
+ |
+ Current node character
+ """
+
+ if self.is_root_node:
+ return "<root>"
+ elif self.is_end_node:
+ return "<end>"
+
+ first_potential_match = ""
+ node = self
+ while node.children:
+ first_character = next(iter(node.children))
+ if first_character:
+ first_potential_match += first_character
+ node = node.children[first_character]
+
+ return "%s[%s] <%d>" % (self.character, first_potential_match, id(self))
+
+ def add_child(self, child):
+ self.children[child.character] = child
+ child.parents.setdefault(self.character, [])
+ child.parents[self.character].append(self)
+
+ def remove(self):
+ # remove() must only be called when this node has only a single parent, and that
+ # parent doesn't need this child anymore.
+ # The caller is expected to have performed this validation.
+        # (placing asserts here adds a non-trivial performance hit)
+
+ # There's only a single parent, so only one list should be in the "parents" map
+ parent_list = next(iter(self.parents.values()))
+ self.remove_parent(parent_list[0])
+ for child in list(self.children.values()):
+ child.remove_parent(self)
+
+ def remove_parent(self, parent_node: "Node"):
+ parent_node.children.pop(self.character)
+ parents_for_character = self.parents[parent_node.character]
+ parents_for_character.remove(parent_node)
+ if not parents_for_character:
+ self.parents.pop(parent_node.character)
+
+ def copy_fork_node(self, fork_node: "Node", child_to_avoid: Optional["Node"]):
+ """Shallow-copy a node's children.
+
+ When adding a new word, sometimes previously-joined suffixes aren't perfect
+ matches any more. When this happens, some nodes need to be "copied" out.
+ For all non-end nodes, there's a child to avoid in the shallow-copy.
+ """
+
+ for child in fork_node.children.values():
+ if child is not child_to_avoid:
+ self.add_child(child)
+
+ def is_fork(self):
+ """Check if this node has multiple parents"""
+
+ if len(self.parents) == 0:
+ return False
+
+ if len(self.parents) > 1:
+ return True
+
+ return len(next(iter(self.parents.values()))) > 1
+
+ def is_replacement_for_prefix_end_node(self, old: "Node"):
+ """Check if this node is a valid replacement for an old end node.
+
+ A node is a valid replacement if it maintains all existing child paths while
+ adding the new child path needed for the new word.
+
+ Args:
+ old: node being replaced
+
+ Returns: True if this node is a valid replacement node.
+ """
+
+ if len(self.children) != len(old.children) + 1:
+ return False
+
+ for character, other_node in old.children.items():
+ this_node = self.children.get(character)
+ if other_node is not this_node:
+ return False
+
+ return True
+
+ def is_replacement_for_prefix_node(self, old: "Node"):
+ """Check if this node is a valid replacement for a non-end node.
+
+ A node is a valid replacement if it:
+        * Has one new child that the old node doesn't have
+ * Is missing a child that the old node has
+ * Shares all other children
+
+ Returns: True if this node is a valid replacement node.
+ """
+
+ if len(self.children) != len(old.children):
+ return False
+
+ found_extra_child = False
+
+ for character, other_node in old.children.items():
+ this_node = self.children.get(character)
+ if other_node is not this_node:
+ if found_extra_child:
+                    # Found a second child in the old node that isn't in the
+                    # new one, so this isn't a valid replacement.
+ return False
+ else:
+ found_extra_child = True
+
+ return found_extra_child
+
+
+class SuffixCursor:
+ index: int # Current position of the cursor within the DAFSA.
+ node: Node
+
+ def __init__(self, index, node):
+ self.index = index
+ self.node = node
+
+ def _query(self, character: str, check: Callable[[Node], bool]):
+ for node in self.node.parents.get(character, []):
+ if check(node):
+ self.index -= 1
+ self.node = node
+ return True
+ return False
+
+ def find_single_child(self, character):
+ """Find the next matching suffix node that has a single child.
+
+ Return True if such a node is found."""
+ return self._query(character, lambda node: len(node.children) == 1)
+
+ def find_end_of_prefix_replacement(self, end_of_prefix: Node):
+ """Find the next matching suffix node that replaces the old prefix-end node.
+
+ Return True if such a node is found."""
+ return self._query(
+ end_of_prefix.character,
+ lambda node: node.is_replacement_for_prefix_end_node(end_of_prefix),
+ )
+
+ def find_inside_of_prefix_replacement(self, prefix_node: Node):
+ """Find the next matching suffix node that replaces a node within the prefix.
+
+ Return True if such a node is found."""
+ return self._query(
+ prefix_node.character,
+ lambda node: node.is_replacement_for_prefix_node(prefix_node),
+ )
+
+
+class DafsaAppendStateMachine:
+ """State machine for adding a word to a Dafsa.
+
+ Each state returns a function reference to the "next state". States should be
+ invoked until "None" is returned, in which case the new word has been appended.
+
+ The prefix and suffix indexes are placed according to the currently-known valid
+ value (not the next value being investigated). Additionally, they are 0-indexed
+ against the root node (which sits behind the beginning of the string).
+
+ Let's imagine we're at the following state when adding, for example, the
+ word "mozilla.org":
+
+ mozilla.org
+ ^ ^ ^ ^
+ | | | |
+ / | | \
+ [root] | | [end] node
+ node | \
+ | suffix
+ \
+ prefix
+
+ In this state, the furthest prefix match we could find was:
+ [root] - m - o - z - i - l
+ The index of the prefix match is "5".
+
+ Additionally, we've been looking for suffix nodes, and we've already found:
+ r - g - [end]
+ The current suffix index is "10".
+ The next suffix node we'll attempt to find is at index "9".
+ """
+
+ root_node: Node
+ prefix_index: int
+ suffix_cursor: SuffixCursor
+ stack: List[Node]
+ word: str
+ suffix_overlaps_prefix: bool
+ first_fork_index: Optional[int]
+ _state: Callable
+
+ def __init__(self, word, root_node, end_node):
+ self.root_node = root_node
+ self.prefix_index = 0
+ self.suffix_cursor = SuffixCursor(len(word) + 1, end_node)
+ self.stack = [root_node]
+ self.word = word
+ self.suffix_overlaps_prefix = False
+ self.first_fork_index = None
+ self._state = self._find_prefix
+
+ def run(self):
+ """Run this state machine to completion, adding the new word."""
+ while self._state is not None:
+ self._state = self._state()
+
+ def _find_prefix(self):
+ """Find the longest existing prefix that matches the current word."""
+ prefix_node = self.root_node
+ while self.prefix_index < len(self.word):
+ next_character = self.word[self.prefix_index]
+ next_node = prefix_node.children.get(next_character)
+ if not next_node:
+ # We're finished finding the prefix, let's find the longest suffix
+ # match now.
+ return self._find_suffix_nodes_after_prefix
+
+ self.prefix_index += 1
+ prefix_node = next_node
+ self.stack.append(next_node)
+
+ if not self.first_fork_index and next_node.is_fork():
+ self.first_fork_index = self.prefix_index
+
+ # Deja vu, we've appended this string before. Since this string has
+ # already been appended, we don't have to do anything.
+ return None
+
+ def _find_suffix_nodes_after_prefix(self):
+ """Find the chain of suffix nodes for characters after the prefix."""
+ while self.suffix_cursor.index - 1 > self.prefix_index:
+ # To fetch the next character, we need to subtract two from the current
+ # suffix index. This is because:
+ # * The next suffix node is 1 node before our current node (subtract 1)
+ # * The suffix index includes the root node before the beginning of the
+ # string - it's like the string is 1-indexed (subtract 1 again).
+ next_character = self.word[self.suffix_cursor.index - 2]
+ if not self.suffix_cursor.find_single_child(next_character):
+ return self._add_new_nodes
+
+ if self.suffix_cursor.node is self.stack[-1]:
+ # The suffix match is overlapping with the prefix! This can happen in
+ # cases like:
+ # * "ab"
+ # * "abb"
+ # The suffix cursor is at the same node as the prefix match, but they're
+ # at different positions in the word.
+ #
+ # [root] - a - b - [end]
+ # ^
+ # / \
+ # / \
+ # prefix suffix
+ # \ /
+ # \ /
+ # VV
+ # "abb"
+ if not self.first_fork_index:
+ # There hasn't been a fork, so our prefix isn't shared. So, we
+ # can mark this node as a fork, since the repetition means
+ # that there's two paths that are now using this node
+ self.first_fork_index = self.prefix_index
+                    return self._add_new_nodes
+
+ # Removes the link between the unique part of the prefix and the
+ # shared part of the prefix.
+ self.stack[self.first_fork_index].remove_parent(
+ self.stack[self.first_fork_index - 1]
+ )
+ self.suffix_overlaps_prefix = True
+
+ if self.first_fork_index is None:
+ return self._find_next_suffix_nodes
+ elif self.suffix_cursor.index - 1 == self.first_fork_index:
+ return self._find_next_suffix_node_at_prefix_end_at_fork
+ else:
+ return self._find_next_suffix_node_at_prefix_end_after_fork
+
+ def _find_next_suffix_node_at_prefix_end_at_fork(self):
+ """Find the next suffix node that replaces the end of the prefix.
+
+ In this state, the prefix_end node is the same as the first fork node.
+ Therefore, if a match can be found, the old prefix node can't be entirely
+ deleted since it's used elsewhere. Instead, just the link between our
+ unique prefix and the end of the fork is removed.
+ """
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ self.first_fork_index = None
+
+ if not self.suffix_overlaps_prefix:
+ existing_node.remove_parent(self.stack[self.prefix_index])
+ else:
+ # When the suffix overlaps the prefix, the old "parent link" was removed
+ # earlier in the "find_suffix_nodes_after_prefix" step.
+ self.suffix_overlaps_prefix = False
+
+ return self._find_next_suffix_nodes
+
+ def _find_next_suffix_node_at_prefix_end_after_fork(self):
+ """Find the next suffix node that replaces the end of the prefix.
+
+ In this state, the prefix_end node is after the first fork node.
+ Therefore, even if a match is found, we don't want to modify the replaced
+ prefix node since an unrelated word chain uses it.
+ """
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ if self.prefix_index == self.first_fork_index:
+ return self._find_next_suffix_node_within_prefix_at_fork
+ else:
+ return self._find_next_suffix_nodes_within_prefix_after_fork
+
+ def _find_next_suffix_node_within_prefix_at_fork(self):
+ """Find the next suffix node within a prefix.
+
+ In this state, we've already worked our way back and found nodes in the suffix
+ to replace prefix nodes after the fork node. We have now reached the fork node,
+ and if we find a replacement for it, then we can remove the link between it
+ and our then-unique prefix and clear the fork status.
+ """
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_inside_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ self.first_fork_index = None
+
+ if not self.suffix_overlaps_prefix:
+ existing_node.remove_parent(self.stack[self.prefix_index])
+ else:
+ # When the suffix overlaps the prefix, the old "parent link" was removed
+ # earlier in the "find_suffix_nodes_after_prefix" step.
+ self.suffix_overlaps_prefix = False
+
+ return self._find_next_suffix_nodes
+
+ def _find_next_suffix_nodes_within_prefix_after_fork(self):
+ """Find the next suffix nodes within a prefix.
+
+ Finds suffix nodes to replace prefix nodes, but doesn't modify the prefix
+ nodes since they're after a fork (so, we're sharing prefix nodes with
+ other words and can't modify them).
+ """
+ while True:
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_inside_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ if self.prefix_index == self.first_fork_index:
+ return self._find_next_suffix_node_within_prefix_at_fork
+
+ def _find_next_suffix_nodes(self):
+ """Find all remaining suffix nodes in the chain.
+
+        In this state, there's no longer any fork, so no other words are using
+        our current prefix. Therefore, as we work our way backwards and find
+        replacement nodes, we can remove the now-unused prefix nodes.
+ """
+ while True:
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ # This prefix node is wholly replaced by the new suffix node, so it can
+ # be deleted.
+ existing_node.remove()
+ self.prefix_index -= 1
+
+ def _add_new_nodes(self):
+ """Adds new nodes to support the new word.
+
+ Duplicates forked nodes to make room for new links, adds new nodes for new
+ characters, and splices the prefix to the suffix to finish embedding the new
+ word into the DAFSA.
+ """
+ if self.first_fork_index is not None:
+ front_node = _duplicate_fork_nodes(
+ self.stack,
+ self.first_fork_index,
+ self.prefix_index,
+                # if suffix_overlaps_prefix, the parent link was removed
+                # earlier in the word-adding process.
+ remove_parent_link=not self.suffix_overlaps_prefix,
+ )
+ else:
+ front_node = self.stack[self.prefix_index]
+
+ new_text = self.word[self.prefix_index : self.suffix_cursor.index - 1]
+ for character in new_text:
+ new_node = Node(character)
+ front_node.add_child(new_node)
+ front_node = new_node
+
+ front_node.add_child(self.suffix_cursor.node)
+ return None # Done!
+
+
+def _duplicate_fork_nodes(stack, fork_index, prefix_index, remove_parent_link=True):
+ parent_node = stack[fork_index - 1]
+ if remove_parent_link:
+ # remove link to old chain that we're going to be copying
+ stack[fork_index].remove_parent(parent_node)
+
+ for index in range(fork_index, prefix_index + 1):
+ fork_node = stack[index]
+ replacement_node = Node(fork_node.character)
+ child_to_avoid = None
+ if index < len(stack) - 1:
+ # We're going to be manually replacing the next node in the stack,
+ # so don't connect it as a child.
+ child_to_avoid = stack[index + 1]
+
+ replacement_node.copy_fork_node(fork_node, child_to_avoid)
+ parent_node.add_child(replacement_node)
+ parent_node = replacement_node
+
+ return parent_node
+
+
+class Dafsa:
+ root_node: Node
+ end_node: Node
+
+ def __init__(self):
+ self.root_node = Node(None, is_root_node=True)
+ self.end_node = Node(None, is_end_node=True)
+
+ @classmethod
+ def from_tld_data(cls, lines):
+ """Create a dafsa for TLD data.
+
+ TLD data has a domain and a "type" enum. The source data encodes the type as a
+ text number, but the dafsa-consuming code assumes that the type is a raw byte
+ number (e.g.: "1" => 0x01).
+
+ This function acts as a helper, processing this TLD detail before creating a
+ standard dafsa.
+ """
+
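+        # For example, a hypothetical input line "mozilla.org1" becomes the
+        # word "mozilla.org\x01" before being appended.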
+ dafsa = cls()
+        for word in lines:
+ domain_number = word[-1]
+ # Convert type from string to byte representation
+ raw_domain_number = chr(ord(domain_number) & 0x0F)
+
+ word = "%s%s" % (word[:-1], raw_domain_number)
+ dafsa.append(word)
+ return dafsa
+
+ def append(self, word):
+ state_machine = DafsaAppendStateMachine(word, self.root_node, self.end_node)
+ state_machine.run()
diff --git a/xpcom/ds/tools/make_dafsa.py b/xpcom/ds/tools/make_dafsa.py
new file mode 100644
index 0000000000..a9cedf78a8
--- /dev/null
+++ b/xpcom/ds/tools/make_dafsa.py
@@ -0,0 +1,372 @@
+#!/usr/bin/env python
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+A Deterministic acyclic finite state automaton (DAFSA) is a compact
+representation of an unordered word list (dictionary).
+
+http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
+
+This Python program fetches strings and return values from a gperf file and
+generates a C++ file with a byte array representing the graph, which can be
+used as a memory-efficient replacement for a perfect hash table.
+
+The input strings are assumed to consist of printable 7-bit ASCII characters
+and the return values are assumed to be one-digit integers.
+
+In this program a DAFSA is a diamond shaped graph starting at a common
+root node and ending at a common end node. All internal nodes contain
+a character and each word is represented by the characters in one path from
+the root node to the end node.
+
+The order of the operations is crucial since lookups will be performed
+starting from the source with no backtracking. Thus a node must have at
+most one child with a label starting with the same character. The output
+is also arranged so that all jumps are to increasing addresses, thus forward
+in memory.
+
+The generated output uses suffix-free decoding: the leading bits of a link
+(a reference to a child node) indicate whether it is one, two or three bytes
+in size and whether it is the last outgoing link from the current node.
+A node label is terminated by a byte with the leading bit set.
+
+The generated byte array can be described by the following BNF:
+
+<byte> ::= < 8-bit value in range [0x00-0xFF] >
+
+<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
+<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
+<return_value> ::= < value + 0x80, byte in range [0x80-0x8F] >
+
+<offset1> ::= < byte in range [0x00-0x3F] >
+<offset2> ::= < byte in range [0x40-0x5F] >
+<offset3> ::= < byte in range [0x60-0x7F] >
+
+<end_offset1> ::= < byte in range [0x80-0xBF] >
+<end_offset2> ::= < byte in range [0xC0-0xDF] >
+<end_offset3> ::= < byte in range [0xE0-0xFF] >
+
+<prefix> ::= <char>
+
+<label> ::= <end_char>
+ | <char> <label>
+
+<end_label> ::= <return_value>
+ | <char> <end_label>
+
+<offset> ::= <offset1>
+ | <offset2> <byte>
+ | <offset3> <byte> <byte>
+
+<end_offset> ::= <end_offset1>
+ | <end_offset2> <byte>
+ | <end_offset3> <byte> <byte>
+
+<offsets> ::= <end_offset>
+ | <offset> <offsets>
+
+<source> ::= <offsets>
+
+<node> ::= <label> <offsets>
+ | <prefix> <node>
+ | <end_label>
+
+<dafsa> ::= <source>
+ | <dafsa> <node>
+
+Decoding:
+
+<char> -> printable 7-bit ASCII character
+<end_char> & 0x7F -> printable 7-bit ASCII character
+<return_value> & 0x0F -> integer
+<offset1> & 0x3F -> integer
+((<offset2> & 0x1F) << 8) + <byte> -> integer
+((<offset3> & 0x1F) << 16) + (<byte> << 8) + <byte> -> integer
+
+end_offset1, end_offset2 and end_offset3 are decoded the same as offset1,
+offset2 and offset3 respectively.
+
+The first offset in a list of offsets is the distance in bytes between the
+offset itself and the first child node. Subsequent offsets are the distance
+between previous child node and next child node. Thus each offset links a node
+to a child node. The distance is always counted between start addresses, i.e.
+first byte in decoded offset or first byte in child node.
+
+Example 1:
+
+%%
+aa, 1
+a, 2
+%%
+
+The input is first parsed to a list of words:
+["aa1", "a2"]
+
+This produces the following graph:
+[root] --- a --- 0x02 --- [end]
+ | /
+ | /
+ - a --- 0x01
+
+A C++ representation of the compressed graph is generated:
+
+const unsigned char dafsa[7] = {
+ 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
+};
+
+The bytes in the generated array have the following meanings:
+
+ 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
+
+ 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
+ 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
+
+ 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
+ 4: 0x82 <return_value> 0x82 & 0x0F -> return 2
+
+ 5: 0x61 <char> label character 0x61 -> match "a"
+ 6: 0x81 <return_value> 0x81 & 0x0F -> return 1
+
+Example 2:
+
+%%
+aa, 1
+bbb, 2
+baa, 1
+%%
+
+The input is first parsed to a list of words:
+["aa1", "bbb2", "baa1"]
+
+This produces the following graph:
+[root] --- a --- a --- 0x01 --- [end]
+ | / / /
+ | / / /
+ - b --- b --- b --- 0x02
+
+A C++ representation of the compressed graph is generated:
+
+const unsigned char dafsa[11] = {
+ 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
+};
+
+The bytes in the generated array have the following meanings:
+
+ 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
+ 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
+
+ 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
+ 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
+ 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
+
+ 5: 0x61 <char> label character 0x61 -> match "a"
+ 6: 0x61 <char> label character 0x61 -> match "a"
+ 7: 0x81 <return_value> 0x81 & 0x0F -> return 1
+
+ 8: 0x62 <char> label character 0x62 -> match "b"
+ 9: 0x62 <char> label character 0x62 -> match "b"
+10: 0x82 <return_value> 0x82 & 0x0F -> return 2
+"""
+import struct
+import sys
+
+from incremental_dafsa import Dafsa, Node
+
+
+class InputError(Exception):
+ """Exception raised for errors in the input file."""
+
+
+def top_sort(dafsa: Dafsa):
+ """Generates list of nodes in topological sort order."""
+ incoming = {}
+
+ def count_incoming(node: Node):
+ """Counts incoming references."""
+ if not node.is_end_node:
+ if id(node) not in incoming:
+ incoming[id(node)] = 1
+ for child in node.children.values():
+ count_incoming(child)
+ else:
+ incoming[id(node)] += 1
+
+ for node in dafsa.root_node.children.values():
+ count_incoming(node)
+
+ for node in dafsa.root_node.children.values():
+ incoming[id(node)] -= 1
+
+ waiting = [
+ node for node in dafsa.root_node.children.values() if incoming[id(node)] == 0
+ ]
+ nodes = []
+
+ while waiting:
+ node = waiting.pop()
+ assert incoming[id(node)] == 0
+ nodes.append(node)
+ for child in node.children.values():
+ if not child.is_end_node:
+ incoming[id(child)] -= 1
+ if incoming[id(child)] == 0:
+ waiting.append(child)
+ return nodes
+
+
+def encode_links(node: Node, offsets, current):
+ """Encodes a list of children as one, two or three byte offsets."""
+    assert node.children
+    if next(iter(node.children.values())).is_end_node:
+        # This is an <end_label> node and no links follow such nodes
+        return []
+    guess = 3 * len(node.children)
+
+ children = sorted(node.children.values(), key=lambda x: -offsets[id(x)])
+ while True:
+ offset = current + guess
+ buf = []
+ for child in children:
+ last = len(buf)
+ distance = offset - offsets[id(child)]
+ assert distance > 0 and distance < (1 << 21)
+
+ if distance < (1 << 6):
+ # A 6-bit offset: "s0xxxxxx"
+ buf.append(distance)
+ elif distance < (1 << 13):
+ # A 13-bit offset: "s10xxxxxxxxxxxxx"
+ buf.append(0x40 | (distance >> 8))
+ buf.append(distance & 0xFF)
+ else:
+ # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
+ buf.append(0x60 | (distance >> 16))
+ buf.append((distance >> 8) & 0xFF)
+ buf.append(distance & 0xFF)
+            # The distance in the first link is relative to the following record.
+            # Distances in subsequent links are relative to the previous link.
+ offset -= distance
+ if len(buf) == guess:
+ break
+ guess = len(buf)
+ # Set most significant bit to mark end of links in this node.
+ buf[last] |= 1 << 7
+ buf.reverse()
+ return buf
+
+
+def encode_prefix(label):
+ """Encodes a node label as a list of bytes without a trailing high byte.
+
+ This method encodes a node if there is exactly one child and the
+ child follows immediately after so that no jump is needed. This label
+ will then be a prefix to the label in the child node.
+ """
+ assert label
+ return [ord(c) for c in reversed(label)]
+
+
+def encode_label(label):
+ """Encodes a node label as a list of bytes with a trailing high byte >0x80."""
+ buf = encode_prefix(label)
+ # Set most significant bit to mark end of label in this node.
+ buf[0] |= 1 << 7
+ return buf
+
+
+def encode(dafsa: Dafsa):
+ """Encodes a DAFSA to a list of bytes"""
+ output = []
+ offsets = {}
+
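+    # Nodes are visited in reverse topological order so that every child is
+    # encoded before its parents; the buffer is reversed at the end, so all
+    # links point to increasing addresses, as the decoder requires.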
+ for node in reversed(top_sort(dafsa)):
+ if (
+ len(node.children) == 1
+ and not next(iter(node.children.values())).is_end_node
+ and (offsets[id(next(iter(node.children.values())))] == len(output))
+ ):
+ output.extend(encode_prefix(node.character))
+ else:
+ output.extend(encode_links(node, offsets, len(output)))
+ output.extend(encode_label(node.character))
+ offsets[id(node)] = len(output)
+
+ output.extend(encode_links(dafsa.root_node, offsets, len(output)))
+ output.reverse()
+ return output
+
+
+def to_cxx(data, preamble=None):
+ """Generates C++ code from a list of encoded bytes."""
+ text = "/* This file is generated. DO NOT EDIT!\n\n"
+ text += "The byte array encodes a dictionary of strings and values. See "
+ text += "make_dafsa.py for documentation."
+ text += "*/\n\n"
+
+ if preamble:
+ text += preamble
+ text += "\n\n"
+
+ text += "const unsigned char kDafsa[%s] = {\n" % len(data)
+ for i in range(0, len(data), 12):
+ text += " "
+ text += ", ".join("0x%02x" % byte for byte in data[i : i + 12])
+ text += ",\n"
+ text += "};\n"
+ return text
+
+
+def words_to_cxx(words, preamble=None):
+ """Generates C++ code from a word list"""
+ dafsa = Dafsa.from_tld_data(words)
+ return to_cxx(encode(dafsa), preamble)
+
+
+def words_to_bin(words):
+ """Generates bytes from a word list"""
+ dafsa = Dafsa.from_tld_data(words)
+ data = encode(dafsa)
+ return struct.pack("%dB" % len(data), *data)
+
+
+def parse_gperf(infile):
+ """Parses gperf file and extract strings and return code"""
+ lines = [line.strip() for line in infile]
+
+ # Extract the preamble.
+    first_delimiter = lines.index("%%")
+    preamble = "\n".join(lines[0:first_delimiter])
+
+ # Extract strings after the first '%%' and before the second '%%'.
+    begin = first_delimiter + 1
+ end = lines.index("%%", begin)
+ lines = lines[begin:end]
+ for line in lines:
+ if line[-3:-1] != ", ":
+ raise InputError('Expected "domainname, <digit>", found "%s"' % line)
+ # Technically the DAFSA format could support return values in range [0-31],
+        # but the values below are the only ones with a defined meaning.
+ if line[-1] not in "0124":
+ raise InputError(
+ 'Expected value to be one of {0,1,2,4}, found "%s"' % line[-1]
+ )
+ return (preamble, [line[:-3] + line[-1] for line in lines])
+
+
+def main(outfile, infile):
+ with open(infile, "r") as infile:
+ preamble, words = parse_gperf(infile)
+ outfile.write(words_to_cxx(words, preamble))
+ return 0
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 3:
+ print("usage: %s infile outfile" % sys.argv[0])
+ sys.exit(1)
+
+ with open(sys.argv[2], "w") as outfile:
+ sys.exit(main(outfile, sys.argv[1]))
diff --git a/xpcom/ds/tools/perfecthash.py b/xpcom/ds/tools/perfecthash.py
new file mode 100644
index 0000000000..929b643a30
--- /dev/null
+++ b/xpcom/ds/tools/perfecthash.py
@@ -0,0 +1,377 @@
+# perfecthash.py - Helper for generating perfect hash functions
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+A perfect hash function (PHF) is a function which maps distinct elements from a
+source set to a sequence of integers with no collisions.
+
+PHFs generated by this module use a 32-bit FNV hash to index an intermediate
+table. The value from that table is then used to run a second 32-bit FNV hash to
+get a final index.
+
+The algorithm starts with the bucket containing the most conflicts and tests
+intermediate table values until no conflicts remain, allowing efficient
+index lookup at runtime. (see also: http://stevehanov.ca/blog/index.php?id=119)
+
+NOTE: This perfect hash generator was designed to be simple, easy to follow, and
+maintainable, rather than being as optimized as possible, due to the relatively
+small dataset it was designed for. In the future we may want to optimize further.
+"""
+
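+# A minimal usage sketch (hypothetical entries; the second argument is the
+# intermediate table size, which trades generation time against output size):
+#
+#     phf = PerfectHash([(b"foo", 1), (b"bar", 2), (b"baz", 3)], 4)
+#     phf.get_entry(b"bar")   # -> (b"bar", 2)
+#     phf.get_entry(b"nope")  # -> None
+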
+import textwrap
+from collections import namedtuple
+
+import six
+from mozbuild.util import ensure_bytes
+
+
+# Iteration over bytestrings works differently in Python 2 and 3; this function
+# captures the two possibilities. Returns an 'int' given the output of iterating
+# through a bytestring regardless of the input.
+def _ord(c):
+ if six.PY3:
+ return c
+ return ord(c)
+
+
+class PerfectHash(object):
+ """PerfectHash objects represent a computed perfect hash function, which
+ can be generated at compile time to provide highly efficient and compact
+ static HashTables.
+
+ Consumers must provide an intermediate table size to store the generated
+ hash function. Larger tables will generate more quickly, while smaller
+ tables will consume less space on disk."""
+
+ # 32-bit FNV offset basis and prime value.
+ # NOTE: Must match values in |PerfectHash.h|
+ FNV_OFFSET_BASIS = 0x811C9DC5
+ FNV_PRIME = 16777619
+ U32_MAX = 0xFFFFFFFF
+
+ # Bucket of entries which map into a given intermediate index.
+ Bucket = namedtuple("Bucket", "index entries")
+
+ def __init__(self, entries, size, validate=True, key=lambda e: e[0]):
+ """Create a new PerfectHash
+
+ @param entries set of entries to generate a PHF for
+ @param size size of the PHF intermediate table
+ @param validate test the generated hash function after generation
+ @param key function to get 'memoryview'-compatible key for an
+ entry. defaults to extracting the first element of an
+ entry 2-tuple.
+ """
+
+ assert 0 < len(entries) < self.U32_MAX, "bad # of entries!"
+ self._key = key
+
+ # Allocate the intermediate table and keys.
+ self.table = [0] * size
+ self.entries = [None] * len(entries)
+
+        # A list of Bucket values. Each bucket contains the index into the
+ # intermediate table, and a list of Entries which map into that bucket.
+ buckets = [self.Bucket(idx, []) for idx in range(size)]
+
+ # Determine which input strings map to which buckets in the table.
+ for entry in entries:
+ assert entry is not None, "None entry in entries"
+ assert self.key(entry).itemsize == 1, "non-byte key elements"
+ buckets[self._hash(self.key(entry)) % size].entries.append(entry)
+
+ # Sort the buckets such that the largest one comes first.
+ buckets.sort(key=lambda b: len(b.entries), reverse=True)
+
+ for bucket in buckets:
+ # Once we've reached an empty bucket, we're done.
+ if len(bucket.entries) == 0:
+ break
+
+ # Try values for the basis until we find one with no conflicts.
+ idx = 0
+ basis = 1
+ slots = []
+ while idx < len(bucket.entries):
+ slot = self._hash(self.key(bucket.entries[idx]), basis) % len(
+ self.entries
+ )
+ if self.entries[slot] is not None or slot in slots:
+ # There was a conflict, try the next basis.
+ basis += 1
+ idx = 0
+ del slots[:]
+ assert basis < self.U32_MAX, "table too small"
+ else:
+ slots.append(slot)
+ idx += 1
+
+ assert basis < self.U32_MAX
+
+ # We've found a basis which doesn't conflict
+ self.table[bucket.index] = basis
+ for slot, entry in zip(slots, bucket.entries):
+ self.entries[slot] = entry
+
+ # Validate that looking up each key succeeds
+ if validate:
+ for entry in entries:
+ assert self.get_entry(self.key(entry)), "get_entry(%s)" % repr(entry)
+
+ @classmethod
+ def _hash(cls, key, basis=FNV_OFFSET_BASIS):
+ """A basic FNV-based hash function. key is the memoryview to hash.
+ 32-bit FNV is used for indexing into the first table, and the value
+ stored in that table is used as the offset basis for indexing into the
+ values table."""
+ for byte in memoryview(ensure_bytes(key)):
+ obyte = _ord(byte)
+ basis ^= obyte # xor-in the byte
+ basis *= cls.FNV_PRIME # Multiply by the FNV prime
+ basis &= cls.U32_MAX # clamp to 32-bits
+ return basis
+
+ def key(self, entry):
+ return memoryview(ensure_bytes(self._key(entry)))
+
+ def get_raw_index(self, key):
+ """Determine the index in self.entries without validating"""
+ mid = self.table[self._hash(key) % len(self.table)]
+ return self._hash(key, mid) % len(self.entries)
+
+ def get_index(self, key):
+ """Given a key, determine the index in self.entries"""
+ idx = self.get_raw_index(key)
+ if memoryview(ensure_bytes(key)) != self.key(self.entries[idx]):
+ return None
+ return idx
+
+ def get_entry(self, key):
+ """Given a key, get the corresponding entry"""
+ idx = self.get_index(key)
+ if idx is None:
+ return None
+ return self.entries[idx]
+
+ @staticmethod
+ def _indent(text, amount=1):
+ return text.replace("\n", "\n" + (" " * amount))
+
+ def codegen(self, name, entry_type):
+ """Create a helper for codegening PHF logic"""
+ return CGHelper(self, name, entry_type)
+
+ def cxx_codegen(
+ self,
+ name,
+ entry_type,
+ lower_entry,
+ entries_name=None,
+ return_type=None,
+ return_entry="return entry;",
+ key_type="const char*",
+ key_bytes="aKey",
+ key_length="strlen(aKey)",
+ ):
+ """Generate complete C++ code for a get_entry-style method.
+
+ @param name Name for the entry getter function.
+ @param entry_type C++ type of each entry in static memory.
+ @param lower_entry Function called with each entry to convert it to a
+ C++ literal of type 'entry_type'.
+
+ # Optional Keyword Parameters:
+ @param entries_name Name for the generated entry table.
+
+ @param return_type C++ return type, default: 'const entry_type&'
+ @param return_entry Override the default behaviour for returning the
+ found entry. 'entry' is a reference to the found
+ entry, and 'aKey' is the lookup key. 'return_entry'
+ can be used for additional checks, e.g. for keys
+ not in the table.
+
+ @param key_type C++ key type, default: 'const char*'
+ @param key_bytes 'const char*' expression to get bytes for 'aKey'
+ @param key_length 'size_t' expression to get length of 'aKey'"""
+
+ if entries_name is None:
+ entries_name = "s%sEntries" % name
+
+ cg = self.codegen(entries_name, entry_type)
+ entries = cg.gen_entries(lower_entry)
+ getter = cg.gen_getter(
+ name, return_type, return_entry, key_type, key_bytes, key_length
+ )
+
+ return "%s\n\n%s" % (entries, getter)
+
+
+class CGHelper(object):
+ """Helper object for generating C++ code for a PerfectHash.
+ Created using PerfectHash.codegen()."""
+
+ def __init__(self, phf, entries_name, entry_type):
+ self.phf = phf
+ self.entries_name = entries_name
+ self.entry_type = entry_type
+
+ @staticmethod
+ def _indent(text, amount=1):
+ return text.replace("\n", "\n" + (" " * amount))
+
+ def basis_ty(self):
+ """Determine how big of an integer is needed for bases table."""
+ max_base = max(self.phf.table)
+ if max_base <= 0xFF:
+ return "uint8_t"
+ elif max_base <= 0xFFFF:
+ return "uint16_t"
+ return "uint32_t"
+
+ def basis_table(self, name="BASES"):
+ """Generate code for a static basis table"""
+ bases = ""
+ for idx, base in enumerate(self.phf.table):
+ if idx and idx % 16 == 0: # 16 bases per line
+ bases += "\n "
+ bases += "%4d," % base
+ return (
+ textwrap.dedent(
+ """\
+ static const %s %s[] = {
+ %s
+ };
+ """
+ )
+ % (self.basis_ty(), name, bases)
+ )
+
+ def gen_entries(self, lower_entry):
+ """Generate code for an entries table"""
+ entries = self._indent(
+ ",\n".join(lower_entry(entry).rstrip() for entry in self.phf.entries)
+ )
+ return (
+ textwrap.dedent(
+ """\
+ const %s %s[] = {
+ %s
+ };
+ """
+ )
+ % (self.entry_type, self.entries_name, entries)
+ )
+
+ def gen_getter(
+ self,
+ name,
+ return_type=None,
+ return_entry="return entry;",
+ key_type="const char*",
+ key_bytes="aKey",
+ key_length="strlen(aKey)",
+ ):
+ """Generate the code for a C++ getter.
+
+ @param name Name for the entry getter function.
+ @param return_type C++ return type, default: 'const entry_type&'
+ @param return_entry Override the default behaviour for returning the
+ found entry. 'entry' is a reference to the found
+ entry, and 'aKey' is the lookup key. 'return_entry'
+ can be used for additional checks, e.g. for keys
+ not in the table.
+
+ @param key_type C++ key type, default: 'const char*'
+ @param key_bytes 'const char*' expression to get bytes for 'aKey'
+ @param key_length 'size_t' expression to get length of 'aKey'"""
+
+ if return_type is None:
+ return_type = "const %s&" % self.entry_type
+
+ return (
+ textwrap.dedent(
+ """
+ %(return_type)s
+ %(name)s(%(key_type)s aKey)
+ {
+ %(basis_table)s
+
+ const char* bytes = %(key_bytes)s;
+ size_t length = %(key_length)s;
+ auto& entry = mozilla::perfecthash::Lookup(bytes, length, BASES,
+ %(entries_name)s);
+ %(return_entry)s
+ }
+ """
+ )
+ % {
+ "name": name,
+ "basis_table": self._indent(self.basis_table()),
+ "entries_name": self.entries_name,
+ "return_type": return_type,
+ "return_entry": self._indent(return_entry),
+ "key_type": key_type,
+ "key_bytes": key_bytes,
+ "key_length": key_length,
+ }
+ )
+
+ def gen_jslinearstr_getter(
+ self, name, return_type=None, return_entry="return entry;"
+ ):
+ """Generate code for a specialized getter taking JSLinearStrings.
+ This getter avoids copying the JS string, but only supports ASCII keys.
+
+ @param name Name for the entry getter function.
+ @param return_type C++ return type, default: 'const entry_type&'
+ @param return_entry Override the default behaviour for returning the
+ found entry. 'entry' is a reference to the found
+ entry, and 'aKey' is the lookup key. 'return_entry'
+ can be used for additional checks, e.g. for keys
+ not in the table."""
+
+ assert all(
+ _ord(b) <= 0x7F for e in self.phf.entries for b in self.phf.key(e)
+ ), "non-ASCII key"
+
+ if return_type is None:
+ return_type = "const %s&" % self.entry_type
+
+ return (
+ textwrap.dedent(
+ """
+ %(return_type)s
+ %(name)s(JSLinearString* aKey)
+ {
+ %(basis_table)s
+
+ size_t length = JS::GetLinearStringLength(aKey);
+
+ JS::AutoCheckCannotGC nogc;
+ if (JS::LinearStringHasLatin1Chars(aKey)) {
+ auto& entry = mozilla::perfecthash::Lookup(
+ JS::GetLatin1LinearStringChars(nogc, aKey),
+ length, BASES, %(entries_name)s);
+
+ %(return_entry)s
+ } else {
+ auto& entry = mozilla::perfecthash::Lookup(
+ JS::GetTwoByteLinearStringChars(nogc, aKey),
+ length, BASES, %(entries_name)s);
+
+ %(return_entry)s
+ }
+ }
+ """
+ )
+ % {
+ "name": name,
+ "basis_table": self._indent(self.basis_table()),
+ "entries_name": self.entries_name,
+ "return_type": return_type,
+ "return_entry": self._indent(return_entry, 2),
+ }
+ )