Diffstat (limited to 'xpcom/ds/tools')
-rw-r--r-- | xpcom/ds/tools/incremental_dafsa.py | 509
-rw-r--r-- | xpcom/ds/tools/make_dafsa.py        | 372
-rw-r--r-- | xpcom/ds/tools/perfecthash.py       | 377
3 files changed, 1258 insertions, 0 deletions
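Taken together, these tools turn a gperf-style word list into C++ lookup structures: make_dafsa.py builds a DAFSA with the incremental algorithm in incremental_dafsa.py and serializes it as a byte array, while perfecthash.py generates static perfect hash tables. A minimal sketch of the intended flow, assuming incremental_dafsa.py is importable and using a hypothetical input file name:

    from make_dafsa import parse_gperf, words_to_cxx

    # Parse the gperf-style input and emit the C++ byte-array representation.
    with open("dafsa_input.gperf") as infile:  # hypothetical file name
        preamble, words = parse_gperf(infile)
    print(words_to_cxx(words, preamble))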
diff --git a/xpcom/ds/tools/incremental_dafsa.py b/xpcom/ds/tools/incremental_dafsa.py
new file mode 100644
index 0000000000..1157c26850
--- /dev/null
+++ b/xpcom/ds/tools/incremental_dafsa.py
@@ -0,0 +1,509 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+Incremental algorithm for creating a "deterministic acyclic finite state
+automaton" (DAFSA). At the time this algorithm was written, there was existing
+logic that depended on a different format for the DAFSA, so this contains
+convenience functions for converting to a compatible structure. This legacy
+format is defined in make_dafsa.py.
+"""
+
+from typing import Callable, Dict, List, Optional
+
+
+class Node:
+    children: Dict[str, "Node"]
+    parents: Dict[str, List["Node"]]
+    character: str
+    is_root_node: bool
+    is_end_node: bool
+
+    def __init__(self, character, is_root_node=False, is_end_node=False):
+        self.children = {}
+        self.parents = {}
+        self.character = character
+        self.is_root_node = is_root_node
+        self.is_end_node = is_end_node
+
+    def __str__(self):
+        """Produce a helpful string representation of this node.
+
+        This is expected to only be used for debugging.
+        The produced output is:
+
+        "c[def.] <123>"
+         ^  ^      ^
+         |  |      Internal python ID of the node (used for de-duping)
+         |  |
+         |  One possible path through the tree to the end
+         |
+         Current node character
+        """
+
+        if self.is_root_node:
+            return "<root>"
+        elif self.is_end_node:
+            return "<end>"
+
+        first_potential_match = ""
+        node = self
+        while node.children:
+            first_character = next(iter(node.children))
+            if first_character:
+                first_potential_match += first_character
+            node = node.children[first_character]
+
+        return "%s[%s] <%d>" % (self.character, first_potential_match, id(self))
+
+    def add_child(self, child):
+        self.children[child.character] = child
+        child.parents.setdefault(self.character, [])
+        child.parents[self.character].append(self)
+
+    def remove(self):
+        # remove() must only be called when this node has only a single parent, and that
+        # parent doesn't need this child anymore.
+        # The caller is expected to have performed this validation.
+        # (placing asserts here adds a non-trivial performance hit)
+
+        # There's only a single parent, so only one list should be in the "parents" map
+        parent_list = next(iter(self.parents.values()))
+        self.remove_parent(parent_list[0])
+        for child in list(self.children.values()):
+            child.remove_parent(self)
+
+    def remove_parent(self, parent_node: "Node"):
+        parent_node.children.pop(self.character)
+        parents_for_character = self.parents[parent_node.character]
+        parents_for_character.remove(parent_node)
+        if not parents_for_character:
+            self.parents.pop(parent_node.character)
+
+    def copy_fork_node(self, fork_node: "Node", child_to_avoid: Optional["Node"]):
+        """Shallow-copy a node's children.
+
+        When adding a new word, sometimes previously-joined suffixes aren't perfect
+        matches any more. When this happens, some nodes need to be "copied" out.
+        For all non-end nodes, there's a child to avoid in the shallow-copy.
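+
+        For example, if the fork node's children are {"a": A, "b": B} and
+        child_to_avoid is B, only A is attached to this node; the caller then
+        supplies its own replacement for B.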
+ """ + + for child in fork_node.children.values(): + if child is not child_to_avoid: + self.add_child(child) + + def is_fork(self): + """Check if this node has multiple parents""" + + if len(self.parents) == 0: + return False + + if len(self.parents) > 1: + return True + + return len(next(iter(self.parents.values()))) > 1 + + def is_replacement_for_prefix_end_node(self, old: "Node"): + """Check if this node is a valid replacement for an old end node. + + A node is a valid replacement if it maintains all existing child paths while + adding the new child path needed for the new word. + + Args: + old: node being replaced + + Returns: True if this node is a valid replacement node. + """ + + if len(self.children) != len(old.children) + 1: + return False + + for character, other_node in old.children.items(): + this_node = self.children.get(character) + if other_node is not this_node: + return False + + return True + + def is_replacement_for_prefix_node(self, old: "Node"): + """Check if this node is a valid replacement for a non-end node. + + A node is a valid replacement if it: + * Has one new child that the old node doesn't + * Is missing a child that the old node has + * Shares all other children + + Returns: True if this node is a valid replacement node. + """ + + if len(self.children) != len(old.children): + return False + + found_extra_child = False + + for character, other_node in old.children.items(): + this_node = self.children.get(character) + if other_node is not this_node: + if found_extra_child: + # Found two children in the old node that aren't in the new one, + # this isn't a valid replacement + return False + else: + found_extra_child = True + + return found_extra_child + + +class SuffixCursor: + index: int # Current position of the cursor within the DAFSA. + node: Node + + def __init__(self, index, node): + self.index = index + self.node = node + + def _query(self, character: str, check: Callable[[Node], bool]): + for node in self.node.parents.get(character, []): + if check(node): + self.index -= 1 + self.node = node + return True + return False + + def find_single_child(self, character): + """Find the next matching suffix node that has a single child. + + Return True if such a node is found.""" + return self._query(character, lambda node: len(node.children) == 1) + + def find_end_of_prefix_replacement(self, end_of_prefix: Node): + """Find the next matching suffix node that replaces the old prefix-end node. + + Return True if such a node is found.""" + return self._query( + end_of_prefix.character, + lambda node: node.is_replacement_for_prefix_end_node(end_of_prefix), + ) + + def find_inside_of_prefix_replacement(self, prefix_node: Node): + """Find the next matching suffix node that replaces a node within the prefix. + + Return True if such a node is found.""" + return self._query( + prefix_node.character, + lambda node: node.is_replacement_for_prefix_node(prefix_node), + ) + + +class DafsaAppendStateMachine: + """State machine for adding a word to a Dafsa. + + Each state returns a function reference to the "next state". States should be + invoked until "None" is returned, in which case the new word has been appended. + + The prefix and suffix indexes are placed according to the currently-known valid + value (not the next value being investigated). Additionally, they are 0-indexed + against the root node (which sits behind the beginning of the string). 
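+
+    For a word of length n, the root node therefore sits at index 0 and the
+    end node at index n + 1, which is why the suffix cursor starts out at
+    len(word) + 1.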
+ + Let's imagine we're at the following state when adding, for example, the + word "mozilla.org": + + mozilla.org + ^ ^ ^ ^ + | | | | + / | | \ + [root] | | [end] node + node | \ + | suffix + \ + prefix + + In this state, the furthest prefix match we could find was: + [root] - m - o - z - i - l + The index of the prefix match is "5". + + Additionally, we've been looking for suffix nodes, and we've already found: + r - g - [end] + The current suffix index is "10". + The next suffix node we'll attempt to find is at index "9". + """ + + root_node: Node + prefix_index: int + suffix_cursor: SuffixCursor + stack: List[Node] + word: str + suffix_overlaps_prefix: bool + first_fork_index: Optional[int] + _state: Callable + + def __init__(self, word, root_node, end_node): + self.root_node = root_node + self.prefix_index = 0 + self.suffix_cursor = SuffixCursor(len(word) + 1, end_node) + self.stack = [root_node] + self.word = word + self.suffix_overlaps_prefix = False + self.first_fork_index = None + self._state = self._find_prefix + + def run(self): + """Run this state machine to completion, adding the new word.""" + while self._state is not None: + self._state = self._state() + + def _find_prefix(self): + """Find the longest existing prefix that matches the current word.""" + prefix_node = self.root_node + while self.prefix_index < len(self.word): + next_character = self.word[self.prefix_index] + next_node = prefix_node.children.get(next_character) + if not next_node: + # We're finished finding the prefix, let's find the longest suffix + # match now. + return self._find_suffix_nodes_after_prefix + + self.prefix_index += 1 + prefix_node = next_node + self.stack.append(next_node) + + if not self.first_fork_index and next_node.is_fork(): + self.first_fork_index = self.prefix_index + + # Deja vu, we've appended this string before. Since this string has + # already been appended, we don't have to do anything. + return None + + def _find_suffix_nodes_after_prefix(self): + """Find the chain of suffix nodes for characters after the prefix.""" + while self.suffix_cursor.index - 1 > self.prefix_index: + # To fetch the next character, we need to subtract two from the current + # suffix index. This is because: + # * The next suffix node is 1 node before our current node (subtract 1) + # * The suffix index includes the root node before the beginning of the + # string - it's like the string is 1-indexed (subtract 1 again). + next_character = self.word[self.suffix_cursor.index - 2] + if not self.suffix_cursor.find_single_child(next_character): + return self._add_new_nodes + + if self.suffix_cursor.node is self.stack[-1]: + # The suffix match is overlapping with the prefix! This can happen in + # cases like: + # * "ab" + # * "abb" + # The suffix cursor is at the same node as the prefix match, but they're + # at different positions in the word. + # + # [root] - a - b - [end] + # ^ + # / \ + # / \ + # prefix suffix + # \ / + # \ / + # VV + # "abb" + if not self.first_fork_index: + # There hasn't been a fork, so our prefix isn't shared. So, we + # can mark this node as a fork, since the repetition means + # that there's two paths that are now using this node + self.first_fork_index = self.prefix_index + return self._add_new_nodes + + # Removes the link between the unique part of the prefix and the + # shared part of the prefix. 
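+                # _add_new_nodes re-creates the unique prefix when it
+                # duplicates the fork nodes, so this link must not be removed
+                # a second time (see remove_parent_link in _add_new_nodes).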
+ self.stack[self.first_fork_index].remove_parent( + self.stack[self.first_fork_index - 1] + ) + self.suffix_overlaps_prefix = True + + if self.first_fork_index is None: + return self._find_next_suffix_nodes + elif self.suffix_cursor.index - 1 == self.first_fork_index: + return self._find_next_suffix_node_at_prefix_end_at_fork + else: + return self._find_next_suffix_node_at_prefix_end_after_fork + + def _find_next_suffix_node_at_prefix_end_at_fork(self): + """Find the next suffix node that replaces the end of the prefix. + + In this state, the prefix_end node is the same as the first fork node. + Therefore, if a match can be found, the old prefix node can't be entirely + deleted since it's used elsewhere. Instead, just the link between our + unique prefix and the end of the fork is removed. + """ + existing_node = self.stack[self.prefix_index] + if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node): + return self._add_new_nodes + + self.prefix_index -= 1 + self.first_fork_index = None + + if not self.suffix_overlaps_prefix: + existing_node.remove_parent(self.stack[self.prefix_index]) + else: + # When the suffix overlaps the prefix, the old "parent link" was removed + # earlier in the "find_suffix_nodes_after_prefix" step. + self.suffix_overlaps_prefix = False + + return self._find_next_suffix_nodes + + def _find_next_suffix_node_at_prefix_end_after_fork(self): + """Find the next suffix node that replaces the end of the prefix. + + In this state, the prefix_end node is after the first fork node. + Therefore, even if a match is found, we don't want to modify the replaced + prefix node since an unrelated word chain uses it. + """ + existing_node = self.stack[self.prefix_index] + if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node): + return self._add_new_nodes + + self.prefix_index -= 1 + if self.prefix_index == self.first_fork_index: + return self._find_next_suffix_node_within_prefix_at_fork + else: + return self._find_next_suffix_nodes_within_prefix_after_fork + + def _find_next_suffix_node_within_prefix_at_fork(self): + """Find the next suffix node within a prefix. + + In this state, we've already worked our way back and found nodes in the suffix + to replace prefix nodes after the fork node. We have now reached the fork node, + and if we find a replacement for it, then we can remove the link between it + and our then-unique prefix and clear the fork status. + """ + existing_node = self.stack[self.prefix_index] + if not self.suffix_cursor.find_inside_of_prefix_replacement(existing_node): + return self._add_new_nodes + + self.prefix_index -= 1 + self.first_fork_index = None + + if not self.suffix_overlaps_prefix: + existing_node.remove_parent(self.stack[self.prefix_index]) + else: + # When the suffix overlaps the prefix, the old "parent link" was removed + # earlier in the "find_suffix_nodes_after_prefix" step. + self.suffix_overlaps_prefix = False + + return self._find_next_suffix_nodes + + def _find_next_suffix_nodes_within_prefix_after_fork(self): + """Find the next suffix nodes within a prefix. + + Finds suffix nodes to replace prefix nodes, but doesn't modify the prefix + nodes since they're after a fork (so, we're sharing prefix nodes with + other words and can't modify them). 
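+
+        The loop below runs until either no replacement is found (new nodes
+        are then added) or the cursor reaches the fork node itself.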
+        """
+        while True:
+            existing_node = self.stack[self.prefix_index]
+            if not self.suffix_cursor.find_inside_of_prefix_replacement(existing_node):
+                return self._add_new_nodes
+
+            self.prefix_index -= 1
+            if self.prefix_index == self.first_fork_index:
+                return self._find_next_suffix_node_within_prefix_at_fork
+
+    def _find_next_suffix_nodes(self):
+        """Find all remaining suffix nodes in the chain.
+
+        In this state, there is no longer any fork, so no other words are
+        using our current prefix. Therefore, as we find replacement nodes
+        while working our way backwards, we can remove the now-unused
+        prefix nodes.
+        """
+        while True:
+            existing_node = self.stack[self.prefix_index]
+            if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+                return self._add_new_nodes
+
+            # This prefix node is wholly replaced by the new suffix node, so it can
+            # be deleted.
+            existing_node.remove()
+            self.prefix_index -= 1
+
+    def _add_new_nodes(self):
+        """Adds new nodes to support the new word.
+
+        Duplicates forked nodes to make room for new links, adds new nodes for new
+        characters, and splices the prefix to the suffix to finish embedding the new
+        word into the DAFSA.
+        """
+        if self.first_fork_index is not None:
+            front_node = _duplicate_fork_nodes(
+                self.stack,
+                self.first_fork_index,
+                self.prefix_index,
+                # if suffix_overlaps_prefix, the parent link was removed
+                # earlier in the word-adding process.
+                remove_parent_link=not self.suffix_overlaps_prefix,
+            )
+        else:
+            front_node = self.stack[self.prefix_index]
+
+        new_text = self.word[self.prefix_index : self.suffix_cursor.index - 1]
+        for character in new_text:
+            new_node = Node(character)
+            front_node.add_child(new_node)
+            front_node = new_node
+
+        front_node.add_child(self.suffix_cursor.node)
+        return None  # Done!
+
+
+def _duplicate_fork_nodes(stack, fork_index, prefix_index, remove_parent_link=True):
+    parent_node = stack[fork_index - 1]
+    if remove_parent_link:
+        # remove link to old chain that we're going to be copying
+        stack[fork_index].remove_parent(parent_node)
+
+    for index in range(fork_index, prefix_index + 1):
+        fork_node = stack[index]
+        replacement_node = Node(fork_node.character)
+        child_to_avoid = None
+        if index < len(stack) - 1:
+            # We're going to be manually replacing the next node in the stack,
+            # so don't connect it as a child.
+            child_to_avoid = stack[index + 1]
+
+        replacement_node.copy_fork_node(fork_node, child_to_avoid)
+        parent_node.add_child(replacement_node)
+        parent_node = replacement_node
+
+    return parent_node
+
+
+class Dafsa:
+    root_node: Node
+    end_node: Node
+
+    def __init__(self):
+        self.root_node = Node(None, is_root_node=True)
+        self.end_node = Node(None, is_end_node=True)
+
+    @classmethod
+    def from_tld_data(cls, lines):
+        """Create a dafsa for TLD data.
+
+        TLD data has a domain and a "type" enum. The source data encodes the type as a
+        text number, but the dafsa-consuming code assumes that the type is a raw byte
+        number (e.g.: "1" => 0x01).
+
+        This function acts as a helper, processing this TLD detail before creating a
+        standard dafsa.
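+
+        For example, the entry "example.com1" is stored as "example.com"
+        followed by the raw byte 0x01.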
+        """
+
+        dafsa = cls()
+        for word in lines:
+            domain_number = word[-1]
+            # Convert type from string to byte representation
+            raw_domain_number = chr(ord(domain_number) & 0x0F)
+
+            word = "%s%s" % (word[:-1], raw_domain_number)
+            dafsa.append(word)
+        return dafsa
+
+    def append(self, word):
+        state_machine = DafsaAppendStateMachine(word, self.root_node, self.end_node)
+        state_machine.run()
diff --git a/xpcom/ds/tools/make_dafsa.py b/xpcom/ds/tools/make_dafsa.py
new file mode 100644
index 0000000000..a9cedf78a8
--- /dev/null
+++ b/xpcom/ds/tools/make_dafsa.py
@@ -0,0 +1,372 @@
+#!/usr/bin/env python
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+A deterministic acyclic finite state automaton (DAFSA) is a compact
+representation of an unordered word list (dictionary).
+
+http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
+
+This Python program fetches strings and return values from a gperf file and
+generates a C++ file with a byte array representing a graph that can be used
+as a memory-efficient replacement for a perfect hash table.
+
+The input strings are assumed to consist of printable 7-bit ASCII characters
+and the return values are assumed to be one-digit integers.
+
+In this program a DAFSA is a diamond-shaped graph starting at a common
+root node and ending at a common end node. All internal nodes contain
+a character and each word is represented by the characters in one path from
+the root node to the end node.
+
+The order of the operations is crucial since lookups will be performed
+starting from the source with no backtracking. Thus a node must have at
+most one child with a label starting with the same character. The output
+is also arranged so that all jumps are to increasing addresses, thus forward
+in memory.
+
+The generated output has suffix-free decoding, so the leading bits of a link
+(a reference to a child node) indicate whether it is one, two or three bytes
+long and whether it is the last outgoing link from the actual node.
+A node label is terminated by a byte with the leading bit set.
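+
+For example, under this scheme a label "foo" is encoded as the bytes
+0x66 0x6F 0xEF: the first two characters are stored verbatim and the final
+byte is 'o' (0x6F) with the leading bit set.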
+
+The generated byte array can be described by the following BNF:
+
+<byte> ::= < 8-bit value in range [0x00-0xFF] >
+
+<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
+<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
+<return_value> ::= < value + 0x80, byte in range [0x80-0x8F] >
+
+<offset1> ::= < byte in range [0x00-0x3F] >
+<offset2> ::= < byte in range [0x40-0x5F] >
+<offset3> ::= < byte in range [0x60-0x7F] >
+
+<end_offset1> ::= < byte in range [0x80-0xBF] >
+<end_offset2> ::= < byte in range [0xC0-0xDF] >
+<end_offset3> ::= < byte in range [0xE0-0xFF] >
+
+<prefix> ::= <char>
+
+<label> ::= <end_char>
+          | <char> <label>
+
+<end_label> ::= <return_value>
+              | <char> <end_label>
+
+<offset> ::= <offset1>
+           | <offset2> <byte>
+           | <offset3> <byte> <byte>
+
+<end_offset> ::= <end_offset1>
+               | <end_offset2> <byte>
+               | <end_offset3> <byte> <byte>
+
+<offsets> ::= <end_offset>
+            | <offset> <offsets>
+
+<source> ::= <offsets>
+
+<node> ::= <label> <offsets>
+         | <prefix> <node>
+         | <end_label>
+
+<dafsa> ::= <source>
+          | <dafsa> <node>
+
+Decoding:
+
+<char> -> printable 7-bit ASCII character
+<end_char> & 0x7F -> printable 7-bit ASCII character
+<return_value> & 0x0F -> integer
+<offset1> & 0x3F -> integer
+((<offset2> & 0x1F) << 8) + <byte> -> integer
+((<offset3> & 0x1F) << 16) + (<byte> << 8) + <byte> -> integer
+
+end_offset1, end_offset2 and end_offset3 are decoded the same way as offset1,
+offset2 and offset3 respectively.
+
+The first offset in a list of offsets is the distance in bytes between the
+offset itself and the first child node. Subsequent offsets are the distance
+between the previous child node and the next child node. Thus each offset
+links a node to a child node. The distance is always counted between start
+addresses, i.e. the first byte of the decoded offset or the first byte of
+the child node.
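+
+For example, an <end_offset1> byte 0x83 located at address 1, whose previous
+child starts at address 2, decodes to distance 3 (0x83 & 0x3F) and therefore
+links to the child at address 2 + 3 = 5; this is exactly byte 1 of Example 2
+below.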
+
+Example 1:
+
+%%
+aa, 1
+a, 2
+%%
+
+The input is first parsed to a list of words:
+["aa1", "a2"]
+
+This produces the following graph:
+[root] --- a --- 0x02 --- [end]
+            |            /
+            |           /
+             - a --- 0x01
+
+A C++ representation of the compressed graph is generated:
+
+const unsigned char dafsa[7] = {
+  0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
+};
+
+The bytes in the generated array have the following meaning:
+
+ 0: 0x81 <end_offset1>  child at position 0 + (0x81 & 0x3F) -> jump to 1
+
+ 1: 0xE1 <end_char>     label character (0xE1 & 0x7F) -> match "a"
+ 2: 0x02 <offset1>      child at position 2 + (0x02 & 0x3F) -> jump to 4
+
+ 3: 0x81 <end_offset1>  child at position 4 + (0x81 & 0x3F) -> jump to 5
+ 4: 0x82 <return_value> 0x82 & 0x0F -> return 2
+
+ 5: 0x61 <char>         label character 0x61 -> match "a"
+ 6: 0x81 <return_value> 0x81 & 0x0F -> return 1
+
+Example 2:
+
+%%
+aa, 1
+bbb, 2
+baa, 1
+%%
+
+The input is first parsed to a list of words:
+["aa1", "bbb2", "baa1"]
+
+This produces the following graph:
+[root] --- a --- a --- 0x01 --- [end]
+  |         /     /           /
+  |        /     /           /
+   - b --- b --- b --- 0x02
+
+A C++ representation of the compressed graph is generated:
+
+const unsigned char dafsa[11] = {
+  0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
+};
+
+The bytes in the generated array have the following meaning:
+
+ 0: 0x02 <offset1>      child at position 0 + (0x02 & 0x3F) -> jump to 2
+ 1: 0x83 <end_offset1>  child at position 2 + (0x83 & 0x3F) -> jump to 5
+
+ 2: 0xE2 <end_char>     label character (0xE2 & 0x7F) -> match "b"
+ 3: 0x02 <offset1>      child at position 3 + (0x02 & 0x3F) -> jump to 5
+ 4: 0x83 <end_offset1>  child at position 5 + (0x83 & 0x3F) -> jump to 8
+
+ 5: 0x61 <char>         label character 0x61 -> match "a"
+ 6: 0x61 <char>         label character 0x61 -> match "a"
+ 7: 0x81 <return_value> 0x81 & 0x0F -> return 1
+
+ 8: 0x62 <char>         label character 0x62 -> match "b"
+ 9: 0x62 <char>         label character 0x62 -> match "b"
+10: 0x82 <return_value> 0x82 & 0x0F -> return 2
+"""
+import struct
+import sys
+
+from incremental_dafsa import Dafsa, Node
+
+
+class InputError(Exception):
+    """Exception raised for errors in the input file."""
+
+
+def top_sort(dafsa: Dafsa):
+    """Generates list of nodes in topological sort order."""
+    incoming = {}
+
+    def count_incoming(node: Node):
+        """Counts incoming references."""
+        if not node.is_end_node:
+            if id(node) not in incoming:
+                incoming[id(node)] = 1
+                for child in node.children.values():
+                    count_incoming(child)
+            else:
+                incoming[id(node)] += 1
+
+    for node in dafsa.root_node.children.values():
+        count_incoming(node)
+
+    for node in dafsa.root_node.children.values():
+        incoming[id(node)] -= 1
+
+    waiting = [
+        node for node in dafsa.root_node.children.values() if incoming[id(node)] == 0
+    ]
+    nodes = []
+
+    while waiting:
+        node = waiting.pop()
+        assert incoming[id(node)] == 0
+        nodes.append(node)
+        for child in node.children.values():
+            if not child.is_end_node:
+                incoming[id(child)] -= 1
+                if incoming[id(child)] == 0:
+                    waiting.append(child)
+    return nodes
+
+
+def encode_links(node: Node, offsets, current):
+    """Encodes a list of children as one, two or three byte offsets."""
+    if next(iter(node.children.values())).is_end_node:
+        # This is an <end_label> node and no links follow such nodes
+        return []
+    guess = 3 * len(node.children)
+    assert node.children
+
+    children = sorted(node.children.values(), key=lambda x: -offsets[id(x)])
+    while True:
+        offset = current + guess
+        buf = []
+        for child in children:
+            last = len(buf)
+            distance = offset - offsets[id(child)]
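+            # A link must fit in the largest (three-byte) encoding below,
+            # which carries a 21-bit distance.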
+            assert distance > 0 and distance < (1 << 21)
+
+            if distance < (1 << 6):
+                # A 6-bit offset: "s0xxxxxx"
+                buf.append(distance)
+            elif distance < (1 << 13):
+                # A 13-bit offset: "s10xxxxxxxxxxxxx"
+                buf.append(0x40 | (distance >> 8))
+                buf.append(distance & 0xFF)
+            else:
+                # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
+                buf.append(0x60 | (distance >> 16))
+                buf.append((distance >> 8) & 0xFF)
+                buf.append(distance & 0xFF)
+            # The distance in the first link is relative to the following
+            # record; distances in subsequent links are relative to the
+            # previous link.
+            offset -= distance
+        if len(buf) == guess:
+            break
+        guess = len(buf)
+    # Set most significant bit to mark end of links in this node.
+    buf[last] |= 1 << 7
+    buf.reverse()
+    return buf
+
+
+def encode_prefix(label):
+    """Encodes a node label as a list of bytes without a trailing high byte.
+
+    This method encodes a node if there is exactly one child and the
+    child follows immediately after so that no jump is needed. This label
+    will then be a prefix to the label in the child node.
+    """
+    assert label
+    return [ord(c) for c in reversed(label)]
+
+
+def encode_label(label):
+    """Encodes a node label as a list of bytes with a trailing high byte >0x80."""
+    buf = encode_prefix(label)
+    # Set most significant bit to mark end of label in this node.
+    buf[0] |= 1 << 7
+    return buf
+
+
+def encode(dafsa: Dafsa):
+    """Encodes a DAFSA to a list of bytes."""
+    output = []
+    offsets = {}
+
+    for node in reversed(top_sort(dafsa)):
+        if (
+            len(node.children) == 1
+            and not next(iter(node.children.values())).is_end_node
+            and (offsets[id(next(iter(node.children.values())))] == len(output))
+        ):
+            output.extend(encode_prefix(node.character))
+        else:
+            output.extend(encode_links(node, offsets, len(output)))
+            output.extend(encode_label(node.character))
+        offsets[id(node)] = len(output)
+
+    output.extend(encode_links(dafsa.root_node, offsets, len(output)))
+    output.reverse()
+    return output
+
+
+def to_cxx(data, preamble=None):
+    """Generates C++ code from a list of encoded bytes."""
+    text = "/* This file is generated. DO NOT EDIT!\n\n"
+    text += "The byte array encodes a dictionary of strings and values. See "
+    text += "make_dafsa.py for documentation."
+    text += "*/\n\n"
+
+    if preamble:
+        text += preamble
+        text += "\n\n"
+
+    text += "const unsigned char kDafsa[%s] = {\n" % len(data)
+    for i in range(0, len(data), 12):
+        text += "  "
+        text += ", ".join("0x%02x" % byte for byte in data[i : i + 12])
+        text += ",\n"
+    text += "};\n"
+    return text
+
+
+def words_to_cxx(words, preamble=None):
+    """Generates C++ code from a word list."""
+    dafsa = Dafsa.from_tld_data(words)
+    return to_cxx(encode(dafsa), preamble)
+
+
+def words_to_bin(words):
+    """Generates bytes from a word list."""
+    dafsa = Dafsa.from_tld_data(words)
+    data = encode(dafsa)
+    return struct.pack("%dB" % len(data), *data)
+
+
+def parse_gperf(infile):
+    """Parses a gperf file and extracts strings and return values."""
+    lines = [line.strip() for line in infile]
+
+    # Extract the preamble.
+    first_delimiter = lines.index("%%")
+    preamble = "\n".join(lines[0:first_delimiter])
+
+    # Extract strings after the first '%%' and before the second '%%'.
+    begin = first_delimiter + 1
+    end = lines.index("%%", begin)
+    lines = lines[begin:end]
+    for line in lines:
+        if line[-3:-1] != ", ":
+            raise InputError('Expected "domainname, <digit>", found "%s"' % line)
+        # Technically the DAFSA format could support return values in range [0-31],
+        # but the values below are the only ones with a defined meaning.
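+        # (The [0-31] headroom follows from the byte layout: <end_char> bytes
+        # only start at 0xA0, leaving 0x80-0x9F, i.e. 32 values, for returns.)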
+ if line[-1] not in "0124": + raise InputError( + 'Expected value to be one of {0,1,2,4}, found "%s"' % line[-1] + ) + return (preamble, [line[:-3] + line[-1] for line in lines]) + + +def main(outfile, infile): + with open(infile, "r") as infile: + preamble, words = parse_gperf(infile) + outfile.write(words_to_cxx(words, preamble)) + return 0 + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("usage: %s infile outfile" % sys.argv[0]) + sys.exit(1) + + with open(sys.argv[2], "w") as outfile: + sys.exit(main(outfile, sys.argv[1])) diff --git a/xpcom/ds/tools/perfecthash.py b/xpcom/ds/tools/perfecthash.py new file mode 100644 index 0000000000..929b643a30 --- /dev/null +++ b/xpcom/ds/tools/perfecthash.py @@ -0,0 +1,377 @@ +# perfecthash.py - Helper for generating perfect hash functions +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +A perfect hash function (PHF) is a function which maps distinct elements from a +source set to a sequence of integers with no collisions. + +PHFs generated by this module use a 32-bit FNV hash to index an intermediate +table. The value from that table is then used to run a second 32-bit FNV hash to +get a final index. + +The algorithm works by starting with the largest set of conflicts, and testing +intermediate table values until no conflicts are generated, allowing efficient +index lookup at runtime. (see also: http://stevehanov.ca/blog/index.php?id=119) + +NOTE: This perfect hash generator was designed to be simple, easy to follow, and +maintainable, rather than being as optimized as possible, due to the relatively +small dataset it was designed for. In the future we may want to optimize further. +""" + +import textwrap +from collections import namedtuple + +import six +from mozbuild.util import ensure_bytes + + +# Iteration over bytestrings works differently in Python 2 and 3; this function +# captures the two possibilities. Returns an 'int' given the output of iterating +# through a bytestring regardless of the input. +def _ord(c): + if six.PY3: + return c + return ord(c) + + +class PerfectHash(object): + """PerfectHash objects represent a computed perfect hash function, which + can be generated at compile time to provide highly efficient and compact + static HashTables. + + Consumers must provide an intermediate table size to store the generated + hash function. Larger tables will generate more quickly, while smaller + tables will consume less space on disk.""" + + # 32-bit FNV offset basis and prime value. + # NOTE: Must match values in |PerfectHash.h| + FNV_OFFSET_BASIS = 0x811C9DC5 + FNV_PRIME = 16777619 + U32_MAX = 0xFFFFFFFF + + # Bucket of entries which map into a given intermediate index. + Bucket = namedtuple("Bucket", "index entries") + + def __init__(self, entries, size, validate=True, key=lambda e: e[0]): + """Create a new PerfectHash + + @param entries set of entries to generate a PHF for + @param size size of the PHF intermediate table + @param validate test the generated hash function after generation + @param key function to get 'memoryview'-compatible key for an + entry. defaults to extracting the first element of an + entry 2-tuple. + """ + + assert 0 < len(entries) < self.U32_MAX, "bad # of entries!" + self._key = key + + # Allocate the intermediate table and keys. + self.table = [0] * size + self.entries = [None] * len(entries) + + # A set of Bucket values. 
Each bucket contains the index into the + # intermediate table, and a list of Entries which map into that bucket. + buckets = [self.Bucket(idx, []) for idx in range(size)] + + # Determine which input strings map to which buckets in the table. + for entry in entries: + assert entry is not None, "None entry in entries" + assert self.key(entry).itemsize == 1, "non-byte key elements" + buckets[self._hash(self.key(entry)) % size].entries.append(entry) + + # Sort the buckets such that the largest one comes first. + buckets.sort(key=lambda b: len(b.entries), reverse=True) + + for bucket in buckets: + # Once we've reached an empty bucket, we're done. + if len(bucket.entries) == 0: + break + + # Try values for the basis until we find one with no conflicts. + idx = 0 + basis = 1 + slots = [] + while idx < len(bucket.entries): + slot = self._hash(self.key(bucket.entries[idx]), basis) % len( + self.entries + ) + if self.entries[slot] is not None or slot in slots: + # There was a conflict, try the next basis. + basis += 1 + idx = 0 + del slots[:] + assert basis < self.U32_MAX, "table too small" + else: + slots.append(slot) + idx += 1 + + assert basis < self.U32_MAX + + # We've found a basis which doesn't conflict + self.table[bucket.index] = basis + for slot, entry in zip(slots, bucket.entries): + self.entries[slot] = entry + + # Validate that looking up each key succeeds + if validate: + for entry in entries: + assert self.get_entry(self.key(entry)), "get_entry(%s)" % repr(entry) + + @classmethod + def _hash(cls, key, basis=FNV_OFFSET_BASIS): + """A basic FNV-based hash function. key is the memoryview to hash. + 32-bit FNV is used for indexing into the first table, and the value + stored in that table is used as the offset basis for indexing into the + values table.""" + for byte in memoryview(ensure_bytes(key)): + obyte = _ord(byte) + basis ^= obyte # xor-in the byte + basis *= cls.FNV_PRIME # Multiply by the FNV prime + basis &= cls.U32_MAX # clamp to 32-bits + return basis + + def key(self, entry): + return memoryview(ensure_bytes(self._key(entry))) + + def get_raw_index(self, key): + """Determine the index in self.entries without validating""" + mid = self.table[self._hash(key) % len(self.table)] + return self._hash(key, mid) % len(self.entries) + + def get_index(self, key): + """Given a key, determine the index in self.entries""" + idx = self.get_raw_index(key) + if memoryview(ensure_bytes(key)) != self.key(self.entries[idx]): + return None + return idx + + def get_entry(self, key): + """Given a key, get the corresponding entry""" + idx = self.get_index(key) + if idx is None: + return None + return self.entries[idx] + + @staticmethod + def _indent(text, amount=1): + return text.replace("\n", "\n" + (" " * amount)) + + def codegen(self, name, entry_type): + """Create a helper for codegening PHF logic""" + return CGHelper(self, name, entry_type) + + def cxx_codegen( + self, + name, + entry_type, + lower_entry, + entries_name=None, + return_type=None, + return_entry="return entry;", + key_type="const char*", + key_bytes="aKey", + key_length="strlen(aKey)", + ): + """Generate complete C++ code for a get_entry-style method. + + @param name Name for the entry getter function. + @param entry_type C++ type of each entry in static memory. + @param lower_entry Function called with each entry to convert it to a + C++ literal of type 'entry_type'. + + # Optional Keyword Parameters: + @param entries_name Name for the generated entry table. 
+ + @param return_type C++ return type, default: 'const entry_type&' + @param return_entry Override the default behaviour for returning the + found entry. 'entry' is a reference to the found + entry, and 'aKey' is the lookup key. 'return_entry' + can be used for additional checks, e.g. for keys + not in the table. + + @param key_type C++ key type, default: 'const char*' + @param key_bytes 'const char*' expression to get bytes for 'aKey' + @param key_length 'size_t' expression to get length of 'aKey'""" + + if entries_name is None: + entries_name = "s%sEntries" % name + + cg = self.codegen(entries_name, entry_type) + entries = cg.gen_entries(lower_entry) + getter = cg.gen_getter( + name, return_type, return_entry, key_type, key_bytes, key_length + ) + + return "%s\n\n%s" % (entries, getter) + + +class CGHelper(object): + """Helper object for generating C++ code for a PerfectHash. + Created using PerfectHash.codegen().""" + + def __init__(self, phf, entries_name, entry_type): + self.phf = phf + self.entries_name = entries_name + self.entry_type = entry_type + + @staticmethod + def _indent(text, amount=1): + return text.replace("\n", "\n" + (" " * amount)) + + def basis_ty(self): + """Determine how big of an integer is needed for bases table.""" + max_base = max(self.phf.table) + if max_base <= 0xFF: + return "uint8_t" + elif max_base <= 0xFFFF: + return "uint16_t" + return "uint32_t" + + def basis_table(self, name="BASES"): + """Generate code for a static basis table""" + bases = "" + for idx, base in enumerate(self.phf.table): + if idx and idx % 16 == 0: # 16 bases per line + bases += "\n " + bases += "%4d," % base + return ( + textwrap.dedent( + """\ + static const %s %s[] = { + %s + }; + """ + ) + % (self.basis_ty(), name, bases) + ) + + def gen_entries(self, lower_entry): + """Generate code for an entries table""" + entries = self._indent( + ",\n".join(lower_entry(entry).rstrip() for entry in self.phf.entries) + ) + return ( + textwrap.dedent( + """\ + const %s %s[] = { + %s + }; + """ + ) + % (self.entry_type, self.entries_name, entries) + ) + + def gen_getter( + self, + name, + return_type=None, + return_entry="return entry;", + key_type="const char*", + key_bytes="aKey", + key_length="strlen(aKey)", + ): + """Generate the code for a C++ getter. + + @param name Name for the entry getter function. + @param return_type C++ return type, default: 'const entry_type&' + @param return_entry Override the default behaviour for returning the + found entry. 'entry' is a reference to the found + entry, and 'aKey' is the lookup key. 'return_entry' + can be used for additional checks, e.g. for keys + not in the table. 
+ + @param key_type C++ key type, default: 'const char*' + @param key_bytes 'const char*' expression to get bytes for 'aKey' + @param key_length 'size_t' expression to get length of 'aKey'""" + + if return_type is None: + return_type = "const %s&" % self.entry_type + + return ( + textwrap.dedent( + """ + %(return_type)s + %(name)s(%(key_type)s aKey) + { + %(basis_table)s + + const char* bytes = %(key_bytes)s; + size_t length = %(key_length)s; + auto& entry = mozilla::perfecthash::Lookup(bytes, length, BASES, + %(entries_name)s); + %(return_entry)s + } + """ + ) + % { + "name": name, + "basis_table": self._indent(self.basis_table()), + "entries_name": self.entries_name, + "return_type": return_type, + "return_entry": self._indent(return_entry), + "key_type": key_type, + "key_bytes": key_bytes, + "key_length": key_length, + } + ) + + def gen_jslinearstr_getter( + self, name, return_type=None, return_entry="return entry;" + ): + """Generate code for a specialized getter taking JSLinearStrings. + This getter avoids copying the JS string, but only supports ASCII keys. + + @param name Name for the entry getter function. + @param return_type C++ return type, default: 'const entry_type&' + @param return_entry Override the default behaviour for returning the + found entry. 'entry' is a reference to the found + entry, and 'aKey' is the lookup key. 'return_entry' + can be used for additional checks, e.g. for keys + not in the table.""" + + assert all( + _ord(b) <= 0x7F for e in self.phf.entries for b in self.phf.key(e) + ), "non-ASCII key" + + if return_type is None: + return_type = "const %s&" % self.entry_type + + return ( + textwrap.dedent( + """ + %(return_type)s + %(name)s(JSLinearString* aKey) + { + %(basis_table)s + + size_t length = JS::GetLinearStringLength(aKey); + + JS::AutoCheckCannotGC nogc; + if (JS::LinearStringHasLatin1Chars(aKey)) { + auto& entry = mozilla::perfecthash::Lookup( + JS::GetLatin1LinearStringChars(nogc, aKey), + length, BASES, %(entries_name)s); + + %(return_entry)s + } else { + auto& entry = mozilla::perfecthash::Lookup( + JS::GetTwoByteLinearStringChars(nogc, aKey), + length, BASES, %(entries_name)s); + + %(return_entry)s + } + } + """ + ) + % { + "name": name, + "basis_table": self._indent(self.basis_table()), + "entries_name": self.entries_name, + "return_type": return_type, + "return_entry": self._indent(return_entry, 2), + } + ) |
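As with the DAFSA tools, the PerfectHash class can be exercised directly. A minimal usage sketch, assuming six and mozbuild.util are importable; the entries, table size, and generated-function names below are made up for illustration:

    from perfecthash import PerfectHash

    # (key, value) entries; the default key function takes element 0.
    entries = [(b"foo", 1), (b"bar", 2), (b"baz", 3)]
    phf = PerfectHash(entries, size=4)

    # Lookups run the same two-step FNV hash the generated C++ will use.
    assert phf.get_entry(b"bar") == (b"bar", 2)

    # Emit a C++ entry table plus getter (names here are hypothetical).
    print(phf.cxx_codegen(name="GetEntry", entry_type="int",
                          lower_entry=lambda e: str(e[1])))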