summaryrefslogtreecommitdiffstats
path: root/xpcom/ds/tools/incremental_dafsa.py
diff options
context:
space:
mode:
Diffstat (limited to 'xpcom/ds/tools/incremental_dafsa.py')
-rw-r--r--xpcom/ds/tools/incremental_dafsa.py509
1 files changed, 509 insertions, 0 deletions
diff --git a/xpcom/ds/tools/incremental_dafsa.py b/xpcom/ds/tools/incremental_dafsa.py
new file mode 100644
index 0000000000..3c76109efe
--- /dev/null
+++ b/xpcom/ds/tools/incremental_dafsa.py
@@ -0,0 +1,509 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+Incremental algorithm for creating a "deterministic acyclic finite state
+automaton" (DAFSA). At the time of writing this algorithm, there was existing logic
+that depended on a different format for the DAFSA, so this contains convenience
+functions for converting to a compatible structure. This legacy format is defined
+in make_dafsa.py.
+"""
+
+from typing import List, Dict, Optional, Callable
+
+
+class Node:
+ children: Dict[str, "Node"]
+ parents: Dict[str, List["Node"]]
+ character: str
+ is_root_node: bool
+ is_end_node: bool
+
+ def __init__(self, character, is_root_node=False, is_end_node=False):
+ self.children = {}
+ self.parents = {}
+ self.character = character
+ self.is_root_node = is_root_node
+ self.is_end_node = is_end_node
+
+ def __str__(self):
+ """Produce a helpful string representation of this node.
+
+ This is expected to only be used for debugging.
+ The produced output is:
+
+ "c[def.] <123>"
+ ^ ^ ^
+ | | Internal python ID of the node (used for de-duping)
+ | |
+ | One possible path through the tree to the end
+ |
+ Current node character
+ """
+
+ if self.is_root_node:
+ return "<root>"
+ elif self.is_end_node:
+ return "<end>"
+
+ first_potential_match = ""
+ node = self
+ while node.children:
+ first_character = next(iter(node.children))
+ if first_character:
+ first_potential_match += first_character
+ node = node.children[first_character]
+
+ return "%s[%s] <%d>" % (self.character, first_potential_match, id(self))
+
+ def add_child(self, child):
+ self.children[child.character] = child
+ child.parents.setdefault(self.character, [])
+ child.parents[self.character].append(self)
+
+ def remove(self):
+ # remove() must only be called when this node has only a single parent, and that
+ # parent doesn't need this child anymore.
+ # The caller is expected to have performed this validation.
+ # (placing asserts here add a non-trivial performance hit)
+
+ # There's only a single parent, so only one list should be in the "parents" map
+ parent_list = next(iter(self.parents.values()))
+ self.remove_parent(parent_list[0])
+ for child in list(self.children.values()):
+ child.remove_parent(self)
+
+ def remove_parent(self, parent_node: "Node"):
+ parent_node.children.pop(self.character)
+ parents_for_character = self.parents[parent_node.character]
+ parents_for_character.remove(parent_node)
+ if not parents_for_character:
+ self.parents.pop(parent_node.character)
+
+ def copy_fork_node(self, fork_node: "Node", child_to_avoid: Optional["Node"]):
+ """Shallow-copy a node's children.
+
+ When adding a new word, sometimes previously-joined suffixes aren't perfect
+ matches any more. When this happens, some nodes need to be "copied" out.
+ For all non-end nodes, there's a child to avoid in the shallow-copy.
+ """
+
+ for child in fork_node.children.values():
+ if child is not child_to_avoid:
+ self.add_child(child)
+
+ def is_fork(self):
+ """Check if this node has multiple parents"""
+
+ if len(self.parents) == 0:
+ return False
+
+ if len(self.parents) > 1:
+ return True
+
+ return len(next(iter(self.parents.values()))) > 1
+
+ def is_replacement_for_prefix_end_node(self, old: "Node"):
+ """Check if this node is a valid replacement for an old end node.
+
+ A node is a valid replacement if it maintains all existing child paths while
+ adding the new child path needed for the new word.
+
+ Args:
+ old: node being replaced
+
+ Returns: True if this node is a valid replacement node.
+ """
+
+ if len(self.children) != len(old.children) + 1:
+ return False
+
+ for character, other_node in old.children.items():
+ this_node = self.children.get(character)
+ if other_node is not this_node:
+ return False
+
+ return True
+
+ def is_replacement_for_prefix_node(self, old: "Node"):
+ """Check if this node is a valid replacement for a non-end node.
+
+ A node is a valid replacement if it:
+ * Has one new child that the old node doesn't
+ * Is missing a child that the old node has
+ * Shares all other children
+
+ Returns: True if this node is a valid replacement node.
+ """
+
+ if len(self.children) != len(old.children):
+ return False
+
+ found_extra_child = False
+
+ for character, other_node in old.children.items():
+ this_node = self.children.get(character)
+ if other_node is not this_node:
+ if found_extra_child:
+ # Found two children in the old node that aren't in the new one,
+ # this isn't a valid replacement
+ return False
+ else:
+ found_extra_child = True
+
+ return found_extra_child
+
+
+class SuffixCursor:
+ index: int # Current position of the cursor within the DAFSA.
+ node: Node
+
+ def __init__(self, index, node):
+ self.index = index
+ self.node = node
+
+ def _query(self, character: str, check: Callable[[Node], bool]):
+ for node in self.node.parents.get(character, []):
+ if check(node):
+ self.index -= 1
+ self.node = node
+ return True
+ return False
+
+ def find_single_child(self, character):
+ """Find the next matching suffix node that has a single child.
+
+ Return True if such a node is found."""
+ return self._query(character, lambda node: len(node.children) == 1)
+
+ def find_end_of_prefix_replacement(self, end_of_prefix: Node):
+ """Find the next matching suffix node that replaces the old prefix-end node.
+
+ Return True if such a node is found."""
+ return self._query(
+ end_of_prefix.character,
+ lambda node: node.is_replacement_for_prefix_end_node(end_of_prefix),
+ )
+
+ def find_inside_of_prefix_replacement(self, prefix_node: Node):
+ """Find the next matching suffix node that replaces a node within the prefix.
+
+ Return True if such a node is found."""
+ return self._query(
+ prefix_node.character,
+ lambda node: node.is_replacement_for_prefix_node(prefix_node),
+ )
+
+
+class DafsaAppendStateMachine:
+ """State machine for adding a word to a Dafsa.
+
+ Each state returns a function reference to the "next state". States should be
+ invoked until "None" is returned, in which case the new word has been appended.
+
+ The prefix and suffix indexes are placed according to the currently-known valid
+ value (not the next value being investigated). Additionally, they are 0-indexed
+ against the root node (which sits behind the beginning of the string).
+
+ Let's imagine we're at the following state when adding, for example, the
+ word "mozilla.org":
+
+ mozilla.org
+ ^ ^ ^ ^
+ | | | |
+ / | | \
+ [root] | | [end] node
+ node | \
+ | suffix
+ \
+ prefix
+
+ In this state, the furthest prefix match we could find was:
+ [root] - m - o - z - i - l
+ The index of the prefix match is "5".
+
+ Additionally, we've been looking for suffix nodes, and we've already found:
+ r - g - [end]
+ The current suffix index is "10".
+ The next suffix node we'll attempt to find is at index "9".
+ """
+
+ root_node: Node
+ prefix_index: int
+ suffix_cursor: SuffixCursor
+ stack: List[Node]
+ word: str
+ suffix_overlaps_prefix: bool
+ first_fork_index: Optional[int]
+ _state: Callable
+
+ def __init__(self, word, root_node, end_node):
+ self.root_node = root_node
+ self.prefix_index = 0
+ self.suffix_cursor = SuffixCursor(len(word) + 1, end_node)
+ self.stack = [root_node]
+ self.word = word
+ self.suffix_overlaps_prefix = False
+ self.first_fork_index = None
+ self._state = self._find_prefix
+
+ def run(self):
+ """Run this state machine to completion, adding the new word."""
+ while self._state is not None:
+ self._state = self._state()
+
+ def _find_prefix(self):
+ """Find the longest existing prefix that matches the current word."""
+ prefix_node = self.root_node
+ while self.prefix_index < len(self.word):
+ next_character = self.word[self.prefix_index]
+ next_node = prefix_node.children.get(next_character)
+ if not next_node:
+ # We're finished finding the prefix, let's find the longest suffix
+ # match now.
+ return self._find_suffix_nodes_after_prefix
+
+ self.prefix_index += 1
+ prefix_node = next_node
+ self.stack.append(next_node)
+
+ if not self.first_fork_index and next_node.is_fork():
+ self.first_fork_index = self.prefix_index
+
+ # Deja vu, we've appended this string before. Since this string has
+ # already been appended, we don't have to do anything.
+ return None
+
+ def _find_suffix_nodes_after_prefix(self):
+ """Find the chain of suffix nodes for characters after the prefix."""
+ while self.suffix_cursor.index - 1 > self.prefix_index:
+ # To fetch the next character, we need to subtract two from the current
+ # suffix index. This is because:
+ # * The next suffix node is 1 node before our current node (subtract 1)
+ # * The suffix index includes the root node before the beginning of the
+ # string - it's like the string is 1-indexed (subtract 1 again).
+ next_character = self.word[self.suffix_cursor.index - 2]
+ if not self.suffix_cursor.find_single_child(next_character):
+ return self._add_new_nodes
+
+ if self.suffix_cursor.node is self.stack[-1]:
+ # The suffix match is overlapping with the prefix! This can happen in
+ # cases like:
+ # * "ab"
+ # * "abb"
+ # The suffix cursor is at the same node as the prefix match, but they're
+ # at different positions in the word.
+ #
+ # [root] - a - b - [end]
+ # ^
+ # / \
+ # / \
+ # prefix suffix
+ # \ /
+ # \ /
+ # VV
+ # "abb"
+ if not self.first_fork_index:
+ # There hasn't been a fork, so our prefix isn't shared. So, we
+ # can mark this node as a fork, since the repetition means
+ # that there's two paths that are now using this node
+ self.first_fork_index = self.prefix_index
+ return self._add_new_nodes
+
+ # Removes the link between the unique part of the prefix and the
+ # shared part of the prefix.
+ self.stack[self.first_fork_index].remove_parent(
+ self.stack[self.first_fork_index - 1]
+ )
+ self.suffix_overlaps_prefix = True
+
+ if self.first_fork_index is None:
+ return self._find_next_suffix_nodes
+ elif self.suffix_cursor.index - 1 == self.first_fork_index:
+ return self._find_next_suffix_node_at_prefix_end_at_fork
+ else:
+ return self._find_next_suffix_node_at_prefix_end_after_fork
+
+ def _find_next_suffix_node_at_prefix_end_at_fork(self):
+ """Find the next suffix node that replaces the end of the prefix.
+
+ In this state, the prefix_end node is the same as the first fork node.
+ Therefore, if a match can be found, the old prefix node can't be entirely
+ deleted since it's used elsewhere. Instead, just the link between our
+ unique prefix and the end of the fork is removed.
+ """
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ self.first_fork_index = None
+
+ if not self.suffix_overlaps_prefix:
+ existing_node.remove_parent(self.stack[self.prefix_index])
+ else:
+ # When the suffix overlaps the prefix, the old "parent link" was removed
+ # earlier in the "find_suffix_nodes_after_prefix" step.
+ self.suffix_overlaps_prefix = False
+
+ return self._find_next_suffix_nodes
+
+ def _find_next_suffix_node_at_prefix_end_after_fork(self):
+ """Find the next suffix node that replaces the end of the prefix.
+
+ In this state, the prefix_end node is after the first fork node.
+ Therefore, even if a match is found, we don't want to modify the replaced
+ prefix node since an unrelated word chain uses it.
+ """
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ if self.prefix_index == self.first_fork_index:
+ return self._find_next_suffix_node_within_prefix_at_fork
+ else:
+ return self._find_next_suffix_nodes_within_prefix_after_fork
+
+ def _find_next_suffix_node_within_prefix_at_fork(self):
+ """Find the next suffix node within a prefix.
+
+ In this state, we've already worked our way back and found nodes in the suffix
+ to replace prefix nodes after the fork node. We have now reached the fork node,
+ and if we find a replacement for it, then we can remove the link between it
+ and our then-unique prefix and clear the fork status.
+ """
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_inside_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ self.first_fork_index = None
+
+ if not self.suffix_overlaps_prefix:
+ existing_node.remove_parent(self.stack[self.prefix_index])
+ else:
+ # When the suffix overlaps the prefix, the old "parent link" was removed
+ # earlier in the "find_suffix_nodes_after_prefix" step.
+ self.suffix_overlaps_prefix = False
+
+ return self._find_next_suffix_nodes
+
+ def _find_next_suffix_nodes_within_prefix_after_fork(self):
+ """Find the next suffix nodes within a prefix.
+
+ Finds suffix nodes to replace prefix nodes, but doesn't modify the prefix
+ nodes since they're after a fork (so, we're sharing prefix nodes with
+ other words and can't modify them).
+ """
+ while True:
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_inside_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ if self.prefix_index == self.first_fork_index:
+ return self._find_next_suffix_node_within_prefix_at_fork
+
+ def _find_next_suffix_nodes(self):
+ """Find all remaining suffix nodes in the chain.
+
+ In this state, there's no (longer) any fork, so there's no other words
+ using our current prefix. Therefore, as we find replacement nodes as we
+ work our way backwards, we can remove the now-unused prefix nodes.
+ """
+ while True:
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ # This prefix node is wholly replaced by the new suffix node, so it can
+ # be deleted.
+ existing_node.remove()
+ self.prefix_index -= 1
+
+ def _add_new_nodes(self):
+ """Adds new nodes to support the new word.
+
+ Duplicates forked nodes to make room for new links, adds new nodes for new
+ characters, and splices the prefix to the suffix to finish embedding the new
+ word into the DAFSA.
+ """
+ if self.first_fork_index is not None:
+ front_node = _duplicate_fork_nodes(
+ self.stack,
+ self.first_fork_index,
+ self.prefix_index,
+ # if suffix_overlaps_parent, the parent link was removed
+ # earlier in the word-adding process.
+ remove_parent_link=not self.suffix_overlaps_prefix,
+ )
+ else:
+ front_node = self.stack[self.prefix_index]
+
+ new_text = self.word[self.prefix_index : self.suffix_cursor.index - 1]
+ for character in new_text:
+ new_node = Node(character)
+ front_node.add_child(new_node)
+ front_node = new_node
+
+ front_node.add_child(self.suffix_cursor.node)
+ return None # Done!
+
+
+def _duplicate_fork_nodes(stack, fork_index, prefix_index, remove_parent_link=True):
+ parent_node = stack[fork_index - 1]
+ if remove_parent_link:
+ # remove link to old chain that we're going to be copying
+ stack[fork_index].remove_parent(parent_node)
+
+ for index in range(fork_index, prefix_index + 1):
+ fork_node = stack[index]
+ replacement_node = Node(fork_node.character)
+ child_to_avoid = None
+ if index < len(stack) - 1:
+ # We're going to be manually replacing the next node in the stack,
+ # so don't connect it as a child.
+ child_to_avoid = stack[index + 1]
+
+ replacement_node.copy_fork_node(fork_node, child_to_avoid)
+ parent_node.add_child(replacement_node)
+ parent_node = replacement_node
+
+ return parent_node
+
+
+class Dafsa:
+ root_node: Node
+ end_node: Node
+
+ def __init__(self):
+ self.root_node = Node(None, is_root_node=True)
+ self.end_node = Node(None, is_end_node=True)
+
+ @classmethod
+ def from_tld_data(cls, lines):
+ """Create a dafsa for TLD data.
+
+ TLD data has a domain and a "type" enum. The source data encodes the type as a
+ text number, but the dafsa-consuming code assumes that the type is a raw byte
+ number (e.g.: "1" => 0x01).
+
+ This function acts as a helper, processing this TLD detail before creating a
+ standard dafsa.
+ """
+
+ dafsa = cls()
+ for i, word in enumerate(lines):
+ domain_number = word[-1]
+ # Convert type from string to byte representation
+ raw_domain_number = chr(ord(domain_number) & 0x0F)
+
+ word = "%s%s" % (word[:-1], raw_domain_number)
+ dafsa.append(word)
+ return dafsa
+
+ def append(self, word):
+ state_machine = DafsaAppendStateMachine(word, self.root_node, self.end_node)
+ state_machine.run()