author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
commit    36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree      105e8c98ddea1c1e4784a60a5a6410fa416be2de /xpcom/ds/tools
parent    Initial commit. (diff)
Adding upstream version 115.7.0esr. (upstream/115.7.0esr, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'xpcom/ds/tools')
-rw-r--r--  xpcom/ds/tools/incremental_dafsa.py  509
-rw-r--r--  xpcom/ds/tools/make_dafsa.py         372
-rw-r--r--  xpcom/ds/tools/perfecthash.py        371
3 files changed, 1252 insertions, 0 deletions
diff --git a/xpcom/ds/tools/incremental_dafsa.py b/xpcom/ds/tools/incremental_dafsa.py
new file mode 100644
index 0000000000..1157c26850
--- /dev/null
+++ b/xpcom/ds/tools/incremental_dafsa.py
@@ -0,0 +1,509 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+Incremental algorithm for creating a "deterministic acyclic finite state
+automaton" (DAFSA). At the time of writing this algorithm, there was existing logic
+that depended on a different format for the DAFSA, so this contains convenience
+functions for converting to a compatible structure. This legacy format is defined
+in make_dafsa.py.
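+
+A minimal usage sketch (illustrative; the trailing byte of each word is the
+return value, in the format described in make_dafsa.py):
+
+    dafsa = Dafsa()
+    dafsa.append("a\x01")
+    dafsa.append("aa\x01")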
+"""
+
+from typing import Callable, Dict, List, Optional
+
+
+class Node:
+ children: Dict[str, "Node"]
+ parents: Dict[str, List["Node"]]
+ character: str
+ is_root_node: bool
+ is_end_node: bool
+
+ def __init__(self, character, is_root_node=False, is_end_node=False):
+ self.children = {}
+ self.parents = {}
+ self.character = character
+ self.is_root_node = is_root_node
+ self.is_end_node = is_end_node
+
+ def __str__(self):
+ """Produce a helpful string representation of this node.
+
+ This is expected to only be used for debugging.
+ The produced output is:
+
+ "c[def.] <123>"
+ ^ ^ ^
+ | | Internal python ID of the node (used for de-duping)
+ | |
+ | One possible path through the tree to the end
+ |
+ Current node character
+ """
+
+ if self.is_root_node:
+ return "<root>"
+ elif self.is_end_node:
+ return "<end>"
+
+ first_potential_match = ""
+ node = self
+ while node.children:
+ first_character = next(iter(node.children))
+ if first_character:
+ first_potential_match += first_character
+ node = node.children[first_character]
+
+ return "%s[%s] <%d>" % (self.character, first_potential_match, id(self))
+
+ def add_child(self, child):
+ self.children[child.character] = child
+ child.parents.setdefault(self.character, [])
+ child.parents[self.character].append(self)
+
+ def remove(self):
+ # remove() must only be called when this node has only a single parent, and that
+ # parent doesn't need this child anymore.
+ # The caller is expected to have performed this validation.
+ # (placing asserts here adds a non-trivial performance hit)
+
+ # There's only a single parent, so only one list should be in the "parents" map
+ parent_list = next(iter(self.parents.values()))
+ self.remove_parent(parent_list[0])
+ for child in list(self.children.values()):
+ child.remove_parent(self)
+
+ def remove_parent(self, parent_node: "Node"):
+ parent_node.children.pop(self.character)
+ parents_for_character = self.parents[parent_node.character]
+ parents_for_character.remove(parent_node)
+ if not parents_for_character:
+ self.parents.pop(parent_node.character)
+
+ def copy_fork_node(self, fork_node: "Node", child_to_avoid: Optional["Node"]):
+ """Shallow-copy a node's children.
+
+ When adding a new word, sometimes previously-joined suffixes aren't perfect
+ matches any more. When this happens, some nodes need to be "copied" out.
+ For all non-end nodes, there's a child to avoid in the shallow-copy.
+ """
+
+ for child in fork_node.children.values():
+ if child is not child_to_avoid:
+ self.add_child(child)
+
+ def is_fork(self):
+ """Check if this node has multiple parents"""
+
+ if len(self.parents) == 0:
+ return False
+
+ if len(self.parents) > 1:
+ return True
+
+ return len(next(iter(self.parents.values()))) > 1
+
+ def is_replacement_for_prefix_end_node(self, old: "Node"):
+ """Check if this node is a valid replacement for an old end node.
+
+ A node is a valid replacement if it maintains all existing child paths while
+ adding the new child path needed for the new word.
+
+ Args:
+ old: node being replaced
+
+ Returns: True if this node is a valid replacement node.
+ """
+
+ if len(self.children) != len(old.children) + 1:
+ return False
+
+ for character, other_node in old.children.items():
+ this_node = self.children.get(character)
+ if other_node is not this_node:
+ return False
+
+ return True
+
+ def is_replacement_for_prefix_node(self, old: "Node"):
+ """Check if this node is a valid replacement for a non-end node.
+
+ A node is a valid replacement if it:
+ * Has one new child that the old node doesn't
+ * Is missing a child that the old node has
+ * Shares all other children
+
+ Returns: True if this node is a valid replacement node.
+ """
+
+ if len(self.children) != len(old.children):
+ return False
+
+ found_extra_child = False
+
+ for character, other_node in old.children.items():
+ this_node = self.children.get(character)
+ if other_node is not this_node:
+ if found_extra_child:
+ # Found two children in the old node that aren't in the new one,
+ # so this isn't a valid replacement.
+ return False
+ else:
+ found_extra_child = True
+
+ return found_extra_child
+
+
+class SuffixCursor:
+ index: int # Current position of the cursor within the DAFSA.
+ node: Node
+
+ def __init__(self, index, node):
+ self.index = index
+ self.node = node
+
+ def _query(self, character: str, check: Callable[[Node], bool]):
+ for node in self.node.parents.get(character, []):
+ if check(node):
+ self.index -= 1
+ self.node = node
+ return True
+ return False
+
+ def find_single_child(self, character):
+ """Find the next matching suffix node that has a single child.
+
+ Return True if such a node is found."""
+ return self._query(character, lambda node: len(node.children) == 1)
+
+ def find_end_of_prefix_replacement(self, end_of_prefix: Node):
+ """Find the next matching suffix node that replaces the old prefix-end node.
+
+ Return True if such a node is found."""
+ return self._query(
+ end_of_prefix.character,
+ lambda node: node.is_replacement_for_prefix_end_node(end_of_prefix),
+ )
+
+ def find_inside_of_prefix_replacement(self, prefix_node: Node):
+ """Find the next matching suffix node that replaces a node within the prefix.
+
+ Return True if such a node is found."""
+ return self._query(
+ prefix_node.character,
+ lambda node: node.is_replacement_for_prefix_node(prefix_node),
+ )
+
+
+class DafsaAppendStateMachine:
+ """State machine for adding a word to a Dafsa.
+
+ Each state returns a function reference to the "next state". States should be
+ invoked until "None" is returned, in which case the new word has been appended.
+
+ The prefix and suffix indexes are placed according to the currently-known valid
+ value (not the next value being investigated). Additionally, they are 0-indexed
+ against the root node (which sits behind the beginning of the string).
+
+ Let's imagine we're at the following state when adding, for example, the
+ word "mozilla.org":
+
+         mozilla.org
+        ^    ^    ^ ^
+        |    |    | |
+       /     |    |  \
+  [root]     |    |   [end] node
+   node      |     \
+             |      suffix
+              \
+               prefix
+
+ In this state, the furthest prefix match we could find was:
+ [root] - m - o - z - i - l
+ The index of the prefix match is "5".
+
+ Additionally, we've been looking for suffix nodes, and we've already found:
+ r - g - [end]
+ The current suffix index is "10".
+ The next suffix node we'll attempt to find is at index "9".
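+
+    The machine is driven to completion like this (as Dafsa.append does):
+
+        state_machine = DafsaAppendStateMachine(word, root_node, end_node)
+        state_machine.run()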
+ """
+
+ root_node: Node
+ prefix_index: int
+ suffix_cursor: SuffixCursor
+ stack: List[Node]
+ word: str
+ suffix_overlaps_prefix: bool
+ first_fork_index: Optional[int]
+ _state: Callable
+
+ def __init__(self, word, root_node, end_node):
+ self.root_node = root_node
+ self.prefix_index = 0
+ self.suffix_cursor = SuffixCursor(len(word) + 1, end_node)
+ self.stack = [root_node]
+ self.word = word
+ self.suffix_overlaps_prefix = False
+ self.first_fork_index = None
+ self._state = self._find_prefix
+
+ def run(self):
+ """Run this state machine to completion, adding the new word."""
+ while self._state is not None:
+ self._state = self._state()
+
+ def _find_prefix(self):
+ """Find the longest existing prefix that matches the current word."""
+ prefix_node = self.root_node
+ while self.prefix_index < len(self.word):
+ next_character = self.word[self.prefix_index]
+ next_node = prefix_node.children.get(next_character)
+ if not next_node:
+ # We're finished finding the prefix, let's find the longest suffix
+ # match now.
+ return self._find_suffix_nodes_after_prefix
+
+ self.prefix_index += 1
+ prefix_node = next_node
+ self.stack.append(next_node)
+
+ if not self.first_fork_index and next_node.is_fork():
+ self.first_fork_index = self.prefix_index
+
+ # Deja vu, we've appended this string before. Since this string has
+ # already been appended, we don't have to do anything.
+ return None
+
+ def _find_suffix_nodes_after_prefix(self):
+ """Find the chain of suffix nodes for characters after the prefix."""
+ while self.suffix_cursor.index - 1 > self.prefix_index:
+ # To fetch the next character, we need to subtract two from the current
+ # suffix index. This is because:
+ # * The next suffix node is 1 node before our current node (subtract 1)
+ # * The suffix index includes the root node before the beginning of the
+ # string - it's like the string is 1-indexed (subtract 1 again).
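+ # For example, in the "mozilla.org" scenario from the class docstring,
+ # with the suffix cursor at index 10 ("r"), the next character fetched
+ # is word[10 - 2] == "o", the node at index 9.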
+ next_character = self.word[self.suffix_cursor.index - 2]
+ if not self.suffix_cursor.find_single_child(next_character):
+ return self._add_new_nodes
+
+ if self.suffix_cursor.node is self.stack[-1]:
+ # The suffix match is overlapping with the prefix! This can happen in
+ # cases like:
+ # * "ab"
+ # * "abb"
+ # The suffix cursor is at the same node as the prefix match, but they're
+ # at different positions in the word.
+ #
+ #     [root] - a - b - [end]
+ #                  ^
+ #                 / \
+ #                /   \
+ #          prefix    suffix
+ #                \   /
+ #                 \ /
+ #                  VV
+ #                "abb"
+ if not self.first_fork_index:
+ # There hasn't been a fork, so our prefix isn't shared. So, we
+ # can mark this node as a fork, since the repetition means
+ # that there are now two paths using this node.
+ self.first_fork_index = self.prefix_index
+ return self._add_new_nodes
+
+ # Removes the link between the unique part of the prefix and the
+ # shared part of the prefix.
+ self.stack[self.first_fork_index].remove_parent(
+ self.stack[self.first_fork_index - 1]
+ )
+ self.suffix_overlaps_prefix = True
+
+ if self.first_fork_index is None:
+ return self._find_next_suffix_nodes
+ elif self.suffix_cursor.index - 1 == self.first_fork_index:
+ return self._find_next_suffix_node_at_prefix_end_at_fork
+ else:
+ return self._find_next_suffix_node_at_prefix_end_after_fork
+
+ def _find_next_suffix_node_at_prefix_end_at_fork(self):
+ """Find the next suffix node that replaces the end of the prefix.
+
+ In this state, the prefix_end node is the same as the first fork node.
+ Therefore, if a match can be found, the old prefix node can't be entirely
+ deleted since it's used elsewhere. Instead, just the link between our
+ unique prefix and the end of the fork is removed.
+ """
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ self.first_fork_index = None
+
+ if not self.suffix_overlaps_prefix:
+ existing_node.remove_parent(self.stack[self.prefix_index])
+ else:
+ # When the suffix overlaps the prefix, the old "parent link" was removed
+ # earlier in the "find_suffix_nodes_after_prefix" step.
+ self.suffix_overlaps_prefix = False
+
+ return self._find_next_suffix_nodes
+
+ def _find_next_suffix_node_at_prefix_end_after_fork(self):
+ """Find the next suffix node that replaces the end of the prefix.
+
+ In this state, the prefix_end node is after the first fork node.
+ Therefore, even if a match is found, we don't want to modify the replaced
+ prefix node since an unrelated word chain uses it.
+ """
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ if self.prefix_index == self.first_fork_index:
+ return self._find_next_suffix_node_within_prefix_at_fork
+ else:
+ return self._find_next_suffix_nodes_within_prefix_after_fork
+
+ def _find_next_suffix_node_within_prefix_at_fork(self):
+ """Find the next suffix node within a prefix.
+
+ In this state, we've already worked our way back and found nodes in the suffix
+ to replace prefix nodes after the fork node. We have now reached the fork node,
+ and if we find a replacement for it, then we can remove the link between it
+ and our then-unique prefix and clear the fork status.
+ """
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_inside_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ self.first_fork_index = None
+
+ if not self.suffix_overlaps_prefix:
+ existing_node.remove_parent(self.stack[self.prefix_index])
+ else:
+ # When the suffix overlaps the prefix, the old "parent link" was removed
+ # earlier in the "find_suffix_nodes_after_prefix" step.
+ self.suffix_overlaps_prefix = False
+
+ return self._find_next_suffix_nodes
+
+ def _find_next_suffix_nodes_within_prefix_after_fork(self):
+ """Find the next suffix nodes within a prefix.
+
+ Finds suffix nodes to replace prefix nodes, but doesn't modify the prefix
+ nodes since they're after a fork (so, we're sharing prefix nodes with
+ other words and can't modify them).
+ """
+ while True:
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_inside_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ self.prefix_index -= 1
+ if self.prefix_index == self.first_fork_index:
+ return self._find_next_suffix_node_within_prefix_at_fork
+
+ def _find_next_suffix_nodes(self):
+ """Find all remaining suffix nodes in the chain.
+
+ In this state, there's no longer any fork, so no other words are
+ using our current prefix. Therefore, as we find replacement nodes while
+ working our way backwards, we can remove the now-unused prefix nodes.
+ """
+ while True:
+ existing_node = self.stack[self.prefix_index]
+ if not self.suffix_cursor.find_end_of_prefix_replacement(existing_node):
+ return self._add_new_nodes
+
+ # This prefix node is wholly replaced by the new suffix node, so it can
+ # be deleted.
+ existing_node.remove()
+ self.prefix_index -= 1
+
+ def _add_new_nodes(self):
+ """Adds new nodes to support the new word.
+
+ Duplicates forked nodes to make room for new links, adds new nodes for new
+ characters, and splices the prefix to the suffix to finish embedding the new
+ word into the DAFSA.
+ """
+ if self.first_fork_index is not None:
+ front_node = _duplicate_fork_nodes(
+ self.stack,
+ self.first_fork_index,
+ self.prefix_index,
+ # if suffix_overlaps_prefix, the parent link was removed
+ # earlier in the word-adding process.
+ remove_parent_link=not self.suffix_overlaps_prefix,
+ )
+ else:
+ front_node = self.stack[self.prefix_index]
+
+ new_text = self.word[self.prefix_index : self.suffix_cursor.index - 1]
+ for character in new_text:
+ new_node = Node(character)
+ front_node.add_child(new_node)
+ front_node = new_node
+
+ front_node.add_child(self.suffix_cursor.node)
+ return None # Done!
+
+
+def _duplicate_fork_nodes(stack, fork_index, prefix_index, remove_parent_link=True):
+ parent_node = stack[fork_index - 1]
+ if remove_parent_link:
+ # remove link to old chain that we're going to be copying
+ stack[fork_index].remove_parent(parent_node)
+
+ for index in range(fork_index, prefix_index + 1):
+ fork_node = stack[index]
+ replacement_node = Node(fork_node.character)
+ child_to_avoid = None
+ if index < len(stack) - 1:
+ # We're going to be manually replacing the next node in the stack,
+ # so don't connect it as a child.
+ child_to_avoid = stack[index + 1]
+
+ replacement_node.copy_fork_node(fork_node, child_to_avoid)
+ parent_node.add_child(replacement_node)
+ parent_node = replacement_node
+
+ return parent_node
+
+
+class Dafsa:
+ root_node: Node
+ end_node: Node
+
+ def __init__(self):
+ self.root_node = Node(None, is_root_node=True)
+ self.end_node = Node(None, is_end_node=True)
+
+ @classmethod
+ def from_tld_data(cls, lines):
+ """Create a dafsa for TLD data.
+
+ TLD data has a domain and a "type" enum. The source data encodes the type as a
+ text number, but the dafsa-consuming code assumes that the type is a raw byte
+ number (e.g.: "1" => 0x01).
+
+ This function acts as a helper, processing this TLD detail before creating a
+ standard dafsa.
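+
+        Illustrative sketch (hypothetical input lines):
+
+            dafsa = Dafsa.from_tld_data(["example.com1", "example.net2"])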
+ """
+
+ dafsa = cls()
+ for word in lines:
+ domain_number = word[-1]
+ # Convert type from string to byte representation
+ raw_domain_number = chr(ord(domain_number) & 0x0F)
+
+ word = "%s%s" % (word[:-1], raw_domain_number)
+ dafsa.append(word)
+ return dafsa
+
+ def append(self, word):
+ state_machine = DafsaAppendStateMachine(word, self.root_node, self.end_node)
+ state_machine.run()
diff --git a/xpcom/ds/tools/make_dafsa.py b/xpcom/ds/tools/make_dafsa.py
new file mode 100644
index 0000000000..a9cedf78a8
--- /dev/null
+++ b/xpcom/ds/tools/make_dafsa.py
@@ -0,0 +1,372 @@
+#!/usr/bin/env python
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+A deterministic acyclic finite state automaton (DAFSA) is a compact
+representation of an unordered word list (dictionary).
+
+http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
+
+This Python program converts a list of strings to a byte array in C++: it
+fetches strings and return values from a gperf file and generates a C++
+file with a byte array representing a graph that can be used as a
+memory-efficient replacement for a perfect hash table.
+
+The input strings are assumed to consist of printable 7-bit ASCII characters
+and the return values are assumed to be one-digit integers.
+
+In this program a DAFSA is a diamond-shaped graph starting at a common
+root node and ending at a common end node. All internal nodes contain
+a character and each word is represented by the characters in one path from
+the root node to the end node.
+
+The order of the operations is crucial since lookups will be performed
+starting from the source with no backtracking. Thus a node must have at
+most one child with a label starting with the same character. The output
+is also arranged so that all jumps are to increasing addresses, thus forward
+in memory.
+
+The generated output has suffix-free decoding, so that the leading bits of
+a link (a reference to a child node) indicate whether it has a size of one,
+two or three bytes and whether it is the last outgoing link from the current node.
+A node label is terminated by a byte with the leading bit set.
+
+The generated byte array can be described by the following BNF:
+
+<byte> ::= < 8-bit value in range [0x00-0xFF] >
+
+<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
+<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
+<return_value> ::= < value + 0x80, byte in range [0x80-0x8F] >
+
+<offset1> ::= < byte in range [0x00-0x3F] >
+<offset2> ::= < byte in range [0x40-0x5F] >
+<offset3> ::= < byte in range [0x60-0x7F] >
+
+<end_offset1> ::= < byte in range [0x80-0xBF] >
+<end_offset2> ::= < byte in range [0xC0-0xDF] >
+<end_offset3> ::= < byte in range [0xE0-0xFF] >
+
+<prefix> ::= <char>
+
+<label> ::= <end_char>
+ | <char> <label>
+
+<end_label> ::= <return_value>
+ | <char> <end_label>
+
+<offset> ::= <offset1>
+ | <offset2> <byte>
+ | <offset3> <byte> <byte>
+
+<end_offset> ::= <end_offset1>
+ | <end_offset2> <byte>
+ | <end_offset3> <byte> <byte>
+
+<offsets> ::= <end_offset>
+ | <offset> <offsets>
+
+<source> ::= <offsets>
+
+<node> ::= <label> <offsets>
+ | <prefix> <node>
+ | <end_label>
+
+<dafsa> ::= <source>
+ | <dafsa> <node>
+
+Decoding:
+
+<char> -> printable 7-bit ASCII character
+<end_char> & 0x7F -> printable 7-bit ASCII character
+<return_value> & 0x0F -> integer
+<offset1> & 0x3F -> integer
+((<offset2> & 0x1F) << 8) + <byte> -> integer
+((<offset3> & 0x1F) << 16) + (<byte> << 8) + <byte> -> integer
+
+end_offset1, end_offset2 and end_offset3 are decoded the same as offset1,
+offset2 and offset3, respectively.
+
+The first offset in a list of offsets is the distance in bytes between the
+offset itself and the first child node. Subsequent offsets are the distance
+between previous child node and next child node. Thus each offset links a node
+to a child node. The distance is always counted between start addresses, i.e.
+first byte in decoded offset or first byte in child node.
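+
+As an illustrative sketch (not code from this file), the two- and three-byte
+offsets above could be decoded as:
+
+    def decode_offset2(b0, b1):
+        # "<offset2> <byte>": drop the 3 marker bits, then combine
+        return ((b0 & 0x1F) << 8) + b1
+
+    def decode_offset3(b0, b1, b2):
+        # "<offset3> <byte> <byte>"
+        return ((b0 & 0x1F) << 16) + (b1 << 8) + b2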
+
+Example 1:
+
+%%
+aa, 1
+a, 2
+%%
+
+The input is first parsed to a list of words:
+["aa1", "a2"]
+
+This produces the following graph:
+[root] --- a --- 0x02 --- [end]
+           |             /
+           |            /
+            - a --- 0x01
+
+A C++ representation of the compressed graph is generated:
+
+const unsigned char dafsa[7] = {
+ 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
+};
+
+The bytes in the generated array have the following meaning:
+
+ 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
+
+ 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
+ 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
+
+ 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
+ 4: 0x82 <return_value> 0x82 & 0x0F -> return 2
+
+ 5: 0x61 <char> label character 0x61 -> match "a"
+ 6: 0x81 <return_value> 0x81 & 0x0F -> return 1
+
+Example 2:
+
+%%
+aa, 1
+bbb, 2
+baa, 1
+%%
+
+The input is first parsed to a list of words:
+["aa1", "bbb2", "baa1"]
+
+This produces the following graph:
+[root] --- a --- a --- 0x01 --- [end]
+   |      /           /       /
+   |    /           /        /
+    - b --- b --- b --- 0x02
+
+A C++ representation of the compressed graph is generated:
+
+const unsigned char dafsa[11] = {
+ 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
+};
+
+The bytes in the generated array have the following meaning:
+
+ 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
+ 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
+
+ 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
+ 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
+ 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
+
+ 5: 0x61 <char> label character 0x61 -> match "a"
+ 6: 0x61 <char> label character 0x61 -> match "a"
+ 7: 0x81 <return_value> 0x81 & 0x0F -> return 1
+
+ 8: 0x62 <char> label character 0x62 -> match "b"
+ 9: 0x62 <char> label character 0x62 -> match "b"
+10: 0x82 <return_value> 0x82 & 0x0F -> return 2
+"""
+import struct
+import sys
+
+from incremental_dafsa import Dafsa, Node
+
+
+class InputError(Exception):
+ """Exception raised for errors in the input file."""
+
+
+def top_sort(dafsa: Dafsa):
+ """Generates list of nodes in topological sort order."""
+ incoming = {}
+
+ def count_incoming(node: Node):
+ """Counts incoming references."""
+ if not node.is_end_node:
+ if id(node) not in incoming:
+ incoming[id(node)] = 1
+ for child in node.children.values():
+ count_incoming(child)
+ else:
+ incoming[id(node)] += 1
+
+ for node in dafsa.root_node.children.values():
+ count_incoming(node)
+
+ for node in dafsa.root_node.children.values():
+ incoming[id(node)] -= 1
+
+ waiting = [
+ node for node in dafsa.root_node.children.values() if incoming[id(node)] == 0
+ ]
+ nodes = []
+
+ while waiting:
+ node = waiting.pop()
+ assert incoming[id(node)] == 0
+ nodes.append(node)
+ for child in node.children.values():
+ if not child.is_end_node:
+ incoming[id(child)] -= 1
+ if incoming[id(child)] == 0:
+ waiting.append(child)
+ return nodes
+
+
+def encode_links(node: Node, offsets, current):
+ """Encodes a list of children as one, two or three byte offsets."""
+ if next(iter(node.children.values())).is_end_node:
+ # This is an <end_label> node and no links follow such nodes
+ return []
+ guess = 3 * len(node.children)
+ assert node.children
+
+ children = sorted(node.children.values(), key=lambda x: -offsets[id(x)])
+ while True:
+ offset = current + guess
+ buf = []
+ for child in children:
+ last = len(buf)
+ distance = offset - offsets[id(child)]
+ assert distance > 0 and distance < (1 << 21)
+
+ if distance < (1 << 6):
+ # A 6-bit offset: "s0xxxxxx"
+ buf.append(distance)
+ elif distance < (1 << 13):
+ # A 13-bit offset: "s10xxxxxxxxxxxxx"
+ buf.append(0x40 | (distance >> 8))
+ buf.append(distance & 0xFF)
+ else:
+ # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
+ buf.append(0x60 | (distance >> 16))
+ buf.append((distance >> 8) & 0xFF)
+ buf.append(distance & 0xFF)
+ # The distance in the first link is relative to the following record;
+ # distances in the other links are relative to the previous link.
+ offset -= distance
+ if len(buf) == guess:
+ break
+ guess = len(buf)
+ # Set most significant bit to mark end of links in this node.
+ buf[last] |= 1 << 7
+ buf.reverse()
+ return buf
+
+
+def encode_prefix(label):
+ """Encodes a node label as a list of bytes without a trailing high byte.
+
+ This method encodes a node if there is exactly one child and the
+ child follows immediately after so that no jump is needed. This label
+ will then be a prefix to the label in the child node.
+ """
+ assert label
+ return [ord(c) for c in reversed(label)]
+
+
+def encode_label(label):
+ """Encodes a node label as a list of bytes with a trailing high byte >0x80."""
+ buf = encode_prefix(label)
+ # Set most significant bit to mark end of label in this node.
+ buf[0] |= 1 << 7
+ return buf
+
+
+def encode(dafsa: Dafsa):
+ """Encodes a DAFSA to a list of bytes"""
+ output = []
+ offsets = {}
+
+ for node in reversed(top_sort(dafsa)):
+ if (
+ len(node.children) == 1
+ and not next(iter(node.children.values())).is_end_node
+ and (offsets[id(next(iter(node.children.values())))] == len(output))
+ ):
+ output.extend(encode_prefix(node.character))
+ else:
+ output.extend(encode_links(node, offsets, len(output)))
+ output.extend(encode_label(node.character))
+ offsets[id(node)] = len(output)
+
+ output.extend(encode_links(dafsa.root_node, offsets, len(output)))
+ output.reverse()
+ return output
+
+
+def to_cxx(data, preamble=None):
+ """Generates C++ code from a list of encoded bytes."""
+ text = "/* This file is generated. DO NOT EDIT!\n\n"
+ text += "The byte array encodes a dictionary of strings and values. See "
+ text += "make_dafsa.py for documentation."
+ text += "*/\n\n"
+
+ if preamble:
+ text += preamble
+ text += "\n\n"
+
+ text += "const unsigned char kDafsa[%s] = {\n" % len(data)
+ for i in range(0, len(data), 12):
+ text += " "
+ text += ", ".join("0x%02x" % byte for byte in data[i : i + 12])
+ text += ",\n"
+ text += "};\n"
+ return text
+
+
+def words_to_cxx(words, preamble=None):
+ """Generates C++ code from a word list"""
+ dafsa = Dafsa.from_tld_data(words)
+ return to_cxx(encode(dafsa), preamble)
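+
+# Illustrative use of words_to_cxx (hypothetical word list, already in the
+# parsed "domain + digit" form that parse_gperf below produces):
+#
+#   print(words_to_cxx(["example.com1", "example.net2"], preamble="// test"))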
+
+
+def words_to_bin(words):
+ """Generates bytes from a word list"""
+ dafsa = Dafsa.from_tld_data(words)
+ data = encode(dafsa)
+ return struct.pack("%dB" % len(data), *data)
+
+
+def parse_gperf(infile):
+ """Parses gperf file and extract strings and return code"""
+ lines = [line.strip() for line in infile]
+
+ # Extract the preamble.
+ first_delimiter = lines.index("%%")
+ preamble = "\n".join(lines[0:first_delimiter])
+
+ # Extract strings after the first '%%' and before the second '%%'.
+ begin = first_delimiter + 1
+ end = lines.index("%%", begin)
+ lines = lines[begin:end]
+ for line in lines:
+ if line[-3:-1] != ", ":
+ raise InputError('Expected "domainname, <digit>", found "%s"' % line)
+ # Technically the DAFSA format could support return values in range [0-31],
+ # but the values below are the only ones with a defined meaning.
+ if line[-1] not in "0124":
+ raise InputError(
+ 'Expected value to be one of {0,1,2,4}, found "%s"' % line[-1]
+ )
+ return (preamble, [line[:-3] + line[-1] for line in lines])
+
+
+def main(outfile, infile):
+ with open(infile, "r") as infile:
+ preamble, words = parse_gperf(infile)
+ outfile.write(words_to_cxx(words, preamble))
+ return 0
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 3:
+ print("usage: %s infile outfile" % sys.argv[0])
+ sys.exit(1)
+
+ with open(sys.argv[2], "w") as outfile:
+ sys.exit(main(outfile, sys.argv[1]))
diff --git a/xpcom/ds/tools/perfecthash.py b/xpcom/ds/tools/perfecthash.py
new file mode 100644
index 0000000000..b4e964a125
--- /dev/null
+++ b/xpcom/ds/tools/perfecthash.py
@@ -0,0 +1,371 @@
+# perfecthash.py - Helper for generating perfect hash functions
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+A perfect hash function (PHF) is a function which maps distinct elements from a
+source set to a sequence of integers with no collisions.
+
+PHFs generated by this module use a 32-bit FNV hash to index an intermediate
+table. The value from that table is then used to run a second 32-bit FNV hash to
+get a final index.
+
+The algorithm works by starting with the largest set of conflicts, and testing
+intermediate table values until no conflicts are generated, allowing efficient
+index lookup at runtime. (see also: http://stevehanov.ca/blog/index.php?id=119)
+
+NOTE: This perfect hash generator was designed to be simple, easy to follow, and
+maintainable, rather than being as optimized as possible, due to the relatively
+small dataset it was designed for. In the future we may want to optimize further.
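+
+A minimal usage sketch (illustrative; the entries here are hypothetical
+(bytes, value) two-tuples, matching the default key function):
+
+    phf = PerfectHash([(b"foo", 1), (b"bar", 2)], size=4)
+    entry = phf.get_entry(b"foo")  # -> (b"foo", 1)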
+"""
+
+import textwrap
+from collections import namedtuple
+
+import six
+from mozbuild.util import ensure_bytes
+
+
+# Iteration over bytestrings works differently in Python 2 and 3; this function
+# captures the two possibilities. Returns an 'int' given the output of iterating
+# through a bytestring regardless of the input.
+def _ord(c):
+ if six.PY3:
+ return c
+ return ord(c)
+
+
+class PerfectHash(object):
+ """PerfectHash objects represent a computed perfect hash function, which
+ can be generated at compile time to provide highly efficient and compact
+ static HashTables.
+
+ Consumers must provide an intermediate table size to store the generated
+ hash function. Larger tables will generate more quickly, while smaller
+ tables will consume less space on disk."""
+
+ # 32-bit FNV offset basis and prime value.
+ # NOTE: Must match values in |PerfectHash.h|
+ FNV_OFFSET_BASIS = 0x811C9DC5
+ FNV_PRIME = 16777619
+ U32_MAX = 0xFFFFFFFF
+
+ # Bucket of entries which map into a given intermediate index.
+ Bucket = namedtuple("Bucket", "index entries")
+
+ def __init__(self, entries, size, validate=True, key=lambda e: e[0]):
+ """Create a new PerfectHash
+
+ @param entries set of entries to generate a PHF for
+ @param size size of the PHF intermediate table
+ @param validate test the generated hash function after generation
+ @param key function to get 'memoryview'-compatible key for an
+ entry. Defaults to extracting the first element of an
+ entry 2-tuple.
+ """
+
+ assert 0 < len(entries) < self.U32_MAX, "bad # of entries!"
+ self._key = key
+
+ # Allocate the intermediate table and keys.
+ self.table = [0] * size
+ self.entries = [None] * len(entries)
+
+ # A set of Bucket values. Each bucket contains the index into the
+ # intermediate table, and a list of Entries which map into that bucket.
+ buckets = [self.Bucket(idx, []) for idx in range(size)]
+
+ # Determine which input strings map to which buckets in the table.
+ for entry in entries:
+ assert entry is not None, "None entry in entries"
+ assert self.key(entry).itemsize == 1, "non-byte key elements"
+ buckets[self._hash(self.key(entry)) % size].entries.append(entry)
+
+ # Sort the buckets such that the largest one comes first.
+ buckets.sort(key=lambda b: len(b.entries), reverse=True)
+
+ for bucket in buckets:
+ # Once we've reached an empty bucket, we're done.
+ if len(bucket.entries) == 0:
+ break
+
+ # Try values for the basis until we find one with no conflicts.
+ idx = 0
+ basis = 1
+ slots = []
+ while idx < len(bucket.entries):
+ slot = self._hash(self.key(bucket.entries[idx]), basis) % len(
+ self.entries
+ )
+ if self.entries[slot] is not None or slot in slots:
+ # There was a conflict, try the next basis.
+ basis += 1
+ idx = 0
+ del slots[:]
+ assert basis < self.U32_MAX, "table too small"
+ else:
+ slots.append(slot)
+ idx += 1
+
+ assert basis < self.U32_MAX
+
+ # We've found a basis which doesn't conflict
+ self.table[bucket.index] = basis
+ for slot, entry in zip(slots, bucket.entries):
+ self.entries[slot] = entry
+
+ # Validate that looking up each key succeeds
+ if validate:
+ for entry in entries:
+ assert self.get_entry(self.key(entry)), "get_entry(%s)" % repr(entry)
+
+ @classmethod
+ def _hash(cls, key, basis=FNV_OFFSET_BASIS):
+ """A basic FNV-based hash function. key is the memoryview to hash.
+ 32-bit FNV is used for indexing into the first table, and the value
+ stored in that table is used as the offset basis for indexing into the
+ values table."""
+ for byte in memoryview(ensure_bytes(key)):
+ obyte = _ord(byte)
+ basis ^= obyte # xor-in the byte
+ basis *= cls.FNV_PRIME # Multiply by the FNV prime
+ basis &= cls.U32_MAX # clamp to 32-bits
+ return basis
+
+ def key(self, entry):
+ return memoryview(ensure_bytes(self._key(entry)))
+
+ def get_raw_index(self, key):
+ """Determine the index in self.entries without validating"""
+ mid = self.table[self._hash(key) % len(self.table)]
+ return self._hash(key, mid) % len(self.entries)
+
+ def get_index(self, key):
+ """Given a key, determine the index in self.entries"""
+ idx = self.get_raw_index(key)
+ if memoryview(ensure_bytes(key)) != self.key(self.entries[idx]):
+ return None
+ return idx
+
+ def get_entry(self, key):
+ """Given a key, get the corresponding entry"""
+ idx = self.get_index(key)
+ if idx is None:
+ return None
+ return self.entries[idx]
+
+ @staticmethod
+ def _indent(text, amount=1):
+ return text.replace("\n", "\n" + (" " * amount))
+
+ def codegen(self, name, entry_type):
+ """Create a helper for codegening PHF logic"""
+ return CGHelper(self, name, entry_type)
+
+ def cxx_codegen(
+ self,
+ name,
+ entry_type,
+ lower_entry,
+ entries_name=None,
+ return_type=None,
+ return_entry="return entry;",
+ key_type="const char*",
+ key_bytes="aKey",
+ key_length="strlen(aKey)",
+ ):
+ """Generate complete C++ code for a get_entry-style method.
+
+ @param name Name for the entry getter function.
+ @param entry_type C++ type of each entry in static memory.
+ @param lower_entry Function called with each entry to convert it to a
+ C++ literal of type 'entry_type'.
+
+ # Optional Keyword Parameters:
+ @param entries_name Name for the generated entry table.
+
+ @param return_type C++ return type, default: 'const entry_type&'
+ @param return_entry Override the default behaviour for returning the
+ found entry. 'entry' is a reference to the found
+ entry, and 'aKey' is the lookup key. 'return_entry'
+ can be used for additional checks, e.g. for keys
+ not in the table.
+
+ @param key_type C++ key type, default: 'const char*'
+ @param key_bytes 'const char*' expression to get bytes for 'aKey'
+ @param key_length 'size_t' expression to get length of 'aKey'"""
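+        # An illustrative call (hypothetical names, assuming entries are
+        # (bytes key, int value) two-tuples):
+        #
+        #   phf.cxx_codegen("LookupFoo", "FooEntry",
+        #                   lambda e: '{ "%s", %d }' % (e[0].decode(), e[1]))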
+
+ if entries_name is None:
+ entries_name = "s%sEntries" % name
+
+ cg = self.codegen(entries_name, entry_type)
+ entries = cg.gen_entries(lower_entry)
+ getter = cg.gen_getter(
+ name, return_type, return_entry, key_type, key_bytes, key_length
+ )
+
+ return "%s\n\n%s" % (entries, getter)
+
+
+class CGHelper(object):
+ """Helper object for generating C++ code for a PerfectHash.
+ Created using PerfectHash.codegen()."""
+
+ def __init__(self, phf, entries_name, entry_type):
+ self.phf = phf
+ self.entries_name = entries_name
+ self.entry_type = entry_type
+
+ @staticmethod
+ def _indent(text, amount=1):
+ return text.replace("\n", "\n" + (" " * amount))
+
+ def basis_ty(self):
+ """Determine how big of an integer is needed for bases table."""
+ max_base = max(self.phf.table)
+ if max_base <= 0xFF:
+ return "uint8_t"
+ elif max_base <= 0xFFFF:
+ return "uint16_t"
+ return "uint32_t"
+
+ def basis_table(self, name="BASES"):
+ """Generate code for a static basis table"""
+ bases = ""
+ for idx, base in enumerate(self.phf.table):
+ if idx and idx % 16 == 0: # 16 bases per line
+ bases += "\n "
+ bases += "%4d," % base
+ return (
+ textwrap.dedent(
+ """\
+ static const %s %s[] = {
+ %s
+ };
+ """
+ )
+ % (self.basis_ty(), name, bases)
+ )
+
+ def gen_entries(self, lower_entry):
+ """Generate code for an entries table"""
+ entries = self._indent(
+ ",\n".join(lower_entry(entry).rstrip() for entry in self.phf.entries)
+ )
+ return (
+ textwrap.dedent(
+ """\
+ const %s %s[] = {
+ %s
+ };
+ """
+ )
+ % (self.entry_type, self.entries_name, entries)
+ )
+
+ def gen_getter(
+ self,
+ name,
+ return_type=None,
+ return_entry="return entry;",
+ key_type="const char*",
+ key_bytes="aKey",
+ key_length="strlen(aKey)",
+ ):
+ """Generate the code for a C++ getter.
+
+ @param name Name for the entry getter function.
+ @param return_type C++ return type, default: 'const entry_type&'
+ @param return_entry Override the default behaviour for returning the
+ found entry. 'entry' is a reference to the found
+ entry, and 'aKey' is the lookup key. 'return_entry'
+ can be used for additional checks, e.g. for keys
+ not in the table.
+
+ @param key_type C++ key type, default: 'const char*'
+ @param key_bytes 'const char*' expression to get bytes for 'aKey'
+ @param key_length 'size_t' expression to get length of 'aKey'"""
+
+ if return_type is None:
+ return_type = "const %s&" % self.entry_type
+
+ return textwrap.dedent(
+ """
+ %(return_type)s
+ %(name)s(%(key_type)s aKey)
+ {
+ %(basis_table)s
+
+ const char* bytes = %(key_bytes)s;
+ size_t length = %(key_length)s;
+ auto& entry = mozilla::perfecthash::Lookup(bytes, length, BASES,
+ %(entries_name)s);
+ %(return_entry)s
+ }
+ """
+ ) % {
+ "name": name,
+ "basis_table": self._indent(self.basis_table()),
+ "entries_name": self.entries_name,
+ "return_type": return_type,
+ "return_entry": self._indent(return_entry),
+ "key_type": key_type,
+ "key_bytes": key_bytes,
+ "key_length": key_length,
+ }
+
+ def gen_jslinearstr_getter(
+ self, name, return_type=None, return_entry="return entry;"
+ ):
+ """Generate code for a specialized getter taking JSLinearStrings.
+ This getter avoids copying the JS string, but only supports ASCII keys.
+
+ @param name Name for the entry getter function.
+ @param return_type C++ return type, default: 'const entry_type&'
+ @param return_entry Override the default behaviour for returning the
+ found entry. 'entry' is a reference to the found
+ entry, and 'aKey' is the lookup key. 'return_entry'
+ can be used for additional checks, e.g. for keys
+ not in the table."""
+
+ assert all(
+ _ord(b) <= 0x7F for e in self.phf.entries for b in self.phf.key(e)
+ ), "non-ASCII key"
+
+ if return_type is None:
+ return_type = "const %s&" % self.entry_type
+
+ return textwrap.dedent(
+ """
+ %(return_type)s
+ %(name)s(JSLinearString* aKey)
+ {
+ %(basis_table)s
+
+ size_t length = JS::GetLinearStringLength(aKey);
+
+ JS::AutoCheckCannotGC nogc;
+ if (JS::LinearStringHasLatin1Chars(aKey)) {
+ auto& entry = mozilla::perfecthash::Lookup(
+ JS::GetLatin1LinearStringChars(nogc, aKey),
+ length, BASES, %(entries_name)s);
+
+ %(return_entry)s
+ } else {
+ auto& entry = mozilla::perfecthash::Lookup(
+ JS::GetTwoByteLinearStringChars(nogc, aKey),
+ length, BASES, %(entries_name)s);
+
+ %(return_entry)s
+ }
+ }
+ """
+ ) % {
+ "name": name,
+ "basis_table": self._indent(self.basis_table()),
+ "entries_name": self.entries_name,
+ "return_type": return_type,
+ "return_entry": self._indent(return_entry, 2),
+ }