Adding upstream version 110.0.1.upstream/110.0.1 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 09:22:09 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 09:22:09 +0000
commit: 43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree: 620249daf56c0258faa40cbdcf9cfba06de2a846 /xpcom/ds/tools/perfecthash.py
parent: Initial commit. (diff)
download: firefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz
firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip
1 files changed, 371 insertions, 0 deletions
diff --git a/xpcom/ds/tools/perfecthash.py b/xpcom/ds/tools/perfecthash.py
new file mode 100644
index 0000000000..b4e964a125
--- /dev/null
+++ b/xpcom/ds/tools/perfecthash.py
@@ -0,0 +1,371 @@
+# perfecthash.py - Helper for generating perfect hash functions
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+A perfect hash function (PHF) is a function which maps distinct elements from a
+source set to a sequence of integers with no collisions.
+
+PHFs generated by this module use a 32-bit FNV hash to index an intermediate
+table. The value from that table is then used to run a second 32-bit FNV hash to
+get a final index.
+
+The algorithm works by starting with the largest set of conflicts, and testing
+intermediate table values until no conflicts are generated, allowing efficient
+index lookup at runtime. (see also: http://stevehanov.ca/blog/index.php?id=119)
+
+NOTE: This perfect hash generator was designed to be simple, easy to follow, and
+maintainable, rather than being as optimized as possible, due to the relatively
+small dataset it was designed for. In the future we may want to optimize further.
+"""
+
+import textwrap
+from collections import namedtuple
+
+import six
+from mozbuild.util import ensure_bytes
+
+
+# Iteration over bytestrings works differently in Python 2 and 3; this function
+# captures the two possibilities. Returns an 'int' given the output of iterating
+# through a bytestring regardless of the input.
+def _ord(c):
+    if six.PY3:
+        return c
+    return ord(c)
+
+
+class PerfectHash(object):
+    """PerfectHash objects represent a computed perfect hash function, which
+    can be generated at compile time to provide highly efficient and compact
+    static HashTables.
+
+    Consumers must provide an intermediate table size to store the generated
+    hash function. Larger tables will generate more quickly, while smaller
+    tables will consume less space on disk."""
+
+    # 32-bit FNV offset basis and prime value.
+    # NOTE: Must match values in |PerfectHash.h|
+    FNV_OFFSET_BASIS = 0x811C9DC5
+    FNV_PRIME = 16777619
+    U32_MAX = 0xFFFFFFFF
+
+    # Bucket of entries which map into a given intermediate index.
+    Bucket = namedtuple("Bucket", "index entries")
+
+    def __init__(self, entries, size, validate=True, key=lambda e: e[0]):
+        """Create a new PerfectHash
+
+        @param entries  set of entries to generate a PHF for
+        @param size     size of the PHF intermediate table
+        @param validate test the generated hash function after generation
+        @param key      function to get 'memoryview'-compatible key for an
+                        entry. defaults to extracting the first element of an
+                        entry 2-tuple.
+        """
+
+        assert 0 < len(entries) < self.U32_MAX, "bad # of entries!"
+        self._key = key
+
+        # Allocate the intermediate table and keys.
+        self.table = [0] * size
+        self.entries = [None] * len(entries)
+
+        # A set of Bucket values. Each bucket contains the index into the
+        # intermediate table, and a list of Entries which map into that bucket.
+        buckets = [self.Bucket(idx, []) for idx in range(size)]
+
+        # Determine which input strings map to which buckets in the table.
+        for entry in entries:
+            assert entry is not None, "None entry in entries"
+            assert self.key(entry).itemsize == 1, "non-byte key elements"
+            buckets[self._hash(self.key(entry)) % size].entries.append(entry)
+
+        # Sort the buckets such that the largest one comes first.
+        buckets.sort(key=lambda b: len(b.entries), reverse=True)
+
+        for bucket in buckets:
+            # Once we've reached an empty bucket, we're done.
+            if len(bucket.entries) == 0:
+                break
+
+            # Try values for the basis until we find one with no conflicts.
+            idx = 0
+            basis = 1
+            slots = []
+            while idx < len(bucket.entries):
+                slot = self._hash(self.key(bucket.entries[idx]), basis) % len(
+                    self.entries
+                )
+                if self.entries[slot] is not None or slot in slots:
+                    # There was a conflict, try the next basis.
+                    basis += 1
+                    idx = 0
+                    del slots[:]
+                    assert basis < self.U32_MAX, "table too small"
+                else:
+                    slots.append(slot)
+                    idx += 1
+
+            assert basis < self.U32_MAX
+
+            # We've found a basis which doesn't conflict
+            self.table[bucket.index] = basis
+            for slot, entry in zip(slots, bucket.entries):
+                self.entries[slot] = entry
+
+        # Validate that looking up each key succeeds
+        if validate:
+            for entry in entries:
+                assert self.get_entry(self.key(entry)), "get_entry(%s)" % repr(entry)
+
+    @classmethod
+    def _hash(cls, key, basis=FNV_OFFSET_BASIS):
+        """A basic FNV-based hash function. key is the memoryview to hash.
+        32-bit FNV is used for indexing into the first table, and the value
+        stored in that table is used as the offset basis for indexing into the
+        values table."""
+        for byte in memoryview(ensure_bytes(key)):
+            obyte = _ord(byte)
+            basis ^= obyte  # xor-in the byte
+            basis *= cls.FNV_PRIME  # Multiply by the FNV prime
+            basis &= cls.U32_MAX  # clamp to 32-bits
+        return basis
+
+    def key(self, entry):
+        return memoryview(ensure_bytes(self._key(entry)))
+
+    def get_raw_index(self, key):
+        """Determine the index in self.entries without validating"""
+        mid = self.table[self._hash(key) % len(self.table)]
+        return self._hash(key, mid) % len(self.entries)
+
+    def get_index(self, key):
+        """Given a key, determine the index in self.entries"""
+        idx = self.get_raw_index(key)
+        if memoryview(ensure_bytes(key)) != self.key(self.entries[idx]):
+            return None
+        return idx
+
+    def get_entry(self, key):
+        """Given a key, get the corresponding entry"""
+        idx = self.get_index(key)
+        if idx is None:
+            return None
+        return self.entries[idx]
+
+    @staticmethod
+    def _indent(text, amount=1):
+        return text.replace("\n", "\n" + ("  " * amount))
+
+    def codegen(self, name, entry_type):
+        """Create a helper for codegening PHF logic"""
+        return CGHelper(self, name, entry_type)
+
+    def cxx_codegen(
+        self,
+        name,
+        entry_type,
+        lower_entry,
+        entries_name=None,
+        return_type=None,
+        return_entry="return entry;",
+        key_type="const char*",
+        key_bytes="aKey",
+        key_length="strlen(aKey)",
+    ):
+        """Generate complete C++ code for a get_entry-style method.
+
+        @param name         Name for the entry getter function.
+        @param entry_type   C++ type of each entry in static memory.
+        @param lower_entry  Function called with each entry to convert it to a
+                            C++ literal of type 'entry_type'.
+
+        # Optional Keyword Parameters:
+        @param entries_name Name for the generated entry table.
+
+        @param return_type  C++ return type, default: 'const entry_type&'
+        @param return_entry Override the default behaviour for returning the
+                            found entry. 'entry' is a reference to the found
+                            entry, and 'aKey' is the lookup key. 'return_entry'
+                            can be used for additional checks, e.g. for keys
+                            not in the table.
+
+        @param key_type     C++ key type, default: 'const char*'
+        @param key_bytes    'const char*' expression to get bytes for 'aKey'
+        @param key_length   'size_t' expression to get length of 'aKey'"""
+
+        if entries_name is None:
+            entries_name = "s%sEntries" % name
+
+        cg = self.codegen(entries_name, entry_type)
+        entries = cg.gen_entries(lower_entry)
+        getter = cg.gen_getter(
+            name, return_type, return_entry, key_type, key_bytes, key_length
+        )
+
+        return "%s\n\n%s" % (entries, getter)
+
+
+class CGHelper(object):
+    """Helper object for generating C++ code for a PerfectHash.
+    Created using PerfectHash.codegen()."""
+
+    def __init__(self, phf, entries_name, entry_type):
+        self.phf = phf
+        self.entries_name = entries_name
+        self.entry_type = entry_type
+
+    @staticmethod
+    def _indent(text, amount=1):
+        return text.replace("\n", "\n" + ("  " * amount))
+
+    def basis_ty(self):
+        """Determine how big of an integer is needed for bases table."""
+        max_base = max(self.phf.table)
+        if max_base <= 0xFF:
+            return "uint8_t"
+        elif max_base <= 0xFFFF:
+            return "uint16_t"
+        return "uint32_t"
+
+    def basis_table(self, name="BASES"):
+        """Generate code for a static basis table"""
+        bases = ""
+        for idx, base in enumerate(self.phf.table):
+            if idx and idx % 16 == 0:  # 16 bases per line
+                bases += "\n  "
+            bases += "%4d," % base
+        return (
+            textwrap.dedent(
+                """\
+            static const %s %s[] = {
+              %s
+            };
+            """
+            )
+            % (self.basis_ty(), name, bases)
+        )
+
+    def gen_entries(self, lower_entry):
+        """Generate code for an entries table"""
+        entries = self._indent(
+            ",\n".join(lower_entry(entry).rstrip() for entry in self.phf.entries)
+        )
+        return (
+            textwrap.dedent(
+                """\
+            const %s %s[] = {
+              %s
+            };
+            """
+            )
+            % (self.entry_type, self.entries_name, entries)
+        )
+
+    def gen_getter(
+        self,
+        name,
+        return_type=None,
+        return_entry="return entry;",
+        key_type="const char*",
+        key_bytes="aKey",
+        key_length="strlen(aKey)",
+    ):
+        """Generate the code for a C++ getter.
+
+        @param name         Name for the entry getter function.
+        @param return_type  C++ return type, default: 'const entry_type&'
+        @param return_entry Override the default behaviour for returning the
+                            found entry. 'entry' is a reference to the found
+                            entry, and 'aKey' is the lookup key. 'return_entry'
+                            can be used for additional checks, e.g. for keys
+                            not in the table.
+
+        @param key_type     C++ key type, default: 'const char*'
+        @param key_bytes    'const char*' expression to get bytes for 'aKey'
+        @param key_length   'size_t' expression to get length of 'aKey'"""
+
+        if return_type is None:
+            return_type = "const %s&" % self.entry_type
+
+        return textwrap.dedent(
+            """
+            %(return_type)s
+            %(name)s(%(key_type)s aKey)
+            {
+              %(basis_table)s
+
+              const char* bytes = %(key_bytes)s;
+              size_t length = %(key_length)s;
+              auto& entry = mozilla::perfecthash::Lookup(bytes, length, BASES,
+                                                         %(entries_name)s);
+              %(return_entry)s
+            }
+            """
+        ) % {
+            "name": name,
+            "basis_table": self._indent(self.basis_table()),
+            "entries_name": self.entries_name,
+            "return_type": return_type,
+            "return_entry": self._indent(return_entry),
+            "key_type": key_type,
+            "key_bytes": key_bytes,
+            "key_length": key_length,
+        }
+
+    def gen_jslinearstr_getter(
+        self, name, return_type=None, return_entry="return entry;"
+    ):
+        """Generate code for a specialized getter taking JSLinearStrings.
+        This getter avoids copying the JS string, but only supports ASCII keys.
+
+        @param name         Name for the entry getter function.
+        @param return_type  C++ return type, default: 'const entry_type&'
+        @param return_entry Override the default behaviour for returning the
+                            found entry. 'entry' is a reference to the found
+                            entry, and 'aKey' is the lookup key. 'return_entry'
+                            can be used for additional checks, e.g. for keys
+                            not in the table."""
+
+        assert all(
+            _ord(b) <= 0x7F for e in self.phf.entries for b in self.phf.key(e)
+        ), "non-ASCII key"
+
+        if return_type is None:
+            return_type = "const %s&" % self.entry_type
+
+        return textwrap.dedent(
+            """
+            %(return_type)s
+            %(name)s(JSLinearString* aKey)
+            {
+              %(basis_table)s
+
+              size_t length = JS::GetLinearStringLength(aKey);
+
+              JS::AutoCheckCannotGC nogc;
+              if (JS::LinearStringHasLatin1Chars(aKey)) {
+                auto& entry = mozilla::perfecthash::Lookup(
+                  JS::GetLatin1LinearStringChars(nogc, aKey),
+                  length, BASES, %(entries_name)s);
+
+                %(return_entry)s
+              } else {
+                auto& entry = mozilla::perfecthash::Lookup(
+                  JS::GetTwoByteLinearStringChars(nogc, aKey),
+                  length, BASES, %(entries_name)s);
+
+                %(return_entry)s
+              }
+            }
+            """
+        ) % {
+            "name": name,
+            "basis_table": self._indent(self.basis_table()),
+            "entries_name": self.entries_name,
+            "return_type": return_type,
+            "return_entry": self._indent(return_entry, 2),
+        }
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 09:22:09 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 09:22:09 +0000
commit	43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree	620249daf56c0258faa40cbdcf9cfba06de2a846 /xpcom/ds/tools/perfecthash.py
parent	Initial commit. (diff)
download	firefox-43a97878ce14b72f0981164f87f2e35e14151312.tar.xz firefox-43a97878ce14b72f0981164f87f2e35e14151312.zip