summaryrefslogtreecommitdiffstats
path: root/xpcom/ds/tools/perfecthash.py
diff options
context:
space:
mode:
Diffstat (limited to 'xpcom/ds/tools/perfecthash.py')
-rw-r--r--xpcom/ds/tools/perfecthash.py370
1 files changed, 370 insertions, 0 deletions
diff --git a/xpcom/ds/tools/perfecthash.py b/xpcom/ds/tools/perfecthash.py
new file mode 100644
index 0000000000..c4f8906248
--- /dev/null
+++ b/xpcom/ds/tools/perfecthash.py
@@ -0,0 +1,370 @@
+# perfecthash.py - Helper for generating perfect hash functions
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+A perfect hash function (PHF) is a function which maps distinct elements from a
+source set to a sequence of integers with no collisions.
+
+PHFs generated by this module use a 32-bit FNV hash to index an intermediate
+table. The value from that table is then used to run a second 32-bit FNV hash to
+get a final index.
+
+The algorithm works by starting with the largest set of conflicts, and testing
+intermediate table values until no conflicts are generated, allowing efficient
+index lookup at runtime. (see also: http://stevehanov.ca/blog/index.php?id=119)
+
+NOTE: This perfect hash generator was designed to be simple, easy to follow, and
+maintainable, rather than being as optimized as possible, due to the relatively
+small dataset it was designed for. In the future we may want to optimize further.
+"""
+
+from collections import namedtuple
+from mozbuild.util import ensure_bytes
+import textwrap
+import six
+
+
+# Iteration over bytestrings works differently in Python 2 and 3; this function
+# captures the two possibilities. Returns an 'int' given the output of iterating
+# through a bytestring regardless of the input.
+def _ord(c):
+ if six.PY3:
+ return c
+ return ord(c)
+
+
+class PerfectHash(object):
+ """PerfectHash objects represent a computed perfect hash function, which
+ can be generated at compile time to provide highly efficient and compact
+ static HashTables.
+
+ Consumers must provide an intermediate table size to store the generated
+ hash function. Larger tables will generate more quickly, while smaller
+ tables will consume less space on disk."""
+
+ # 32-bit FNV offset basis and prime value.
+ # NOTE: Must match values in |PerfectHash.h|
+ FNV_OFFSET_BASIS = 0x811C9DC5
+ FNV_PRIME = 16777619
+ U32_MAX = 0xFFFFFFFF
+
+ # Bucket of entries which map into a given intermediate index.
+ Bucket = namedtuple("Bucket", "index entries")
+
+ def __init__(self, entries, size, validate=True, key=lambda e: e[0]):
+ """Create a new PerfectHash
+
+ @param entries set of entries to generate a PHF for
+ @param size size of the PHF intermediate table
+ @param validate test the generated hash function after generation
+ @param key function to get 'memoryview'-compatible key for an
+ entry. defaults to extracting the first element of an
+ entry 2-tuple.
+ """
+
+ assert 0 < len(entries) < self.U32_MAX, "bad # of entries!"
+ self._key = key
+
+ # Allocate the intermediate table and keys.
+ self.table = [0] * size
+ self.entries = [None] * len(entries)
+
+ # A set of Bucket values. Each bucket contains the index into the
+ # intermediate table, and a list of Entries which map into that bucket.
+ buckets = [self.Bucket(idx, []) for idx in range(size)]
+
+ # Determine which input strings map to which buckets in the table.
+ for entry in entries:
+ assert entry is not None, "None entry in entries"
+ assert self.key(entry).itemsize == 1, "non-byte key elements"
+ buckets[self._hash(self.key(entry)) % size].entries.append(entry)
+
+ # Sort the buckets such that the largest one comes first.
+ buckets.sort(key=lambda b: len(b.entries), reverse=True)
+
+ for bucket in buckets:
+ # Once we've reached an empty bucket, we're done.
+ if len(bucket.entries) == 0:
+ break
+
+ # Try values for the basis until we find one with no conflicts.
+ idx = 0
+ basis = 1
+ slots = []
+ while idx < len(bucket.entries):
+ slot = self._hash(self.key(bucket.entries[idx]), basis) % len(
+ self.entries
+ )
+ if self.entries[slot] is not None or slot in slots:
+ # There was a conflict, try the next basis.
+ basis += 1
+ idx = 0
+ del slots[:]
+ assert basis < self.U32_MAX, "table too small"
+ else:
+ slots.append(slot)
+ idx += 1
+
+ assert basis < self.U32_MAX
+
+ # We've found a basis which doesn't conflict
+ self.table[bucket.index] = basis
+ for slot, entry in zip(slots, bucket.entries):
+ self.entries[slot] = entry
+
+ # Validate that looking up each key succeeds
+ if validate:
+ for entry in entries:
+ assert self.get_entry(self.key(entry)), "get_entry(%s)" % repr(entry)
+
+ @classmethod
+ def _hash(cls, key, basis=FNV_OFFSET_BASIS):
+ """A basic FNV-based hash function. key is the memoryview to hash.
+ 32-bit FNV is used for indexing into the first table, and the value
+ stored in that table is used as the offset basis for indexing into the
+ values table."""
+ for byte in memoryview(key):
+ obyte = _ord(byte)
+ basis ^= obyte # xor-in the byte
+ basis *= cls.FNV_PRIME # Multiply by the FNV prime
+ basis &= cls.U32_MAX # clamp to 32-bits
+ return basis
+
+ def key(self, entry):
+ return memoryview(ensure_bytes(self._key(entry)))
+
+ def get_raw_index(self, key):
+ """Determine the index in self.entries without validating"""
+ mid = self.table[self._hash(key) % len(self.table)]
+ return self._hash(key, mid) % len(self.entries)
+
+ def get_index(self, key):
+ """Given a key, determine the index in self.entries"""
+ idx = self.get_raw_index(key)
+ if memoryview(key) != self.key(self.entries[idx]):
+ return None
+ return idx
+
+ def get_entry(self, key):
+ """Given a key, get the corresponding entry"""
+ idx = self.get_index(key)
+ if idx is None:
+ return None
+ return self.entries[idx]
+
+ @staticmethod
+ def _indent(text, amount=1):
+ return text.replace("\n", "\n" + (" " * amount))
+
+ def codegen(self, name, entry_type):
+ """Create a helper for codegening PHF logic"""
+ return CGHelper(self, name, entry_type)
+
+ def cxx_codegen(
+ self,
+ name,
+ entry_type,
+ lower_entry,
+ entries_name=None,
+ return_type=None,
+ return_entry="return entry;",
+ key_type="const char*",
+ key_bytes="aKey",
+ key_length="strlen(aKey)",
+ ):
+ """Generate complete C++ code for a get_entry-style method.
+
+ @param name Name for the entry getter function.
+ @param entry_type C++ type of each entry in static memory.
+ @param lower_entry Function called with each entry to convert it to a
+ C++ literal of type 'entry_type'.
+
+ # Optional Keyword Parameters:
+ @param entries_name Name for the generated entry table.
+
+ @param return_type C++ return type, default: 'const entry_type&'
+ @param return_entry Override the default behaviour for returning the
+ found entry. 'entry' is a reference to the found
+ entry, and 'aKey' is the lookup key. 'return_entry'
+ can be used for additional checks, e.g. for keys
+ not in the table.
+
+ @param key_type C++ key type, default: 'const char*'
+ @param key_bytes 'const char*' expression to get bytes for 'aKey'
+ @param key_length 'size_t' expression to get length of 'aKey'"""
+
+ if entries_name is None:
+ entries_name = "s%sEntries" % name
+
+ cg = self.codegen(entries_name, entry_type)
+ entries = cg.gen_entries(lower_entry)
+ getter = cg.gen_getter(
+ name, return_type, return_entry, key_type, key_bytes, key_length
+ )
+
+ return "%s\n\n%s" % (entries, getter)
+
+
+class CGHelper(object):
+ """Helper object for generating C++ code for a PerfectHash.
+ Created using PerfectHash.codegen()."""
+
+ def __init__(self, phf, entries_name, entry_type):
+ self.phf = phf
+ self.entries_name = entries_name
+ self.entry_type = entry_type
+
+ @staticmethod
+ def _indent(text, amount=1):
+ return text.replace("\n", "\n" + (" " * amount))
+
+ def basis_ty(self):
+ """Determine how big of an integer is needed for bases table."""
+ max_base = max(self.phf.table)
+ if max_base <= 0xFF:
+ return "uint8_t"
+ elif max_base <= 0xFFFF:
+ return "uint16_t"
+ return "uint32_t"
+
+ def basis_table(self, name="BASES"):
+ """Generate code for a static basis table"""
+ bases = ""
+ for idx, base in enumerate(self.phf.table):
+ if idx and idx % 16 == 0: # 16 bases per line
+ bases += "\n "
+ bases += "%4d," % base
+ return (
+ textwrap.dedent(
+ """\
+ static const %s %s[] = {
+ %s
+ };
+ """
+ )
+ % (self.basis_ty(), name, bases)
+ )
+
+ def gen_entries(self, lower_entry):
+ """Generate code for an entries table"""
+ entries = self._indent(
+ ",\n".join(lower_entry(entry).rstrip() for entry in self.phf.entries)
+ )
+ return (
+ textwrap.dedent(
+ """\
+ const %s %s[] = {
+ %s
+ };
+ """
+ )
+ % (self.entry_type, self.entries_name, entries)
+ )
+
+ def gen_getter(
+ self,
+ name,
+ return_type=None,
+ return_entry="return entry;",
+ key_type="const char*",
+ key_bytes="aKey",
+ key_length="strlen(aKey)",
+ ):
+ """Generate the code for a C++ getter.
+
+ @param name Name for the entry getter function.
+ @param return_type C++ return type, default: 'const entry_type&'
+ @param return_entry Override the default behaviour for returning the
+ found entry. 'entry' is a reference to the found
+ entry, and 'aKey' is the lookup key. 'return_entry'
+ can be used for additional checks, e.g. for keys
+ not in the table.
+
+ @param key_type C++ key type, default: 'const char*'
+ @param key_bytes 'const char*' expression to get bytes for 'aKey'
+ @param key_length 'size_t' expression to get length of 'aKey'"""
+
+ if return_type is None:
+ return_type = "const %s&" % self.entry_type
+
+ return textwrap.dedent(
+ """
+ %(return_type)s
+ %(name)s(%(key_type)s aKey)
+ {
+ %(basis_table)s
+
+ const char* bytes = %(key_bytes)s;
+ size_t length = %(key_length)s;
+ auto& entry = mozilla::perfecthash::Lookup(bytes, length, BASES,
+ %(entries_name)s);
+ %(return_entry)s
+ }
+ """
+ ) % {
+ "name": name,
+ "basis_table": self._indent(self.basis_table()),
+ "entries_name": self.entries_name,
+ "return_type": return_type,
+ "return_entry": self._indent(return_entry),
+ "key_type": key_type,
+ "key_bytes": key_bytes,
+ "key_length": key_length,
+ }
+
+ def gen_jslinearstr_getter(
+ self, name, return_type=None, return_entry="return entry;"
+ ):
+ """Generate code for a specialized getter taking JSLinearStrings.
+ This getter avoids copying the JS string, but only supports ASCII keys.
+
+ @param name Name for the entry getter function.
+ @param return_type C++ return type, default: 'const entry_type&'
+ @param return_entry Override the default behaviour for returning the
+ found entry. 'entry' is a reference to the found
+ entry, and 'aKey' is the lookup key. 'return_entry'
+ can be used for additional checks, e.g. for keys
+ not in the table."""
+
+ assert all(
+ _ord(b) <= 0x7F for e in self.phf.entries for b in self.phf.key(e)
+ ), "non-ASCII key"
+
+ if return_type is None:
+ return_type = "const %s&" % self.entry_type
+
+ return textwrap.dedent(
+ """
+ %(return_type)s
+ %(name)s(JSLinearString* aKey)
+ {
+ %(basis_table)s
+
+ size_t length = JS::GetLinearStringLength(aKey);
+
+ JS::AutoCheckCannotGC nogc;
+ if (JS::LinearStringHasLatin1Chars(aKey)) {
+ auto& entry = mozilla::perfecthash::Lookup(
+ JS::GetLatin1LinearStringChars(nogc, aKey),
+ length, BASES, %(entries_name)s);
+
+ %(return_entry)s
+ } else {
+ auto& entry = mozilla::perfecthash::Lookup(
+ JS::GetTwoByteLinearStringChars(nogc, aKey),
+ length, BASES, %(entries_name)s);
+
+ %(return_entry)s
+ }
+ }
+ """
+ ) % {
+ "name": name,
+ "basis_table": self._indent(self.basis_table()),
+ "entries_name": self.entries_name,
+ "return_type": return_type,
+ "return_entry": self._indent(return_entry, 2),
+ }