diff options
Diffstat (limited to 'xpcom/ds/tools/perfecthash.py')
-rw-r--r-- | xpcom/ds/tools/perfecthash.py | 377 |
1 files changed, 377 insertions, 0 deletions
diff --git a/xpcom/ds/tools/perfecthash.py b/xpcom/ds/tools/perfecthash.py new file mode 100644 index 0000000000..929b643a30 --- /dev/null +++ b/xpcom/ds/tools/perfecthash.py @@ -0,0 +1,377 @@ +# perfecthash.py - Helper for generating perfect hash functions +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +A perfect hash function (PHF) is a function which maps distinct elements from a +source set to a sequence of integers with no collisions. + +PHFs generated by this module use a 32-bit FNV hash to index an intermediate +table. The value from that table is then used to run a second 32-bit FNV hash to +get a final index. + +The algorithm works by starting with the largest set of conflicts, and testing +intermediate table values until no conflicts are generated, allowing efficient +index lookup at runtime. (see also: http://stevehanov.ca/blog/index.php?id=119) + +NOTE: This perfect hash generator was designed to be simple, easy to follow, and +maintainable, rather than being as optimized as possible, due to the relatively +small dataset it was designed for. In the future we may want to optimize further. +""" + +import textwrap +from collections import namedtuple + +import six +from mozbuild.util import ensure_bytes + + +# Iteration over bytestrings works differently in Python 2 and 3; this function +# captures the two possibilities. Returns an 'int' given the output of iterating +# through a bytestring regardless of the input. +def _ord(c): + if six.PY3: + return c + return ord(c) + + +class PerfectHash(object): + """PerfectHash objects represent a computed perfect hash function, which + can be generated at compile time to provide highly efficient and compact + static HashTables. + + Consumers must provide an intermediate table size to store the generated + hash function. Larger tables will generate more quickly, while smaller + tables will consume less space on disk.""" + + # 32-bit FNV offset basis and prime value. + # NOTE: Must match values in |PerfectHash.h| + FNV_OFFSET_BASIS = 0x811C9DC5 + FNV_PRIME = 16777619 + U32_MAX = 0xFFFFFFFF + + # Bucket of entries which map into a given intermediate index. + Bucket = namedtuple("Bucket", "index entries") + + def __init__(self, entries, size, validate=True, key=lambda e: e[0]): + """Create a new PerfectHash + + @param entries set of entries to generate a PHF for + @param size size of the PHF intermediate table + @param validate test the generated hash function after generation + @param key function to get 'memoryview'-compatible key for an + entry. defaults to extracting the first element of an + entry 2-tuple. + """ + + assert 0 < len(entries) < self.U32_MAX, "bad # of entries!" + self._key = key + + # Allocate the intermediate table and keys. + self.table = [0] * size + self.entries = [None] * len(entries) + + # A set of Bucket values. Each bucket contains the index into the + # intermediate table, and a list of Entries which map into that bucket. + buckets = [self.Bucket(idx, []) for idx in range(size)] + + # Determine which input strings map to which buckets in the table. + for entry in entries: + assert entry is not None, "None entry in entries" + assert self.key(entry).itemsize == 1, "non-byte key elements" + buckets[self._hash(self.key(entry)) % size].entries.append(entry) + + # Sort the buckets such that the largest one comes first. + buckets.sort(key=lambda b: len(b.entries), reverse=True) + + for bucket in buckets: + # Once we've reached an empty bucket, we're done. + if len(bucket.entries) == 0: + break + + # Try values for the basis until we find one with no conflicts. + idx = 0 + basis = 1 + slots = [] + while idx < len(bucket.entries): + slot = self._hash(self.key(bucket.entries[idx]), basis) % len( + self.entries + ) + if self.entries[slot] is not None or slot in slots: + # There was a conflict, try the next basis. + basis += 1 + idx = 0 + del slots[:] + assert basis < self.U32_MAX, "table too small" + else: + slots.append(slot) + idx += 1 + + assert basis < self.U32_MAX + + # We've found a basis which doesn't conflict + self.table[bucket.index] = basis + for slot, entry in zip(slots, bucket.entries): + self.entries[slot] = entry + + # Validate that looking up each key succeeds + if validate: + for entry in entries: + assert self.get_entry(self.key(entry)), "get_entry(%s)" % repr(entry) + + @classmethod + def _hash(cls, key, basis=FNV_OFFSET_BASIS): + """A basic FNV-based hash function. key is the memoryview to hash. + 32-bit FNV is used for indexing into the first table, and the value + stored in that table is used as the offset basis for indexing into the + values table.""" + for byte in memoryview(ensure_bytes(key)): + obyte = _ord(byte) + basis ^= obyte # xor-in the byte + basis *= cls.FNV_PRIME # Multiply by the FNV prime + basis &= cls.U32_MAX # clamp to 32-bits + return basis + + def key(self, entry): + return memoryview(ensure_bytes(self._key(entry))) + + def get_raw_index(self, key): + """Determine the index in self.entries without validating""" + mid = self.table[self._hash(key) % len(self.table)] + return self._hash(key, mid) % len(self.entries) + + def get_index(self, key): + """Given a key, determine the index in self.entries""" + idx = self.get_raw_index(key) + if memoryview(ensure_bytes(key)) != self.key(self.entries[idx]): + return None + return idx + + def get_entry(self, key): + """Given a key, get the corresponding entry""" + idx = self.get_index(key) + if idx is None: + return None + return self.entries[idx] + + @staticmethod + def _indent(text, amount=1): + return text.replace("\n", "\n" + (" " * amount)) + + def codegen(self, name, entry_type): + """Create a helper for codegening PHF logic""" + return CGHelper(self, name, entry_type) + + def cxx_codegen( + self, + name, + entry_type, + lower_entry, + entries_name=None, + return_type=None, + return_entry="return entry;", + key_type="const char*", + key_bytes="aKey", + key_length="strlen(aKey)", + ): + """Generate complete C++ code for a get_entry-style method. + + @param name Name for the entry getter function. + @param entry_type C++ type of each entry in static memory. + @param lower_entry Function called with each entry to convert it to a + C++ literal of type 'entry_type'. + + # Optional Keyword Parameters: + @param entries_name Name for the generated entry table. + + @param return_type C++ return type, default: 'const entry_type&' + @param return_entry Override the default behaviour for returning the + found entry. 'entry' is a reference to the found + entry, and 'aKey' is the lookup key. 'return_entry' + can be used for additional checks, e.g. for keys + not in the table. + + @param key_type C++ key type, default: 'const char*' + @param key_bytes 'const char*' expression to get bytes for 'aKey' + @param key_length 'size_t' expression to get length of 'aKey'""" + + if entries_name is None: + entries_name = "s%sEntries" % name + + cg = self.codegen(entries_name, entry_type) + entries = cg.gen_entries(lower_entry) + getter = cg.gen_getter( + name, return_type, return_entry, key_type, key_bytes, key_length + ) + + return "%s\n\n%s" % (entries, getter) + + +class CGHelper(object): + """Helper object for generating C++ code for a PerfectHash. + Created using PerfectHash.codegen().""" + + def __init__(self, phf, entries_name, entry_type): + self.phf = phf + self.entries_name = entries_name + self.entry_type = entry_type + + @staticmethod + def _indent(text, amount=1): + return text.replace("\n", "\n" + (" " * amount)) + + def basis_ty(self): + """Determine how big of an integer is needed for bases table.""" + max_base = max(self.phf.table) + if max_base <= 0xFF: + return "uint8_t" + elif max_base <= 0xFFFF: + return "uint16_t" + return "uint32_t" + + def basis_table(self, name="BASES"): + """Generate code for a static basis table""" + bases = "" + for idx, base in enumerate(self.phf.table): + if idx and idx % 16 == 0: # 16 bases per line + bases += "\n " + bases += "%4d," % base + return ( + textwrap.dedent( + """\ + static const %s %s[] = { + %s + }; + """ + ) + % (self.basis_ty(), name, bases) + ) + + def gen_entries(self, lower_entry): + """Generate code for an entries table""" + entries = self._indent( + ",\n".join(lower_entry(entry).rstrip() for entry in self.phf.entries) + ) + return ( + textwrap.dedent( + """\ + const %s %s[] = { + %s + }; + """ + ) + % (self.entry_type, self.entries_name, entries) + ) + + def gen_getter( + self, + name, + return_type=None, + return_entry="return entry;", + key_type="const char*", + key_bytes="aKey", + key_length="strlen(aKey)", + ): + """Generate the code for a C++ getter. + + @param name Name for the entry getter function. + @param return_type C++ return type, default: 'const entry_type&' + @param return_entry Override the default behaviour for returning the + found entry. 'entry' is a reference to the found + entry, and 'aKey' is the lookup key. 'return_entry' + can be used for additional checks, e.g. for keys + not in the table. + + @param key_type C++ key type, default: 'const char*' + @param key_bytes 'const char*' expression to get bytes for 'aKey' + @param key_length 'size_t' expression to get length of 'aKey'""" + + if return_type is None: + return_type = "const %s&" % self.entry_type + + return ( + textwrap.dedent( + """ + %(return_type)s + %(name)s(%(key_type)s aKey) + { + %(basis_table)s + + const char* bytes = %(key_bytes)s; + size_t length = %(key_length)s; + auto& entry = mozilla::perfecthash::Lookup(bytes, length, BASES, + %(entries_name)s); + %(return_entry)s + } + """ + ) + % { + "name": name, + "basis_table": self._indent(self.basis_table()), + "entries_name": self.entries_name, + "return_type": return_type, + "return_entry": self._indent(return_entry), + "key_type": key_type, + "key_bytes": key_bytes, + "key_length": key_length, + } + ) + + def gen_jslinearstr_getter( + self, name, return_type=None, return_entry="return entry;" + ): + """Generate code for a specialized getter taking JSLinearStrings. + This getter avoids copying the JS string, but only supports ASCII keys. + + @param name Name for the entry getter function. + @param return_type C++ return type, default: 'const entry_type&' + @param return_entry Override the default behaviour for returning the + found entry. 'entry' is a reference to the found + entry, and 'aKey' is the lookup key. 'return_entry' + can be used for additional checks, e.g. for keys + not in the table.""" + + assert all( + _ord(b) <= 0x7F for e in self.phf.entries for b in self.phf.key(e) + ), "non-ASCII key" + + if return_type is None: + return_type = "const %s&" % self.entry_type + + return ( + textwrap.dedent( + """ + %(return_type)s + %(name)s(JSLinearString* aKey) + { + %(basis_table)s + + size_t length = JS::GetLinearStringLength(aKey); + + JS::AutoCheckCannotGC nogc; + if (JS::LinearStringHasLatin1Chars(aKey)) { + auto& entry = mozilla::perfecthash::Lookup( + JS::GetLatin1LinearStringChars(nogc, aKey), + length, BASES, %(entries_name)s); + + %(return_entry)s + } else { + auto& entry = mozilla::perfecthash::Lookup( + JS::GetTwoByteLinearStringChars(nogc, aKey), + length, BASES, %(entries_name)s); + + %(return_entry)s + } + } + """ + ) + % { + "name": name, + "basis_table": self._indent(self.basis_table()), + "entries_name": self.entries_name, + "return_type": return_type, + "return_entry": self._indent(return_entry, 2), + } + ) |