366 lines
13 KiB
Python
366 lines
13 KiB
Python
# perfecthash.py - Helper for generating perfect hash functions
|
|
#
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
"""
|
|
A perfect hash function (PHF) is a function which maps distinct elements from a
|
|
source set to a sequence of integers with no collisions.
|
|
|
|
PHFs generated by this module use a 32-bit FNV hash to index an intermediate
|
|
table. The value from that table is then used to run a second 32-bit FNV hash to
|
|
get a final index.
|
|
|
|
The algorithm works by starting with the largest set of conflicts, and testing
|
|
intermediate table values until no conflicts are generated, allowing efficient
|
|
index lookup at runtime. (see also: http://stevehanov.ca/blog/index.php?id=119)
|
|
|
|
NOTE: This perfect hash generator was designed to be simple, easy to follow, and
|
|
maintainable, rather than being as optimized as possible, due to the relatively
|
|
small dataset it was designed for. In the future we may want to optimize further.
|
|
"""
|
|
|
|
import textwrap
|
|
from collections import namedtuple
|
|
|
|
|
|
class PerfectHash:
|
|
"""PerfectHash objects represent a computed perfect hash function, which
|
|
can be generated at compile time to provide highly efficient and compact
|
|
static HashTables.
|
|
|
|
Consumers must provide an intermediate table size to store the generated
|
|
hash function. Larger tables will generate more quickly, while smaller
|
|
tables will consume less space on disk."""
|
|
|
|
# 32-bit FNV offset basis and prime value.
|
|
# NOTE: Must match values in |PerfectHash.h|
|
|
FNV_OFFSET_BASIS = 0x811C9DC5
|
|
FNV_PRIME = 16777619
|
|
U32_MAX = 0xFFFFFFFF
|
|
|
|
# Bucket of entries which map into a given intermediate index.
|
|
Bucket = namedtuple("Bucket", "index entries")
|
|
|
|
def __init__(self, entries, size, validate=True, key=lambda e: e[0]):
|
|
"""Create a new PerfectHash
|
|
|
|
@param entries set of entries to generate a PHF for
|
|
@param size size of the PHF intermediate table
|
|
@param validate test the generated hash function after generation
|
|
@param key function to get 'memoryview'-compatible key for an
|
|
entry. defaults to extracting the first element of an
|
|
entry 2-tuple.
|
|
"""
|
|
|
|
assert 0 < len(entries) < self.U32_MAX, "bad # of entries!"
|
|
self._key = key
|
|
|
|
# Allocate the intermediate table and keys.
|
|
self.table = [0] * size
|
|
self.entries = [None] * len(entries)
|
|
|
|
# A set of Bucket values. Each bucket contains the index into the
|
|
# intermediate table, and a list of Entries which map into that bucket.
|
|
buckets = [self.Bucket(idx, []) for idx in range(size)]
|
|
|
|
# Determine which input strings map to which buckets in the table.
|
|
for entry in entries:
|
|
assert entry is not None, "None entry in entries"
|
|
assert self.key(entry).itemsize == 1, "non-byte key elements"
|
|
buckets[self._hash(self.key(entry)) % size].entries.append(entry)
|
|
|
|
# Sort the buckets such that the largest one comes first.
|
|
buckets.sort(key=lambda b: len(b.entries), reverse=True)
|
|
|
|
for bucket in buckets:
|
|
# Once we've reached an empty bucket, we're done.
|
|
if not bucket.entries:
|
|
break
|
|
|
|
# Try values for the basis until we find one with no conflicts.
|
|
idx = 0
|
|
basis = 1
|
|
slots = []
|
|
while idx < len(bucket.entries):
|
|
slot = self._hash(self.key(bucket.entries[idx]), basis) % len(
|
|
self.entries
|
|
)
|
|
if self.entries[slot] is not None or slot in slots:
|
|
# There was a conflict, try the next basis.
|
|
basis += 1
|
|
idx = 0
|
|
slots.clear()
|
|
assert basis < self.U32_MAX, "table too small"
|
|
else:
|
|
slots.append(slot)
|
|
idx += 1
|
|
|
|
assert basis < self.U32_MAX
|
|
|
|
# We've found a basis which doesn't conflict
|
|
self.table[bucket.index] = basis
|
|
for slot, entry in zip(slots, bucket.entries):
|
|
self.entries[slot] = entry
|
|
|
|
# Validate that looking up each key succeeds
|
|
if validate:
|
|
for entry in entries:
|
|
assert self.get_entry(self.key(entry)), "get_entry(%s)" % repr(entry)
|
|
|
|
@classmethod
|
|
def _hash(cls, key, basis=FNV_OFFSET_BASIS):
|
|
"""A basic FNV-based hash function. key is the memoryview to hash.
|
|
32-bit FNV is used for indexing into the first table, and the value
|
|
stored in that table is used as the offset basis for indexing into the
|
|
values table."""
|
|
FNV_PRIME = cls.FNV_PRIME
|
|
U32_MAX = cls.U32_MAX
|
|
for obyte in memoryview(key):
|
|
basis ^= obyte # xor-in the byte
|
|
basis *= FNV_PRIME # Multiply by the FNV prime
|
|
basis &= U32_MAX # clamp to 32-bits
|
|
return basis
|
|
|
|
def key(self, entry):
|
|
return memoryview(self._key(entry))
|
|
|
|
def get_raw_index(self, key):
|
|
"""Determine the index in self.entries without validating"""
|
|
mid = self.table[self._hash(key) % len(self.table)]
|
|
return self._hash(key, mid) % len(self.entries)
|
|
|
|
def get_index(self, key):
|
|
"""Given a key, determine the index in self.entries"""
|
|
idx = self.get_raw_index(key)
|
|
if memoryview(key) != self.key(self.entries[idx]):
|
|
return None
|
|
return idx
|
|
|
|
def get_entry(self, key):
|
|
"""Given a key, get the corresponding entry"""
|
|
idx = self.get_index(key)
|
|
if idx is None:
|
|
return None
|
|
return self.entries[idx]
|
|
|
|
@staticmethod
|
|
def _indent(text, amount=1):
|
|
return text.replace("\n", "\n" + (" " * amount))
|
|
|
|
def codegen(self, name, entry_type):
|
|
"""Create a helper for codegening PHF logic"""
|
|
return CGHelper(self, name, entry_type)
|
|
|
|
def cxx_codegen(
|
|
self,
|
|
name,
|
|
entry_type,
|
|
lower_entry,
|
|
entries_name=None,
|
|
return_type=None,
|
|
return_entry="return entry;",
|
|
key_type="const char*",
|
|
key_bytes="aKey",
|
|
key_length="strlen(aKey)",
|
|
):
|
|
"""Generate complete C++ code for a get_entry-style method.
|
|
|
|
@param name Name for the entry getter function.
|
|
@param entry_type C++ type of each entry in static memory.
|
|
@param lower_entry Function called with each entry to convert it to a
|
|
C++ literal of type 'entry_type'.
|
|
|
|
# Optional Keyword Parameters:
|
|
@param entries_name Name for the generated entry table.
|
|
|
|
@param return_type C++ return type, default: 'const entry_type&'
|
|
@param return_entry Override the default behaviour for returning the
|
|
found entry. 'entry' is a reference to the found
|
|
entry, and 'aKey' is the lookup key. 'return_entry'
|
|
can be used for additional checks, e.g. for keys
|
|
not in the table.
|
|
|
|
@param key_type C++ key type, default: 'const char*'
|
|
@param key_bytes 'const char*' expression to get bytes for 'aKey'
|
|
@param key_length 'size_t' expression to get length of 'aKey'"""
|
|
|
|
if entries_name is None:
|
|
entries_name = "s%sEntries" % name
|
|
|
|
cg = self.codegen(entries_name, entry_type)
|
|
entries = cg.gen_entries(lower_entry)
|
|
getter = cg.gen_getter(
|
|
name, return_type, return_entry, key_type, key_bytes, key_length
|
|
)
|
|
|
|
return "%s\n\n%s" % (entries, getter)
|
|
|
|
|
|
class CGHelper:
|
|
"""Helper object for generating C++ code for a PerfectHash.
|
|
Created using PerfectHash.codegen()."""
|
|
|
|
def __init__(self, phf, entries_name, entry_type):
|
|
self.phf = phf
|
|
self.entries_name = entries_name
|
|
self.entry_type = entry_type
|
|
|
|
@staticmethod
|
|
def _indent(text, amount=1):
|
|
return text.replace("\n", "\n" + (" " * amount))
|
|
|
|
def basis_ty(self):
|
|
"""Determine how big of an integer is needed for bases table."""
|
|
max_base = max(self.phf.table)
|
|
if max_base <= 0xFF:
|
|
return "uint8_t"
|
|
elif max_base <= 0xFFFF:
|
|
return "uint16_t"
|
|
return "uint32_t"
|
|
|
|
def basis_table(self, name="BASES"):
|
|
"""Generate code for a static basis table"""
|
|
bases = ""
|
|
for idx, base in enumerate(self.phf.table):
|
|
if idx and idx % 16 == 0: # 16 bases per line
|
|
bases += "\n "
|
|
bases += "%4d," % base
|
|
return (
|
|
textwrap.dedent(
|
|
"""\
|
|
static const %s %s[] = {
|
|
%s
|
|
};
|
|
"""
|
|
)
|
|
% (self.basis_ty(), name, bases)
|
|
)
|
|
|
|
def gen_entries(self, lower_entry):
|
|
"""Generate code for an entries table"""
|
|
entries = self._indent(
|
|
",\n".join(lower_entry(entry).rstrip() for entry in self.phf.entries)
|
|
)
|
|
return (
|
|
textwrap.dedent(
|
|
"""\
|
|
const %s %s[] = {
|
|
%s
|
|
};
|
|
"""
|
|
)
|
|
% (self.entry_type, self.entries_name, entries)
|
|
)
|
|
|
|
def gen_getter(
|
|
self,
|
|
name,
|
|
return_type=None,
|
|
return_entry="return entry;",
|
|
key_type="const char*",
|
|
key_bytes="aKey",
|
|
key_length="strlen(aKey)",
|
|
):
|
|
"""Generate the code for a C++ getter.
|
|
|
|
@param name Name for the entry getter function.
|
|
@param return_type C++ return type, default: 'const entry_type&'
|
|
@param return_entry Override the default behaviour for returning the
|
|
found entry. 'entry' is a reference to the found
|
|
entry, and 'aKey' is the lookup key. 'return_entry'
|
|
can be used for additional checks, e.g. for keys
|
|
not in the table.
|
|
|
|
@param key_type C++ key type, default: 'const char*'
|
|
@param key_bytes 'const char*' expression to get bytes for 'aKey'
|
|
@param key_length 'size_t' expression to get length of 'aKey'"""
|
|
|
|
if return_type is None:
|
|
return_type = "const %s&" % self.entry_type
|
|
|
|
return (
|
|
textwrap.dedent(
|
|
"""
|
|
%(return_type)s
|
|
%(name)s(%(key_type)s aKey)
|
|
{
|
|
%(basis_table)s
|
|
|
|
const char* bytes = %(key_bytes)s;
|
|
size_t length = %(key_length)s;
|
|
auto& entry = mozilla::perfecthash::Lookup(bytes, length, BASES,
|
|
%(entries_name)s);
|
|
%(return_entry)s
|
|
}
|
|
"""
|
|
)
|
|
% {
|
|
"name": name,
|
|
"basis_table": self._indent(self.basis_table()),
|
|
"entries_name": self.entries_name,
|
|
"return_type": return_type,
|
|
"return_entry": self._indent(return_entry),
|
|
"key_type": key_type,
|
|
"key_bytes": key_bytes,
|
|
"key_length": key_length,
|
|
}
|
|
)
|
|
|
|
def gen_jslinearstr_getter(
|
|
self, name, return_type=None, return_entry="return entry;"
|
|
):
|
|
"""Generate code for a specialized getter taking JSLinearStrings.
|
|
This getter avoids copying the JS string, but only supports ASCII keys.
|
|
|
|
@param name Name for the entry getter function.
|
|
@param return_type C++ return type, default: 'const entry_type&'
|
|
@param return_entry Override the default behaviour for returning the
|
|
found entry. 'entry' is a reference to the found
|
|
entry, and 'aKey' is the lookup key. 'return_entry'
|
|
can be used for additional checks, e.g. for keys
|
|
not in the table."""
|
|
|
|
assert all(
|
|
b <= 0x7F for e in self.phf.entries for b in self.phf.key(e)
|
|
), "non-ASCII key"
|
|
|
|
if return_type is None:
|
|
return_type = "const %s&" % self.entry_type
|
|
|
|
return (
|
|
textwrap.dedent(
|
|
"""
|
|
%(return_type)s
|
|
%(name)s(JSLinearString* aKey)
|
|
{
|
|
%(basis_table)s
|
|
|
|
size_t length = JS::GetLinearStringLength(aKey);
|
|
|
|
JS::AutoCheckCannotGC nogc;
|
|
if (JS::LinearStringHasLatin1Chars(aKey)) {
|
|
auto& entry = mozilla::perfecthash::Lookup(
|
|
JS::GetLatin1LinearStringChars(nogc, aKey),
|
|
length, BASES, %(entries_name)s);
|
|
|
|
%(return_entry)s
|
|
} else {
|
|
auto& entry = mozilla::perfecthash::Lookup(
|
|
JS::GetTwoByteLinearStringChars(nogc, aKey),
|
|
length, BASES, %(entries_name)s);
|
|
|
|
%(return_entry)s
|
|
}
|
|
}
|
|
"""
|
|
)
|
|
% {
|
|
"name": name,
|
|
"basis_table": self._indent(self.basis_table()),
|
|
"entries_name": self.entries_name,
|
|
"return_type": return_type,
|
|
"return_entry": self._indent(return_entry, 2),
|
|
}
|
|
)
|