diff options
Diffstat (limited to 'triehash')
-rw-r--r-- | triehash/.travis.yml | 14 | ||||
-rw-r--r-- | triehash/LICENSE.md | 17 | ||||
-rw-r--r-- | triehash/README.md | 58 | ||||
-rw-r--r-- | triehash/tests/framework.sh | 84 | ||||
-rwxr-xr-x | triehash/tests/run-tests.sh | 22 | ||||
-rwxr-xr-x | triehash/tests/test-basic | 245 | ||||
-rwxr-xr-x | triehash/tests/test-case-insensitive | 109 | ||||
-rwxr-xr-x | triehash/tests/test-enum-include-name-flags | 129 | ||||
-rwxr-xr-x | triehash/tests/test-multi-byte-level | 427 | ||||
-rwxr-xr-x | triehash/triehash.pl | 691 |
10 files changed, 1796 insertions, 0 deletions
diff --git a/triehash/.travis.yml b/triehash/.travis.yml new file mode 100644 index 0000000..c66ea96 --- /dev/null +++ b/triehash/.travis.yml @@ -0,0 +1,14 @@ +language: perl +perl: + - "5.24" + - "5.22" + - "5.20" + - "5.18" + - "5.16" + - "5.14" +install: + - cpanm --quiet --notest --skip-satisfied Devel::Cover Devel::Cover::Report::Codecov +script: + - ./tests/run-tests.sh +after_script: + - cover -report codecov diff --git a/triehash/LICENSE.md b/triehash/LICENSE.md new file mode 100644 index 0000000..89de354 --- /dev/null +++ b/triehash/LICENSE.md @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/triehash/README.md b/triehash/README.md new file mode 100644 index 0000000..d6f6e7f --- /dev/null +++ b/triehash/README.md @@ -0,0 +1,58 @@ +# Order-preserving minimal perfect hash function generator + +Build order-preserving minimal perfect hash functions. + +[![codecov](https://codecov.io/gh/julian-klode/triehash/branch/master/graph/badge.svg)](https://codecov.io/gh/julian-klode/triehash) +[![Build Status](https://travis-ci.org/julian-klode/triehash.svg?branch=master)](https://travis-ci.org/julian-klode/triehash) + +## Performance + +Performance was evaluated against other hash functions. As an input set, the +fields of Debian Packages and Sources files was used, and each hash function +was run 1,000,000 times for each word. The byte count of the words were then +summed up and divided by the total number of nanoseconds each function ran, so +all speeds below are given in bytes per nanosecond, AKA gigabyte per second. + +arch/function|jak-x230 (amd64)|backup (amd64)|asachi.d.o (arm64)|asachi.d.o (armel)|asachi.d.o (armhf)|plummer.d.o (ppc64el)|eller.d.o (mipsel) +-------------|----------------|--------------|------------------|------------------|------------------|---------------------|------------------ +Trie | 2.4 | 1.9 | 1.2 | 0.9 | 0.8 | 2.0 | 0.2 +Trie (*) | 2.2 | 1.7 | 0.8 | 0.7 | 0.7 | 1.8 | 0.2 +re2c | 1.7 | 1.3 | 0.9 | 0.9 | 0.7 | 1.6 | 0.2 +re2c (*) | 1.2 | 0.9 | 0.6 | 0.6 | 0.5 | 1.1 | 0.1 +gperf (*) | 0.7 | 0.5 | 0.2 | 0.2 | 0.2 | 0.5 | 0.1 +gperf | 1.3 | 0.9 | 0.3 | 0.3 | 0.2 | 0.4 | 0.1 +djb (*) | 0.7 | 0.5 | 0.3 | 0.3 | 0.3 | 0.5 | 0.1 +djb (**) | 1.0 | 0.7 | 0.4 | 0.5 | 0.5 | 0.6 | 0.2 +djb | 0.9 | 0.7 | 0.5 | 0.5 | 0.5 | 0.7 | 0.2 +apt (*) | 1.2 | 0.9 | 0.7 | 0.7 | 0.7 | 1.1 | 0.2 +apt (**) | 2.3 | 1.7 | 0.7 | 0.9 | 0.8 | 1.9 | 0.2 + +And transposed: + +function/arch |Trie |Trie (*) |re2c |re2c (*) |gperf (*)|gperf |djb (*) |djb (**) |djb |apt (*) |apt (**) +---------------------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|--------- +jak-x230 (amd64) | 2.4| 2.2| 1.7| 1.2| 0.7| 1.3| 0.7| 1.0| 0.9| 1.2| 2.3 +backup (amd64) | 1.9| 1.7| 1.3| 0.9| 0.5| 0.9| 0.5| 0.7| 0.7| 0.9| 1.7 +asachi.d.o (arm64) | 1.2| 0.8| 0.9| 0.6| 0.2| 0.3| 0.3| 0.4| 0.5| 0.7| 0.7 +asachi.d.o (armel) | 0.9| 0.7| 0.9| 0.6| 0.2| 0.3| 0.3| 0.5| 0.5| 0.7| 0.9 +asachi.d.o (armhf) | 0.8| 0.7| 0.7| 0.5| 0.2| 0.2| 0.3| 0.5| 0.5| 0.7| 0.8 +plummer.d.o (ppc64el)| 2.0| 1.8| 1.6| 1.1| 0.5| 0.4| 0.5| 0.6| 0.7| 1.1| 1.9 +eller.d.o (mipsel) | 0.2| 0.2| 0.2| 0.1| 0.1| 0.1| 0.1| 0.2| 0.2| 0.2| 0.2 + + +Legend: + +* The (*) variants are case-insensitive, (**) are more optimised versions + of the (*) versions. +* DJB (*) is a DJB Hash with naive lowercase conversion, DJB (**) just ORs one + bit into each value to get alphabetical characters to be lowercase +* APT (*) is the AlphaHash function from APT which hashes the last 8 bytes in a + word in a case-insensitive manner. APT (**) is the same function unrolled. +* All hosts except the x230 are Debian porterboxes. The x230 has a Core i5-3320M, + barriere has an Opteron 23xx. + +Notes: + +* The overhead is larger than needed on some platforms due to gcc inserting + unneeded zero extend instructions, see: + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77729 diff --git a/triehash/tests/framework.sh b/triehash/tests/framework.sh new file mode 100644 index 0000000..51d4580 --- /dev/null +++ b/triehash/tests/framework.sh @@ -0,0 +1,84 @@ +#!/bin/sh +# Simple integration test framework + +set -e + + +cleanup() { + rm -f test.output test.c test.h test.tree +} + +dumpone() { + if [ -e "$@" ]; then + echo "Content of $@:" + cat "$@" | sed "s#^#\t#g" + fi + +} + +dump() { + dumpone test.output + dumpone test.c + dumpone test.h + dumpone test.tree + return 1 +} + +testsuccess() { + [ "$INNER" ] || cleanup + [ "$INNER" ] || echo "Testing success of $@" + if ! "$@" > test.output 2>&1; then + echo "ERROR: Running $@ failed with error $?, messages were:" >&2 + dump + return 1 + fi +} + +testfailure() { + [ "$INNER" ] || cleanup + [ "$INNER" ] || echo "Testing failure of $@" + if "$@" > test.output 2>&1; then + echo "ERROR: Running $@ unexpectedly succeeded, messages were:" >&2 + dump + return 1 + fi +} + +testfileequal() { + [ "$INNER" ] || echo "Testing output of $2" + printf "%b\n" "$1" > expected + if ! diff -u "expected" "$2" > test.diff; then + echo "ERROR: Differences between expected output and and $2:" >&2 + cat test.diff | sed "s#^#\t#g" + dump + return 1 + fi +} + +testgrep() { + [ "$INNER" ] || echo "Testing grep $@" + INNER=1 testsuccess grep "$@" + unset INNER +} + +testsuccessequal() { + expect="$1" + shift + cleanup + echo "Testing success and output of $@" + INNER=1 testsuccess "$@" + INNER=1 testfileequal "$expect" test.output + unset INNER +} + + +WORDS="Word-_0 +Word = 42 +VeryLongWord +Label ~ Word2 += -9" + +triehash() { + printf "%b\n" "$WORDS" | perl -MDevel::Cover=-summary,0,-silent,1 $(dirname $(dirname $(readlink -f $0)))/triehash.pl "$@" || return $? + return $? +} diff --git a/triehash/tests/run-tests.sh b/triehash/tests/run-tests.sh new file mode 100755 index 0000000..b9c1ec3 --- /dev/null +++ b/triehash/tests/run-tests.sh @@ -0,0 +1,22 @@ +#!/bin/sh +DIR=$(dirname $(readlink -f $0)) + +# Let's go into triehash.pl's dir +cd $(dirname "$DIR") + +rm -rf cover_db + +count=$(cd "$DIR" && echo test-* | wc -w) +i=1 + +for test in $DIR/test-*; do + echo "[$i/$count] Running testcase $test" + if ! $test > test.summary 2>&1; then + cat test.summary + exit 1 + fi + i=$((i + 1)) +done + + +cover diff --git a/triehash/tests/test-basic b/triehash/tests/test-basic new file mode 100755 index 0000000..19cb086 --- /dev/null +++ b/triehash/tests/test-basic @@ -0,0 +1,245 @@ +#!/bin/sh +. $(dirname $(readlink -f $0))/framework.sh + +# Check for non-existing files +testfailure triehash -C /does/not/exist1 -H /does/not/exist1 /does/not/exist/input + +# Check that we can specify - for -C and -H +testsuccessequal "#ifndef TRIE_HASH_PerfectHash +#define TRIE_HASH_PerfectHash +#include <stddef.h> +#include <stdint.h> +enum PerfectKey { + Unknown = -1, +}; +static enum PerfectKey PerfectHash(const char *string, size_t length); +static enum PerfectKey PerfectHash(const char *string, size_t length) +{ + switch (length) { + default: + return Unknown; + } +} +#endif /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 -C - -H - + +# Check that split files work +testsuccess triehash -C test.c -H test.h --multi-byte=0 +testfileequal "#include \"test.h\" + enum PerfectKey PerfectHash(const char *string, size_t length) +{ + switch (length) { + default: + return Unknown; + } +}" test.c +testfileequal "#ifndef TRIE_HASH_PerfectHash +#define TRIE_HASH_PerfectHash +#include <stddef.h> +#include <stdint.h> +enum PerfectKey { + Unknown = -1, +}; + enum PerfectKey PerfectHash(const char *string, size_t length); +#endif /* TRIE_HASH_PerfectHash */" test.h + + +# Check the C code generator +testsuccess triehash -C test.c -H test.c /dev/stdin +testfileequal "#ifndef TRIE_HASH_PerfectHash +#define TRIE_HASH_PerfectHash +#include <stddef.h> +#include <stdint.h> +enum PerfectKey { + VeryLongWord = 43, + Word = 42, + Word___0 = 0, + Label = 44, + Unknown = -9, +}; +static enum PerfectKey PerfectHash(const char *string, size_t length); +#ifdef __GNUC__ +typedef uint16_t __attribute__((aligned (1))) triehash_uu16; +typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1]; +typedef uint32_t __attribute__((aligned (1))) triehash_uu32; +typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1]; +typedef uint64_t __attribute__((aligned (1))) triehash_uu64; +typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1]; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define onechar(c, s, l) (((uint64_t)(c)) << (s)) +#else +#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s)) +#endif +#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE) +#define TRIE_HASH_MULTI_BYTE +#endif +#endif /*GNUC */ +#ifdef TRIE_HASH_MULTI_BYTE +static enum PerfectKey PerfectHash4(const char *string) +{ + switch(*((triehash_uu32*) &string[0])) { + case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32): + return Word; + } + return Unknown; +} +static enum PerfectKey PerfectHash5(const char *string) +{ + switch(*((triehash_uu32*) &string[0])) { + case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32): + switch(string[4]) { + case 0| onechar('2', 0, 8): + return Label; + } + } + return Unknown; +} +static enum PerfectKey PerfectHash7(const char *string) +{ + switch(*((triehash_uu32*) &string[0])) { + case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32): + switch(string[4]) { + case 0| onechar('-', 0, 8): + switch(string[5]) { + case 0| onechar('_', 0, 8): + switch(string[6]) { + case 0| onechar('0', 0, 8): + return Word___0; + } + } + } + } + return Unknown; +} +static enum PerfectKey PerfectHash12(const char *string) +{ + switch(*((triehash_uu64*) &string[0])) { + case 0| onechar('V', 0, 64)| onechar('e', 8, 64)| onechar('r', 16, 64)| onechar('y', 24, 64)| onechar('L', 32, 64)| onechar('o', 40, 64)| onechar('n', 48, 64)| onechar('g', 56, 64): + switch(*((triehash_uu32*) &string[8])) { + case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32): + return VeryLongWord; + } + } + return Unknown; +} +#else +static enum PerfectKey PerfectHash4(const char *string) +{ + switch(string[0]) { + case 'W': + switch(string[1]) { + case 'o': + switch(string[2]) { + case 'r': + switch(string[3]) { + case 'd': + return Word; + } + } + } + } + return Unknown; +} +static enum PerfectKey PerfectHash5(const char *string) +{ + switch(string[0]) { + case 'W': + switch(string[1]) { + case 'o': + switch(string[2]) { + case 'r': + switch(string[3]) { + case 'd': + switch(string[4]) { + case '2': + return Label; + } + } + } + } + } + return Unknown; +} +static enum PerfectKey PerfectHash7(const char *string) +{ + switch(string[0]) { + case 'W': + switch(string[1]) { + case 'o': + switch(string[2]) { + case 'r': + switch(string[3]) { + case 'd': + switch(string[4]) { + case '-': + switch(string[5]) { + case '_': + switch(string[6]) { + case '0': + return Word___0; + } + } + } + } + } + } + } + return Unknown; +} +static enum PerfectKey PerfectHash12(const char *string) +{ + switch(string[0]) { + case 'V': + switch(string[1]) { + case 'e': + switch(string[2]) { + case 'r': + switch(string[3]) { + case 'y': + switch(string[4]) { + case 'L': + switch(string[5]) { + case 'o': + switch(string[6]) { + case 'n': + switch(string[7]) { + case 'g': + switch(string[8]) { + case 'W': + switch(string[9]) { + case 'o': + switch(string[10]) { + case 'r': + switch(string[11]) { + case 'd': + return VeryLongWord; + } + } + } + } + } + } + } + } + } + } + } + } + return Unknown; +} +#endif /* TRIE_HASH_MULTI_BYTE */ +static enum PerfectKey PerfectHash(const char *string, size_t length) +{ + switch (length) { + case 4: + return PerfectHash4(string); + case 5: + return PerfectHash5(string); + case 7: + return PerfectHash7(string); + case 12: + return PerfectHash12(string); + default: + return Unknown; + } +} +#endif /* TRIE_HASH_PerfectHash */" test.c diff --git a/triehash/tests/test-case-insensitive b/triehash/tests/test-case-insensitive new file mode 100755 index 0000000..25ab2dd --- /dev/null +++ b/triehash/tests/test-case-insensitive @@ -0,0 +1,109 @@ +#!/bin/sh +. $(dirname $(readlink -f $0))/framework.sh + +WORDS="Halllo\nH-lllo\nHalll1" + +# Case-insensitive test +testsuccessequal "#include \"/dev/null\" +#ifdef __GNUC__ +typedef uint16_t __attribute__((aligned (1))) triehash_uu16; +typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1]; +typedef uint32_t __attribute__((aligned (1))) triehash_uu32; +typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1]; +typedef uint64_t __attribute__((aligned (1))) triehash_uu64; +typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1]; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define onechar(c, s, l) (((uint64_t)(c)) << (s)) +#else +#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s)) +#endif +#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE) +#define TRIE_HASH_MULTI_BYTE +#endif +#endif /*GNUC */ +#ifdef TRIE_HASH_MULTI_BYTE +static enum PerfectKey PerfectHash6(const char *string) +{ + switch(string[0] | 0x20) { + case 0| onechar('h', 0, 8): + switch(string[1]) { + case 0| onechar('-', 0, 8): + switch(*((triehash_uu32*) &string[2]) | 0x20202020) { + case 0| onechar('l', 0, 32)| onechar('l', 8, 32)| onechar('l', 16, 32)| onechar('o', 24, 32): + return H_lllo; + } + break; + case 0| onechar('a', 0, 8): + case 0| onechar('A', 0, 8): + switch(*((triehash_uu16*) &string[2]) | 0x2020) { + case 0| onechar('l', 0, 16)| onechar('l', 8, 16): + switch(string[4] | 0x20) { + case 0| onechar('l', 0, 8): + switch(string[5]) { + case 0| onechar('1', 0, 8): + return Halll1; + break; + case 0| onechar('o', 0, 8): + case 0| onechar('O', 0, 8): + return Halllo; + } + } + } + } + } + return Unknown; +} +#else +static enum PerfectKey PerfectHash6(const char *string) +{ + switch(string[0] | 0x20) { + case 'h': + switch(string[1]) { + case '-': + switch(string[2] | 0x20) { + case 'l': + switch(string[3] | 0x20) { + case 'l': + switch(string[4] | 0x20) { + case 'l': + switch(string[5] | 0x20) { + case 'o': + return H_lllo; + } + } + } + } + break; + case 'a': + case 'A': + switch(string[2] | 0x20) { + case 'l': + switch(string[3] | 0x20) { + case 'l': + switch(string[4] | 0x20) { + case 'l': + switch(string[5]) { + case '1': + return Halll1; + break; + case 'o': + case 'O': + return Halllo; + } + } + } + } + } + } + return Unknown; +} +#endif /* TRIE_HASH_MULTI_BYTE */ + enum PerfectKey PerfectHash(const char *string, size_t length) +{ + switch (length) { + case 6: + return PerfectHash6(string); + default: + return Unknown; + } +}" triehash --multi-byte=3210 --ignore-case -H /dev/null /dev/stdin diff --git a/triehash/tests/test-enum-include-name-flags b/triehash/tests/test-enum-include-name-flags new file mode 100755 index 0000000..33bd97c --- /dev/null +++ b/triehash/tests/test-enum-include-name-flags @@ -0,0 +1,129 @@ +#!/bin/sh +. $(dirname $(readlink -f $0))/framework.sh + +# Need a short word, we really just need to check if the labels work +WORDS=w + +testsuccessequal "\ +#ifndef TRIE_HASH_PerfectHash +#define TRIE_HASH_PerfectHash +#include <stddef.h> +#include <stdint.h> +#include <foo.h> +enum PerfectKey { + w = 0, + Unknown = -1, +}; + enum PerfectKey PerfectHash(const char *string, size_t length); +#endif /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 -C /dev/null --include="<foo.h>" /dev/stdin + +# Check for --enum-class support +testsuccessequal "\ +#ifndef TRIE_HASH_PerfectHash +#define TRIE_HASH_PerfectHash +#include <stddef.h> +#include <stdint.h> +enum class PerfectKey { + w = 0, + Unknown = -1, +}; +static enum PerfectKey PerfectHash(const char *string, size_t length); +static enum PerfectKey PerfectHash1(const char *string) +{ + switch(string[0]) { + case 'w': + return PerfectKey::w; + } + return PerfectKey::Unknown; +} +static enum PerfectKey PerfectHash(const char *string, size_t length) +{ + switch (length) { + case 1: + return PerfectHash1(string); + default: + return PerfectKey::Unknown; + } +} +#endif /* TRIE_HASH_PerfectHash */" triehash --multi-byte=0 --enum-class /dev/stdin + +# Check for --enum-name support +testsuccessequal "\ +#ifndef TRIE_HASH_PerfectHash +#define TRIE_HASH_PerfectHash +#include <stddef.h> +#include <stdint.h> +enum Foo { + Unknown = -1, +}; +static enum Foo PerfectHash(const char *string, size_t length); +static enum Foo PerfectHash(const char *string, size_t length) +{ + switch (length) { + default: + return Unknown; + } +} +#endif /* TRIE_HASH_PerfectHash */\ +" triehash --multi-byte=0 --enum-name="Foo" + +# Check for --enum-class support +testsuccessequal "\ +#ifndef TRIE_HASH_PerfectHash +#define TRIE_HASH_PerfectHash +#include <stddef.h> +#include <stdint.h> +enum class Foo::Bar { + Unknown = -1, +}; +static enum Foo::Bar PerfectHash(const char *string, size_t length); +static enum Foo::Bar PerfectHash(const char *string, size_t length) +{ + switch (length) { + default: + return Foo::Bar::Unknown; + } +} +#endif /* TRIE_HASH_PerfectHash */\ +" triehash --multi-byte=0 --enum-class --enum-name="Foo::Bar" + +# Check for --function-name support +testsuccessequal "\ +#ifndef TRIE_HASH_NonSense +#define TRIE_HASH_NonSense +#include <stddef.h> +#include <stdint.h> +enum PerfectKey { + Unknown = -1, +}; +static enum PerfectKey NonSense(const char *string, size_t length); +static enum PerfectKey NonSense(const char *string, size_t length) +{ + switch (length) { + default: + return Unknown; + } +} +#endif /* TRIE_HASH_NonSense */\ +" triehash --multi-byte=0 --function-name="NonSense" + +# Check for --counter-name support +testsuccessequal "\ +#ifndef TRIE_HASH_PerfectHash +#define TRIE_HASH_PerfectHash +#include <stddef.h> +#include <stdint.h> +enum { MyCounter = 0 }; +enum PerfectKey { + Unknown = -1, +}; +static enum PerfectKey PerfectHash(const char *string, size_t length); +static enum PerfectKey PerfectHash(const char *string, size_t length) +{ + switch (length) { + default: + return Unknown; + } +} +#endif /* TRIE_HASH_PerfectHash */\ +" triehash --multi-byte=0 --counter-name="MyCounter" diff --git a/triehash/tests/test-multi-byte-level b/triehash/tests/test-multi-byte-level new file mode 100755 index 0000000..ddfb8cd --- /dev/null +++ b/triehash/tests/test-multi-byte-level @@ -0,0 +1,427 @@ +#!/bin/sh +. $(dirname $(readlink -f $0))/framework.sh + +# Check that building a single-byte trie works +testsuccessequal "\ +┌────────────────────────────────────────────────────┐ +│ Initial trie │ +└────────────────────────────────────────────────────┘ + +├── V +│ ├── e +│ │ ├── r +│ │ │ ├── y +│ │ │ │ ├── L +│ │ │ │ │ ├── o +│ │ │ │ │ │ ├── n +│ │ │ │ │ │ │ ├── g +│ │ │ │ │ │ │ │ ├── W +│ │ │ │ │ │ │ │ │ ├── o +│ │ │ │ │ │ │ │ │ │ ├── r +│ │ │ │ │ │ │ │ │ │ │ ├── d → VeryLongWord +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d → Word +│ │ │ │ ├── - +│ │ │ │ │ ├── _ +│ │ │ │ │ │ ├── 0 → Word-_0 +│ │ │ │ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Rebuilt trie │ +└────────────────────────────────────────────────────┘ + +├── V +│ ├── e +│ │ ├── r +│ │ │ ├── y +│ │ │ │ ├── L +│ │ │ │ │ ├── o +│ │ │ │ │ │ ├── n +│ │ │ │ │ │ │ ├── g +│ │ │ │ │ │ │ │ ├── W +│ │ │ │ │ │ │ │ │ ├── o +│ │ │ │ │ │ │ │ │ │ ├── r +│ │ │ │ │ │ │ │ │ │ │ ├── d → VeryLongWord +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d → Word +│ │ │ │ ├── - +│ │ │ │ │ ├── _ +│ │ │ │ │ │ ├── 0 → Word-_0 +│ │ │ │ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 4 │ +└────────────────────────────────────────────────────┘ + +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d → Word +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 5 │ +└────────────────────────────────────────────────────┘ + +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d +│ │ │ │ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 7 │ +└────────────────────────────────────────────────────┘ + +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d +│ │ │ │ ├── - +│ │ │ │ │ ├── _ +│ │ │ │ │ │ ├── 0 → Word-_0 +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 12 │ +└────────────────────────────────────────────────────┘ + +├── V +│ ├── e +│ │ ├── r +│ │ │ ├── y +│ │ │ │ ├── L +│ │ │ │ │ ├── o +│ │ │ │ │ │ ├── n +│ │ │ │ │ │ │ ├── g +│ │ │ │ │ │ │ │ ├── W +│ │ │ │ │ │ │ │ │ ├── o +│ │ │ │ │ │ │ │ │ │ ├── r +│ │ │ │ │ │ │ │ │ │ │ ├── d → VeryLongWord" triehash --multi-byte=0 -l tree /dev/stdin + +# Two byte optimization +testsuccessequal "\ +┌────────────────────────────────────────────────────┐ +│ Initial trie │ +└────────────────────────────────────────────────────┘ + +├── Ve +│ ├── ry +│ │ ├── Lo +│ │ │ ├── ng +│ │ │ │ ├── Wo +│ │ │ │ │ ├── rd → VeryLongWord +├── Wo +│ ├── rd → Word +│ │ ├── -_ +│ │ │ ├── 0 → Word-_0 +│ │ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Rebuilt trie │ +└────────────────────────────────────────────────────┘ + +├── Ve +│ ├── ry +│ │ ├── Lo +│ │ │ ├── ng +│ │ │ │ ├── Wo +│ │ │ │ │ ├── rd → VeryLongWord +├── Wo +│ ├── rd → Word +│ │ ├── - +│ │ │ ├── _0 → Word-_0 +│ │ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 4 │ +└────────────────────────────────────────────────────┘ + +├── Wo +│ ├── rd → Word +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 5 │ +└────────────────────────────────────────────────────┘ + +├── Wo +│ ├── rd +│ │ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 7 │ +└────────────────────────────────────────────────────┘ + +├── Wo +│ ├── rd +│ │ ├── -_ +│ │ │ ├── 0 → Word-_0 +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 12 │ +└────────────────────────────────────────────────────┘ + +├── Ve +│ ├── ry +│ │ ├── Lo +│ │ │ ├── ng +│ │ │ │ ├── Wo +│ │ │ │ │ ├── rd → VeryLongWord" triehash --multi-byte=1 -l tree /dev/stdin +# Four byte optimization +testsuccessequal "\ +┌────────────────────────────────────────────────────┐ +│ Initial trie │ +└────────────────────────────────────────────────────┘ + +├── Very +│ ├── Long +│ │ ├── Word → VeryLongWord +├── Word → Word +│ ├── - +│ │ ├── _ +│ │ │ ├── 0 → Word-_0 +│ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Rebuilt trie │ +└────────────────────────────────────────────────────┘ + +├── Very +│ ├── Long +│ │ ├── Word → VeryLongWord +├── Word → Word +│ ├── - +│ │ ├── _ +│ │ │ ├── 0 → Word-_0 +│ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 4 │ +└────────────────────────────────────────────────────┘ + +├── Word → Word +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 5 │ +└────────────────────────────────────────────────────┘ + +├── Word +│ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 7 │ +└────────────────────────────────────────────────────┘ + +├── Word +│ ├── - +│ │ ├── _ +│ │ │ ├── 0 → Word-_0 +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 12 │ +└────────────────────────────────────────────────────┘ + +├── Very +│ ├── Long +│ │ ├── Word → VeryLongWord" triehash --multi-byte=2 -l tree /dev/stdin +# Eigh byte optimization +testsuccessequal "\ +┌────────────────────────────────────────────────────┐ +│ Initial trie │ +└────────────────────────────────────────────────────┘ + +├── VeryLong +│ ├── W +│ │ ├── o +│ │ │ ├── r +│ │ │ │ ├── d → VeryLongWord +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d → Word +│ │ │ │ ├── - +│ │ │ │ │ ├── _ +│ │ │ │ │ │ ├── 0 → Word-_0 +│ │ │ │ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Rebuilt trie │ +└────────────────────────────────────────────────────┘ + +├── V +│ ├── eryLongW +│ │ ├── o +│ │ │ ├── r +│ │ │ │ ├── d → VeryLongWord +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d → Word +│ │ │ │ ├── - +│ │ │ │ │ ├── _ +│ │ │ │ │ │ ├── 0 → Word-_0 +│ │ │ │ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 4 │ +└────────────────────────────────────────────────────┘ + +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d → Word +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 5 │ +└────────────────────────────────────────────────────┘ + +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d +│ │ │ │ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 7 │ +└────────────────────────────────────────────────────┘ + +├── W +│ ├── o +│ │ ├── r +│ │ │ ├── d +│ │ │ │ ├── - +│ │ │ │ │ ├── _ +│ │ │ │ │ │ ├── 0 → Word-_0 +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 12 │ +└────────────────────────────────────────────────────┘ + +├── VeryLong +│ ├── W +│ │ ├── o +│ │ │ ├── r +│ │ │ │ ├── d → VeryLongWord" triehash --multi-byte=3 -l tree /dev/stdin + + +# Check that building a multi-byte trie works +testsuccessequal "\ +┌────────────────────────────────────────────────────┐ +│ Initial trie │ +└────────────────────────────────────────────────────┘ + +├── VeryLong +│ ├── Word → VeryLongWord +├── Word → Word +│ ├── - +│ │ ├── _ +│ │ │ ├── 0 → Word-_0 +│ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Rebuilt trie │ +└────────────────────────────────────────────────────┘ + +├── Very +│ ├── LongWord → VeryLongWord +├── Word → Word +│ ├── - +│ │ ├── _ +│ │ │ ├── 0 → Word-_0 +│ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 4 │ +└────────────────────────────────────────────────────┘ + +├── Word → Word +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 5 │ +└────────────────────────────────────────────────────┘ + +├── Word +│ ├── 2 → Label +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 7 │ +└────────────────────────────────────────────────────┘ + +├── Word +│ ├── - +│ │ ├── _ +│ │ │ ├── 0 → Word-_0 +┌────────────────────────────────────────────────────┐ +│ Trie for words of length 12 │ +└────────────────────────────────────────────────────┘ + +├── VeryLong +│ ├── Word → VeryLongWord" triehash -l tree /dev/stdin + + +###### CHANGE THE WORDS FOR THE FOLLOWING TESTS ####### +WORDS="Word" + +# Check that we are generating the proper multi-byte and fallback sessions +testsuccessequal "#include \"/dev/null\" +#ifdef __GNUC__ +typedef uint16_t __attribute__((aligned (1))) triehash_uu16; +typedef char static_assert16[__alignof__(triehash_uu16) == 1 ? 1 : -1]; +typedef uint32_t __attribute__((aligned (1))) triehash_uu32; +typedef char static_assert32[__alignof__(triehash_uu32) == 1 ? 1 : -1]; +typedef uint64_t __attribute__((aligned (1))) triehash_uu64; +typedef char static_assert64[__alignof__(triehash_uu64) == 1 ? 1 : -1]; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define onechar(c, s, l) (((uint64_t)(c)) << (s)) +#else +#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s)) +#endif +#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE) +#define TRIE_HASH_MULTI_BYTE +#endif +#endif /*GNUC */ +#ifdef TRIE_HASH_MULTI_BYTE +static enum PerfectKey PerfectHash4(const char *string) +{ + switch(*((triehash_uu32*) &string[0])) { + case 0| onechar('W', 0, 32)| onechar('o', 8, 32)| onechar('r', 16, 32)| onechar('d', 24, 32): + return Word; + } + return Unknown; +} +#else +static enum PerfectKey PerfectHash4(const char *string) +{ + switch(string[0]) { + case 'W': + switch(string[1]) { + case 'o': + switch(string[2]) { + case 'r': + switch(string[3]) { + case 'd': + return Word; + } + } + } + } + return Unknown; +} +#endif /* TRIE_HASH_MULTI_BYTE */ + enum PerfectKey PerfectHash(const char *string, size_t length) +{ + switch (length) { + case 4: + return PerfectHash4(string); + default: + return Unknown; + } +}" triehash -H /dev/null /dev/stdin + + +# Check that we are generating no multi-byte session +testsuccessequal "#include \"/dev/null\" +static enum PerfectKey PerfectHash4(const char *string) +{ + switch(string[0]) { + case 'W': + switch(string[1]) { + case 'o': + switch(string[2]) { + case 'r': + switch(string[3]) { + case 'd': + return Word; + } + } + } + } + return Unknown; +} + enum PerfectKey PerfectHash(const char *string, size_t length) +{ + switch (length) { + case 4: + return PerfectHash4(string); + default: + return Unknown; + } +}" triehash --multi-byte=0 -H /dev/null /dev/stdin diff --git a/triehash/triehash.pl b/triehash/triehash.pl new file mode 100755 index 0000000..841c0e7 --- /dev/null +++ b/triehash/triehash.pl @@ -0,0 +1,691 @@ +#!/usr/bin/perl -w +# +# Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org> +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + + +=head1 NAME + +triehash - Generate a perfect hash function derived from a trie. + +=cut + +use strict; +use warnings; +use Getopt::Long; + +=head1 SYNOPSIS + +B<triehash> [S<I<option>>] [S<I<input file>>] + +=head1 DESCRIPTION + +triehash takes a list of words in input file and generates a function and +an enumeration to describe the word + +=head1 INPUT FILE FORMAT + +The file consists of multiple lines of the form: + + [label ~ ] word [= value] + +This maps word to value, and generates an enumeration with entries of the form: + + label = value + +If I<label> is undefined, the word will be used, the minus character will be +replaced by an underscore. If value is undefined it is counted upwards from +the last value. + +There may also be one line of the format + + [ label ~] = value + +Which defines the value to be used for non-existing keys. Note that this also +changes default value for other keys, as for normal entries. So if you place + + = 0 + +at the beginning of the file, unknown strings map to 0, and the other strings +map to values starting with 1. If label is not specified, the default is +I<Unknown>. + +=head1 OPTIONS + +=over 4 + +=item B<-C>I<.c file> B<--code>=I<.c file> + +Generate code in the given file. + +=item B<-H>I<header file> B<--header>=I<header file> + +Generate a header in the given file, containing a declaration of the hash +function and an enumeration. + +=item B<--enum-name=>I<word> + +The name of the enumeration. + +=item B<--function-name=>I<word> + +The name of the function. + +=item B<--namespace=>I<name> + +Put the function and enum into a namespace (C++) + +=item B<--class=>I<name> + +Put the function and enum into a class (C++) + +=item B<--enum-class> + +Generate an enum class instead of an enum (C++) + +=item B<--counter-name=>I<name> + +Use I<name> for a counter that is set to the latest entry in the enumeration ++ 1. This can be useful for defining array sizes. + +=item B<--extern-c> + +Wrap everything into an extern "C" block. Not compatible with the C++ +options, as a header with namespaces, classes, or enum classes is not +valid C. + +=item B<--multi-byte>=I<value> + +Generate code reading multiple bytes at once. The value is a string of power +of twos to enable. The default value is 320 meaning that 8, 4, and single byte +reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you +also want to allow 2-byte reads. 2-byte reads are disabled by default because +they negatively affect performance on older Intel architectures. + +This generates code for both multiple bytes and single byte reads, but only +enables the multiple byte reads of GNU C compatible compilers, as the following +extensions are used: + +=over 8 + +=item Byte-aligned integers + +We must be able to generate integers that are aligned to a single byte using: + + typedef uint64_t __attribute__((aligned (1))) triehash_uu64; + +=item Byte-order + +The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined. + +=back + +We forcefully disable multi-byte reads on platforms where the variable +I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined, +as there is a measurable overhead from emulating the unaligned reads on +ARM. + +=item B<--language=>I<language> + +Generate a file in the specified language. Currently known are 'C' and 'tree', +the latter generating a tree. + +=item B<--include=>I<header> + +Add the header to the include statements of the header file. The value must +be surrounded by quotes or angle brackets for C code. May be specified multiple +times. + +=back + +=cut + +my $unknown = -1; +my $unknown_label = "Unknown"; +my $counter_start = 0; +my $enum_name = "PerfectKey"; +my $function_name = "PerfectHash"; +my $enum_class = 0; + +my $code_name = "-"; +my $header_name = "-"; +my $code; +my $header; +my $ignore_case = 0; +my $multi_byte = "320"; +my $language = 'C'; +my $counter_name = undef; +my @includes = (); + + +Getopt::Long::config('default', + 'bundling', + 'no_getopt_compat', + 'no_auto_abbrev', + 'permute', + 'auto_help'); + +GetOptions ("code|C=s" => \$code_name, + "header|H=s" => \$header_name, + "function-name=s" => \$function_name, + "ignore-case" => \$ignore_case, + "enum-name=s" => \$enum_name, + "language|l=s" => \$language, + "multi-byte=s" => \$multi_byte, + "enum-class" => \$enum_class, + "include=s" => \@includes, + "counter-name=s" => \$counter_name) + or die("Could not parse options!"); + + +# This implements a simple trie. Each node has three attributes: +# +# children - A hash of keys to other nodes +# value - The value to be stored here +# label - A named representation of the value. +# +# The key at each level of the trie can consist of one or more bytes, and the +# trie can be normalized to a form where all keys at a level have the same +# length using rebuild_tree(). +package Trie { + + sub new { + my $class = shift; + my $self = {}; + bless $self, $class; + + $self->{children} = {}; + $self->{value} = undef; + $self->{label} = undef; + + return $self; + } + + # Return the largest power of 2 smaller or equal to the argument + sub alignpower2 { + my ($self, $length) = @_; + + return 8 if ($length >= 8 && $multi_byte =~ /3/); + return 4 if ($length >= 4 && $multi_byte =~ /2/); + return 2 if ($length >= 2 && $multi_byte =~ /1/); + + return 1; + } + + # Split the key into a head block and a tail + sub split_key { + my ($self, $key) = @_; + my $length = length $key; + my $split = $self->alignpower2($length); + + return (substr($key, 0, $split), substr($key, $split)); + } + + # Given a key, a label, and a value, insert that into the tree, possibly + # replacing an existing node. + sub insert { + my ($self, $key, $label, $value) = @_; + + if (length($key) == 0) { + $self->{label} = $label; + $self->{value} = $value; + return; + } + + my ($child, $tail) = $self->split_key($key); + + $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child})); + + $self->{children}{$child}->insert($tail, $label, $value); + } + + # Construct a new trie that only contains words of a given length. This + # is used to split up the common trie after knowing all words, so we can + # switch on the expected word length first, and have the per-trie function + # implement simple longest prefix matching. + sub filter_depth { + my ($self, $togo) = @_; + + my $new = Trie->new; + + if ($togo != 0) { + my $found = 0; + foreach my $key (sort keys %{$self->{children}}) { + if ($togo > length($key) || defined $self->{children}{$key}->{value}) { + my $child = $self->{children}{$key}->filter_depth($togo - length($key)); + + $new->{children}{$key}= $child if defined $child; + $found = 1 if defined $child; + } + } + return undef if (!$found); + } else { + $new->{value} = $self->{value}; + $new->{label} = $self->{label}; + } + + return $new; + } + + # (helper for rebuild_tree) + # Reinsert all value nodes into the specified $trie, prepending $prefix + # to their $paths. + sub reinsert_value_nodes_into { + my ($self, $trie, $prefix) = @_; + + $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value}); + + foreach my $key (sort keys %{$self->{children}}) { + $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key); + } + } + + # (helper for rebuild_tree) + # Find the earliest point to split a key. Normally, we split at the maximum + # power of 2 that is greater or equal than the length of the key. When we + # are building an ASCII-optimised case-insensitive trie that simply ORs + # each byte with 0x20, we need to split at the first ambiguous character: + # + # For example, the words a-bc and a\rbc are identical in such a situation: + # '-' | 0x20 == '-' == '\r' | 0x20 + # We cannot simply switch on all 4 bytes at once, but need to split before + # the ambiguous character so we can process the ambiguous character on its + # own. + sub find_ealier_split { + my ($self, $key) = @_; + + if ($ignore_case) { + for my $i (0..length($key)-1) { + # If the key starts with an ambiguous character, we need to + # take only it. Otherwise, we need to take everything + # before the character. + return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1))); + } + } + return $self->alignpower2(length $key); + } + + # This rebuilds the trie, splitting each key before ambiguous characters + # as explained in find_earlier_split(), and then chooses the smallest + # such split at each level, so that all keys at all levels have the same + # length (so we can use a multi-byte switch). + sub rebuild_tree { + my $self = shift; + # Determine if/where we need to split before an ambiguous character + my $new_split = 99999999999999999; + foreach my $key (sort keys %{$self->{children}}) { + my $special_length = $self->find_ealier_split($key); + $new_split = $special_length if ($special_length < $new_split); + } + + # Start building a new uniform trie + my $newself = Trie->new; + $newself->{label} = $self->{label}; + $newself->{value} = $self->{value}; + $newself->{children} = {}; + + foreach my $key (sort keys %{$self->{children}}) { + my $head = substr($key, 0, $new_split); + my $tail = substr($key, $new_split); + # Rebuild the child node at $head, pushing $tail downwards + $newself->{children}{$head} //= Trie->new; + $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail); + # We took up to one special character of each key label. There might + # be more, so we need to rebuild recursively. + $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree(); + } + + return $newself; + } +} + +# Code generator for C and C++ +package CCodeGen { + my $static = ($code_name eq $header_name) ? "static" : ""; + my $enum_specifier = $enum_class ? "enum class" : "enum"; + + sub new { + my $class = shift; + my $self = {}; + bless $self, $class; + + return $self; + } + + sub open_output { + my $self = shift; + if ($code_name ne "-") { + open($code, '>', $code_name) or die "Cannot open $code_name: $!" ; + } else { + $code = *STDOUT; + } + if($code_name eq $header_name) { + $header = $code; + } elsif ($header_name ne "-") { + open($header, '>', $header_name) or die "Cannot open $header_name: $!" ; + } else { + $header = *STDOUT; + } + } + + sub word_to_label { + my ($class, $word) = @_; + + $word =~ s/_/__/g; + $word =~ s/-/_/g; + return $word; + } + + # Return a case label, by shifting and or-ing bytes in the word + sub case_label { + my ($self, $key) = @_; + + return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte; + + my $output = '0'; + + for my $i (0..length($key)-1) { + $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key)); + } + + return $output; + } + + # Return an appropriate read instruction for $length bytes from $offset + sub switch_key { + my ($self, $offset, $length) = @_; + + return "string[$offset]" if $length == 1; + return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8); + } + + # Render the trie so that it matches the longest prefix. + sub print_table { + my ($self, $trie, $fh, $indent, $index) = @_; + $indent //= 0; + $index //= 0; + + # If we have children, try to match them. + if (%{$trie->{children}}) { + # The difference between lowercase and uppercase alphabetical characters + # is that they have one bit flipped. If we have alphabetical characters + # in the search space, and the entire search space works fine if we + # always turn on the flip, just OR the character we are switching over + # with the bit. + my $want_use_bit = 0; + my $can_use_bit = 1; + my $key_length = 0; + foreach my $key (sort keys %{$trie->{children}}) { + $can_use_bit &= not main::ambiguous($key); + $want_use_bit |= ($key =~ /^[a-zA-Z]+$/); + $key_length = length($key); + } + + if ($ignore_case && $can_use_bit && $want_use_bit) { + printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length); + } else { + printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length)); + } + + my $notfirst = 0; + foreach my $key (sort keys %{$trie->{children}}) { + if ($notfirst) { + printf $fh (" " x $indent . " break;\n"); + } + if ($ignore_case) { + printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key))); + printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit); + } else { + printf $fh (" " x $indent . "case %s:\n", $self->case_label($key)); + } + + $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key)); + + $notfirst=1; + } + + printf $fh (" " x $indent . "}\n"); + } + + + # This node has a value, so it is a possible end point. If no children + # matched, we have found our longest prefix. + if (defined $trie->{value}) { + printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label}); + } + + } + + sub print_words { + my ($self, $trie, $fh, $indent, $sofar) = @_; + + $indent //= 0; + $sofar //= ""; + + + printf $fh (" " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value}; + + foreach my $key (sort keys %{$trie->{children}}) { + $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key); + } + } + + sub print_functions { + my ($self, $trie, %lengths) = @_; + foreach my $local_length (sort { $a <=> $b } (keys %lengths)) { + print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n"); + print $code ("{\n"); + $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1); + printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : "")); + print $code ("}\n"); + } + } + + sub main { + my ($self, $trie, $num_values, %lengths) = @_; + print $header ("#ifndef TRIE_HASH_${function_name}\n"); + print $header ("#define TRIE_HASH_${function_name}\n"); + print $header ("#include <stddef.h>\n"); + print $header ("#include <stdint.h>\n"); + foreach my $include (@includes) { + print $header ("#include $include\n"); + } + printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name)); + print $header ("${enum_specifier} ${enum_name} {\n"); + $self->print_words($trie, $header, 1); + printf $header (" $unknown_label = $unknown,\n"); + print $header ("};\n"); + print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n"); + + print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name); + + if ($multi_byte) { + print $code ("#ifdef __GNUC__\n"); + for (my $i=16; $i <= 64; $i *= 2) { + print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n"); + print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n"); + } + + print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n"); + print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n"); + print $code ("#else\n"); + print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n"); + print $code ("#endif\n"); + print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n"); + print $code ("#define TRIE_HASH_MULTI_BYTE\n"); + print $code ("#endif\n"); + print $code ("#endif /*GNUC */\n"); + + print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n"); + $self->print_functions($trie, %lengths); + $multi_byte = 0; + print $code ("#else\n"); + $self->print_functions($trie, %lengths); + print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n"); + } else { + $self->print_functions($trie, %lengths); + } + + print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n"); + print $code ("{\n"); + print $code (" switch (length) {\n"); + foreach my $local_length (sort { $a <=> $b } (keys %lengths)) { + print $code (" case $local_length:\n"); + print $code (" return ${function_name}${local_length}(string);\n"); + } + print $code (" default:\n"); + printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : "")); + print $code (" }\n"); + print $code ("}\n"); + + # Print end of header here, in case header and code point to the same file + print $header ("#endif /* TRIE_HASH_${function_name} */\n"); + } +} + +# A character is ambiguous if the 1<<5 (0x20) bit does not correspond to the +# lower case bit. A word is ambiguous if any character is. This definition is +# used to check if we can perform the |0x20 optimization when building a case- +# insensitive trie. +sub ambiguous { + my $word = shift; + + foreach my $char (split //, $word) { + # If 0x20 does not solely indicate lowercase, it is ambiguous + return 1 if ord(lc($char)) != (ord($char) | 0x20); + return 1 if ord(uc($char)) != (ord($char) & ~0x20); + } + + return 0; +} + +sub build_trie { + my $codegen = shift; + my $trie = Trie->new; + + my $counter = $counter_start; + my %lengths; + + open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!"; + while (my $line = <$input>) { + my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/; + + if (defined $word) { + $counter = $value if defined($value); + $label //= $codegen->word_to_label($word); + + $trie->insert($word, $label, $counter); + $lengths{length($word)} = 1; + $counter++; + } elsif (defined $value) { + $unknown = $value; + $unknown_label = $label if defined($label); + $counter = $value + 1; + } else { + die "Invalid line: $line"; + } + } + + return ($trie, $counter, %lengths); +} + +# Generates an ASCII art tree +package TreeCodeGen { + + sub new { + my $class = shift; + my $self = {}; + bless $self, $class; + + return $self; + } + + sub word_to_label { + my ($self, $word) = @_; + return $word; + } + + sub main { + my ($self, $trie, $counter, %lengths) = @_; + printf $code ("┌────────────────────────────────────────────────────┐\n"); + printf $code ("│ Initial trie │\n"); + printf $code ("└────────────────────────────────────────────────────┘\n"); + $self->print($trie); + printf $code ("┌────────────────────────────────────────────────────┐\n"); + printf $code ("│ Rebuilt trie │\n"); + printf $code ("└────────────────────────────────────────────────────┘\n"); + $self->print($trie->rebuild_tree()); + + foreach my $local_length (sort { $a <=> $b } (keys %lengths)) { + printf $code ("┌────────────────────────────────────────────────────┐\n"); + printf $code ("│ Trie for words of length %-4d │\n", $local_length); + printf $code ("└────────────────────────────────────────────────────┘\n"); + $self->print($trie->filter_depth($local_length)->rebuild_tree()); + } + } + + sub open_output { + my $self = shift; + if ($code_name ne "-") { + open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ; + } else { + $code = *STDOUT; + } + } + + # Print a trie + sub print { + my ($self, $trie, $depth) = @_; + $depth //= 0; + + print $code (" → ") if defined($trie->{label}); + print $code ($trie->{label} // "", "\n"); + foreach my $key (sort keys %{$trie->{children}}) { + print $code ("│ " x ($depth), "├── $key"); + $self->print($trie->{children}{$key}, $depth + 1); + } + } +} + +my %codegens = ( + C => "CCodeGen", + tree => "TreeCodeGen", +); + + +defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens); +my $codegen = $codegens{$language}->new(); +my ($trie, $counter, %lengths) = build_trie($codegen); + +$codegen->open_output(); +$codegen->main($trie, $counter, %lengths); + + +=head1 LICENSE + +triehash is available under the MIT/Expat license, see the source code +for more information. + +=head1 AUTHOR + +Julian Andres Klode <jak@jak-linux.org> + +=cut + |