summaryrefslogtreecommitdiffstats
path: root/vendor/memchr/scripts/make-byte-frequency-table
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/memchr/scripts/make-byte-frequency-table')
-rwxr-xr-xvendor/memchr/scripts/make-byte-frequency-table74
1 files changed, 0 insertions, 74 deletions
diff --git a/vendor/memchr/scripts/make-byte-frequency-table b/vendor/memchr/scripts/make-byte-frequency-table
deleted file mode 100755
index 37eeca7b7..000000000
--- a/vendor/memchr/scripts/make-byte-frequency-table
+++ /dev/null
@@ -1,74 +0,0 @@
-#!/usr/bin/env python
-
-# This does simple normalized frequency analysis on UTF-8 encoded text. The
-# result of the analysis is translated to a ranked list, where every byte is
-# assigned a rank. This list is written to src/freqs.rs.
-#
-# Currently, the frequencies are generated from the following corpuses:
-#
-# * The CIA world fact book
-# * The source code of rustc
-# * Septuaginta
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-from collections import Counter
-import sys
-
-preamble = '''
-// NOTE: The following code was generated by "scripts/frequencies.py", do not
-// edit directly
-'''.lstrip()
-
-
-def eprint(*args, **kwargs):
- kwargs['file'] = sys.stderr
- print(*args, **kwargs)
-
-
-def main():
- p = argparse.ArgumentParser()
- p.add_argument('corpus', metavar='FILE', nargs='+')
- args = p.parse_args()
-
- # Get frequency counts of each byte.
- freqs = Counter()
- for i in range(0, 256):
- freqs[i] = 0
-
- eprint('reading entire corpus into memory')
- corpus = []
- for fpath in args.corpus:
- corpus.append(open(fpath, 'rb').read())
-
- eprint('computing byte frequencies')
- for c in corpus:
- for byte in c:
- freqs[byte] += 1.0 / float(len(c))
-
- eprint('writing Rust code')
- # Get the rank of each byte. A lower rank => lower relative frequency.
- rank = [0] * 256
- for i, (byte, _) in enumerate(freqs.most_common()):
- # print(byte)
- rank[byte] = 255 - i
-
- # Forcefully set the highest rank possible for bytes that start multi-byte
- # UTF-8 sequences. The idea here is that a continuation byte will be more
- # discerning in a homogenous haystack.
- for byte in range(0xC0, 0xFF + 1):
- rank[byte] = 255
-
- # Now write Rust.
- olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
- for byte in range(256):
- olines.append(' %3d, // %r' % (rank[byte], chr(byte)))
- olines.append('];')
-
- print(preamble)
- print('\n'.join(olines))
-
-
-if __name__ == '__main__':
- main()