diff options
Diffstat (limited to 'third_party/rust/memchr/scripts/make-byte-frequency-table')
-rwxr-xr-x | third_party/rust/memchr/scripts/make-byte-frequency-table | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/third_party/rust/memchr/scripts/make-byte-frequency-table b/third_party/rust/memchr/scripts/make-byte-frequency-table new file mode 100755 index 0000000000..37eeca7b7d --- /dev/null +++ b/third_party/rust/memchr/scripts/make-byte-frequency-table @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +# This does simple normalized frequency analysis on UTF-8 encoded text. The +# result of the analysis is translated to a ranked list, where every byte is +# assigned a rank. This list is written to src/freqs.rs. +# +# Currently, the frequencies are generated from the following corpuses: +# +# * The CIA world fact book +# * The source code of rustc +# * Septuaginta + +from __future__ import absolute_import, division, print_function + +import argparse +from collections import Counter +import sys + +preamble = ''' +// NOTE: The following code was generated by "scripts/frequencies.py", do not +// edit directly +'''.lstrip() + + +def eprint(*args, **kwargs): + kwargs['file'] = sys.stderr + print(*args, **kwargs) + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('corpus', metavar='FILE', nargs='+') + args = p.parse_args() + + # Get frequency counts of each byte. + freqs = Counter() + for i in range(0, 256): + freqs[i] = 0 + + eprint('reading entire corpus into memory') + corpus = [] + for fpath in args.corpus: + corpus.append(open(fpath, 'rb').read()) + + eprint('computing byte frequencies') + for c in corpus: + for byte in c: + freqs[byte] += 1.0 / float(len(c)) + + eprint('writing Rust code') + # Get the rank of each byte. A lower rank => lower relative frequency. + rank = [0] * 256 + for i, (byte, _) in enumerate(freqs.most_common()): + # print(byte) + rank[byte] = 255 - i + + # Forcefully set the highest rank possible for bytes that start multi-byte + # UTF-8 sequences. The idea here is that a continuation byte will be more + # discerning in a homogenous haystack. + for byte in range(0xC0, 0xFF + 1): + rank[byte] = 255 + + # Now write Rust. + olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = ['] + for byte in range(256): + olines.append(' %3d, // %r' % (rank[byte], chr(byte))) + olines.append('];') + + print(preamble) + print('\n'.join(olines)) + + +if __name__ == '__main__': + main() |