#!/usr/bin/env python
# This does simple normalized frequency analysis on UTF-8 encoded text. The
# result of the analysis is translated to a ranked list, where every byte is
# assigned a rank. This list is written to src/freqs.rs.
#
# Currently, the frequencies are generated from the following corpuses:
#
# * The CIA world fact book
# * The source code of rustc
# * Septuaginta

from __future__ import absolute_import, division, print_function

import argparse
from collections import Counter
import sys

preamble = '''
// NOTE: The following code was generated by "scripts/frequencies.py", do not
// edit directly
'''.lstrip()


def eprint(*args, **kwargs):
    """Like print(), but to stderr — stdout is reserved for the generated Rust."""
    kwargs['file'] = sys.stderr
    print(*args, **kwargs)


def compute_ranks(corpus):
    """Return a 256-element list mapping each byte value to a rank in 0..255.

    ``corpus`` is a sequence of byte strings. A higher rank means a higher
    relative frequency; each file's counts are normalized by its length so
    that one large file does not drown out the others. Bytes 0xC0-0xFF
    (UTF-8 multi-byte lead bytes) are forced to the maximum rank, since a
    continuation byte is more discerning in a homogenous haystack.
    """
    # Seed every byte value so bytes absent from the corpus still receive a
    # deterministic rank: most_common() is a stable sort, so equal counts
    # keep this 0..255 insertion order.
    freqs = Counter()
    for i in range(256):
        freqs[i] = 0
    for c in corpus:
        if not c:
            # An empty file contributes nothing; skip it rather than
            # dividing by zero below.
            continue
        # bytearray() yields ints on both Python 2 and 3 (a bare bytes
        # object yields 1-char strings on Python 2, which would not match
        # the int keys seeded above).
        for byte in bytearray(c):
            freqs[byte] += 1.0 / len(c)

    # Get the rank of each byte. A lower rank => lower relative frequency.
    rank = [0] * 256
    for i, (byte, _) in enumerate(freqs.most_common()):
        rank[byte] = 255 - i

    # Forcefully set the highest rank possible for bytes that start
    # multi-byte UTF-8 sequences.
    for byte in range(0xC0, 0xFF + 1):
        rank[byte] = 255
    return rank


def main():
    """Read the corpus files named on the command line and emit Rust code."""
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args()

    eprint('reading entire corpus into memory')
    corpus = []
    for fpath in args.corpus:
        # Context manager ensures every file handle is closed promptly,
        # even if a later read raises.
        with open(fpath, 'rb') as f:
            corpus.append(f.read())

    eprint('computing byte frequencies')
    rank = compute_ranks(corpus)

    eprint('writing Rust code')
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append('    %3d, // %r' % (rank[byte], chr(byte)))
    olines.append('];')

    print(preamble)
    print('\n'.join(olines))


if __name__ == '__main__':
    main()