summaryrefslogtreecommitdiffstats
path: root/third_party/rust/memchr/scripts/make-byte-frequency-table
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
commit36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/rust/memchr/scripts/make-byte-frequency-table
parentInitial commit. (diff)
downloadfirefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/memchr/scripts/make-byte-frequency-table')
-rwxr-xr-xthird_party/rust/memchr/scripts/make-byte-frequency-table74
1 files changed, 74 insertions, 0 deletions
diff --git a/third_party/rust/memchr/scripts/make-byte-frequency-table b/third_party/rust/memchr/scripts/make-byte-frequency-table
new file mode 100755
index 0000000000..37eeca7b7d
--- /dev/null
+++ b/third_party/rust/memchr/scripts/make-byte-frequency-table
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+# This does simple normalized frequency analysis on UTF-8 encoded text. The
+# result of the analysis is translated to a ranked list, where every byte is
+# assigned a rank. This list is written to src/freqs.rs.
+#
+# Currently, the frequencies are generated from the following corpuses:
+#
+# * The CIA world fact book
+# * The source code of rustc
+# * Septuaginta
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+from collections import Counter
+import sys
+
+preamble = '''
+// NOTE: The following code was generated by "scripts/frequencies.py", do not
+// edit directly
+'''.lstrip()
+
+
+def eprint(*args, **kwargs):
+ kwargs['file'] = sys.stderr
+ print(*args, **kwargs)
+
+
+def main():
+ p = argparse.ArgumentParser()
+ p.add_argument('corpus', metavar='FILE', nargs='+')
+ args = p.parse_args()
+
+ # Get frequency counts of each byte.
+ freqs = Counter()
+ for i in range(0, 256):
+ freqs[i] = 0
+
+ eprint('reading entire corpus into memory')
+ corpus = []
+ for fpath in args.corpus:
+ corpus.append(open(fpath, 'rb').read())
+
+ eprint('computing byte frequencies')
+ for c in corpus:
+ for byte in c:
+ freqs[byte] += 1.0 / float(len(c))
+
+ eprint('writing Rust code')
+ # Get the rank of each byte. A lower rank => lower relative frequency.
+ rank = [0] * 256
+ for i, (byte, _) in enumerate(freqs.most_common()):
+ # print(byte)
+ rank[byte] = 255 - i
+
+ # Forcefully set the highest rank possible for bytes that start multi-byte
+ # UTF-8 sequences. The idea here is that a continuation byte will be more
+ # discerning in a homogenous haystack.
+ for byte in range(0xC0, 0xFF + 1):
+ rank[byte] = 255
+
+ # Now write Rust.
+ olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
+ for byte in range(256):
+ olines.append(' %3d, // %r' % (rank[byte], chr(byte)))
+ olines.append('];')
+
+ print(preamble)
+ print('\n'.join(olines))
+
+
+if __name__ == '__main__':
+ main()