Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:02:58 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:02:58 +0000
commit: 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree: 173a775858bd501c378080a10dca74132f05bc50 /vendor/memchr/scripts/make-byte-frequency-table
parent: Initial commit. (diff)
download: rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
1 files changed, 74 insertions, 0 deletions
diff --git a/vendor/memchr/scripts/make-byte-frequency-table b/vendor/memchr/scripts/make-byte-frequency-table
new file mode 100755
index 000000000..37eeca7b7
--- /dev/null
+++ b/vendor/memchr/scripts/make-byte-frequency-table
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+# This does simple normalized frequency analysis on UTF-8 encoded text. The
+# result of the analysis is translated to a ranked list, where every byte is
+# assigned a rank. The list is printed as Rust code, meant for src/freqs.rs.
+#
+# Currently, the frequencies are generated from the following corpuses:
+#
+#   * The CIA world fact book
+#   * The source code of rustc
+#   * Septuaginta
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+from collections import Counter
+import sys
+
+preamble = '''
+// NOTE: The following code was generated by "scripts/frequencies.py", do not
+// edit directly
+'''.lstrip()  # drop the newline that follows the opening quotes
+
+
+def eprint(*args, **kwargs):
+    kwargs['file'] = sys.stderr
+    print(*args, **kwargs)
+
+
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument('corpus', metavar='FILE', nargs='+')
+    args = p.parse_args()
+
+    # Get frequency counts of each byte.
+    freqs = Counter()
+    for i in range(0, 256):
+        freqs[i] = 0
+
+    eprint('reading entire corpus into memory')
+    corpus = []
+    for fpath in args.corpus:
+        corpus.append(open(fpath, 'rb').read())
+
+    eprint('computing byte frequencies')
+    for c in corpus:
+        for byte in c:
+            freqs[byte] += 1.0 / float(len(c))
+
+    eprint('writing Rust code')
+    # Get the rank of each byte. A lower rank => lower relative frequency.
+    rank = [0] * 256
+    for i, (byte, _) in enumerate(freqs.most_common()):
+        # print(byte)
+        rank[byte] = 255 - i
+
+    # Forcefully set the highest rank possible for bytes that start multi-byte
+    # UTF-8 sequences. The idea here is that a continuation byte will be more
+    # discerning in a homogenous haystack.
+    for byte in range(0xC0, 0xFF + 1):
+        rank[byte] = 255
+
+    # Now write Rust.
+    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
+    for byte in range(256):
+        olines.append('    %3d, // %r' % (rank[byte], chr(byte)))
+    olines.append('];')
+
+    print(preamble)
+    print('\n'.join(olines))
+
+
+if __name__ == '__main__':  # run only when executed as a script
+    main()
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-17 12:02:58 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-17 12:02:58 +0000
commit	698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree	173a775858bd501c378080a10dca74132f05bc50 /vendor/memchr/scripts/make-byte-frequency-table
parent	Initial commit. (diff)
download	rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip