From 9835e2ae736235810b4ea1c162ca5e65c547e770 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 04:49:50 +0200 Subject: Merging upstream version 1.71.1+dfsg1. Signed-off-by: Daniel Baumann --- vendor/ruzstd/.cargo-checksum.json | 1 + vendor/ruzstd/Cargo.lock | 704 +++++++++++++++++++++ vendor/ruzstd/Cargo.toml | 46 ++ vendor/ruzstd/LICENSE | 21 + vendor/ruzstd/Readme.md | 94 +++ vendor/ruzstd/benches/reversedbitreader_bench.rs | 38 ++ vendor/ruzstd/optimizations.md | 34 + vendor/ruzstd/src/bin/zstd.rs | 140 ++++ vendor/ruzstd/src/bin/zstd_stream.rs | 41 ++ vendor/ruzstd/src/blocks/block.rs | 25 + vendor/ruzstd/src/blocks/literals_section.rs | 223 +++++++ vendor/ruzstd/src/blocks/mod.rs | 3 + vendor/ruzstd/src/blocks/sequence_section.rs | 127 ++++ vendor/ruzstd/src/decoding/bit_reader.rs | 107 ++++ vendor/ruzstd/src/decoding/bit_reader_reverse.rs | 258 ++++++++ vendor/ruzstd/src/decoding/block_decoder.rs | 378 +++++++++++ vendor/ruzstd/src/decoding/decodebuffer.rs | 423 +++++++++++++ vendor/ruzstd/src/decoding/dictionary.rs | 93 +++ .../src/decoding/literals_section_decoder.rs | 180 ++++++ vendor/ruzstd/src/decoding/mod.rs | 11 + vendor/ruzstd/src/decoding/ringbuffer.rs | 632 ++++++++++++++++++ vendor/ruzstd/src/decoding/scratch.rs | 128 ++++ vendor/ruzstd/src/decoding/sequence_execution.rs | 123 ++++ .../src/decoding/sequence_section_decoder.rs | 495 +++++++++++++++ vendor/ruzstd/src/frame.rs | 288 +++++++++ vendor/ruzstd/src/frame_decoder.rs | 569 +++++++++++++++++ vendor/ruzstd/src/fse/fse_decoder.rs | 336 ++++++++++ vendor/ruzstd/src/fse/mod.rs | 2 + vendor/ruzstd/src/huff0/huff0_decoder.rs | 388 ++++++++++++ vendor/ruzstd/src/huff0/mod.rs | 2 + vendor/ruzstd/src/lib.rs | 15 + vendor/ruzstd/src/streaming_decoder.rs | 66 ++ vendor/ruzstd/src/tests/bit_reader.rs | 79 +++ vendor/ruzstd/src/tests/decode_corpus.rs | 181 ++++++ vendor/ruzstd/src/tests/dict_test.rs | 252 ++++++++ vendor/ruzstd/src/tests/fuzz_regressions.rs | 26 + 
vendor/ruzstd/src/tests/mod.rs | 324 ++++++++++ 37 files changed, 6853 insertions(+) create mode 100644 vendor/ruzstd/.cargo-checksum.json create mode 100644 vendor/ruzstd/Cargo.lock create mode 100644 vendor/ruzstd/Cargo.toml create mode 100644 vendor/ruzstd/LICENSE create mode 100644 vendor/ruzstd/Readme.md create mode 100644 vendor/ruzstd/benches/reversedbitreader_bench.rs create mode 100644 vendor/ruzstd/optimizations.md create mode 100644 vendor/ruzstd/src/bin/zstd.rs create mode 100644 vendor/ruzstd/src/bin/zstd_stream.rs create mode 100644 vendor/ruzstd/src/blocks/block.rs create mode 100644 vendor/ruzstd/src/blocks/literals_section.rs create mode 100644 vendor/ruzstd/src/blocks/mod.rs create mode 100644 vendor/ruzstd/src/blocks/sequence_section.rs create mode 100644 vendor/ruzstd/src/decoding/bit_reader.rs create mode 100644 vendor/ruzstd/src/decoding/bit_reader_reverse.rs create mode 100644 vendor/ruzstd/src/decoding/block_decoder.rs create mode 100644 vendor/ruzstd/src/decoding/decodebuffer.rs create mode 100644 vendor/ruzstd/src/decoding/dictionary.rs create mode 100644 vendor/ruzstd/src/decoding/literals_section_decoder.rs create mode 100644 vendor/ruzstd/src/decoding/mod.rs create mode 100644 vendor/ruzstd/src/decoding/ringbuffer.rs create mode 100644 vendor/ruzstd/src/decoding/scratch.rs create mode 100644 vendor/ruzstd/src/decoding/sequence_execution.rs create mode 100644 vendor/ruzstd/src/decoding/sequence_section_decoder.rs create mode 100644 vendor/ruzstd/src/frame.rs create mode 100644 vendor/ruzstd/src/frame_decoder.rs create mode 100644 vendor/ruzstd/src/fse/fse_decoder.rs create mode 100644 vendor/ruzstd/src/fse/mod.rs create mode 100644 vendor/ruzstd/src/huff0/huff0_decoder.rs create mode 100644 vendor/ruzstd/src/huff0/mod.rs create mode 100644 vendor/ruzstd/src/lib.rs create mode 100644 vendor/ruzstd/src/streaming_decoder.rs create mode 100644 vendor/ruzstd/src/tests/bit_reader.rs create mode 100644 vendor/ruzstd/src/tests/decode_corpus.rs 
create mode 100644 vendor/ruzstd/src/tests/dict_test.rs create mode 100644 vendor/ruzstd/src/tests/fuzz_regressions.rs create mode 100644 vendor/ruzstd/src/tests/mod.rs (limited to 'vendor/ruzstd') diff --git a/vendor/ruzstd/.cargo-checksum.json b/vendor/ruzstd/.cargo-checksum.json new file mode 100644 index 000000000..af8587e3a --- /dev/null +++ b/vendor/ruzstd/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.lock":"a0f00aaaca9e22323964f42b58ecf1b22e8f7cf6f84b570cede93b8f09c4125b","Cargo.toml":"c8fe2e19f0dfea4c3bc8c1800be3c28414c9fd0d9bbdf2f275de8853e8d5c76d","LICENSE":"d715eb0d9cfe1cc0cc22577e387436aa568b31ca272a7c837004dc0cdde925b5","Readme.md":"8226236cf1401c2c518aab068fe2399966707caae3fc435f8771c49450922629","benches/reversedbitreader_bench.rs":"1fd002b329fbe3a16fd365ebd08308a70080c3f58d1befddabff9d2262956b09","optimizations.md":"2e579e02562fce569e623c15ddc9f39db580be8e43c89f2696e403e2424b88a5","src/bin/zstd.rs":"73ebf4eade7dd3efa1443c3064c3b3f2504186fb3543e2e177487928a5005737","src/bin/zstd_stream.rs":"77742d7b5af8c5398ad06623a99d3637e437009850e7d6859cfb5aa55bd04fbc","src/blocks/block.rs":"c8cc992d4e5104ba83ab7ddc85de43cdb3919ef31018330cee0da7ff1dfcf779","src/blocks/literals_section.rs":"12ee3c2b46a38410e31a1b2622061365dfde6c65c8ac9dba17e9b943ec6111d5","src/blocks/mod.rs":"449d8d285c1f3c746a671b3b41b1b6faed46713c0a5450ac50e7ea8b17d5e0c2","src/blocks/sequence_section.rs":"341f1026be7f204e5e8137ed8cee895f2bd9ddb45ad0fffa5662c22957d485ee","src/decoding/bit_reader.rs":"7a90929c77e553b1775bb30a6d3779e0e7c932128d6beebd0671c37526ff3abd","src/decoding/bit_reader_reverse.rs":"148e8f5eeaaad9ae3bd2ef4c4c5c608c4cc450fb387f07219faccb016dea20ca","src/decoding/block_decoder.rs":"248bd1d68f4a14f1692a76abf065292cab3a126132854bd79df5d52fdc4db489","src/decoding/decodebuffer.rs":"e6374b7ece0bb80527328607898c63442862bbebe1f5880dd2a3ec2dac2596e0","src/decoding/dictionary.rs":"887209058e54967096e980d9376f7f864ffcd00a8c546fb8fcc7d21f6b7f9bf0","src/decoding/literals_section_decode
r.rs":"1fad8ba89fcbf768c67d4fdde4566309f7eac84399cda4cc04ef439f09c6795e","src/decoding/mod.rs":"ace355d2128dddd8593707f563c87c281e4c842f8ba619d4d7c1fa7274641cf1","src/decoding/ringbuffer.rs":"81ef4b2079347542dc1e087ffe4ddc3cc0aa4f3c5e09b5521556e03c3951fc67","src/decoding/scratch.rs":"6af1184615c91da75c8400d0bd654fd7114fb33a1bc6de5ff729393d44fc0a24","src/decoding/sequence_execution.rs":"1bacf019e44551aabc3b6f2b6da17ccf07ad658eaabf8fc34ff6a4883fc0e4d9","src/decoding/sequence_section_decoder.rs":"545e578cf571b4b44aa1f21916cc321ae17e8bbb5614e0fa85bc17e99d21c638","src/frame.rs":"0399fb58370ad6697dfa6e1fe6b26a4cf1bf870b5b73a7a3e1e521f1b48e022d","src/frame_decoder.rs":"8d565a6db4e39c38ecba9339bcd349768d80eb8cbf05445678c89d1613c18ff7","src/fse/fse_decoder.rs":"b9bfddabb18616cf5809e75f4852d1df639288739c0c8e254885131a9161db11","src/fse/mod.rs":"030030afffa293d7062997006d7feeff2c0c7d8052af9ea02682b5b6441e7b9d","src/huff0/huff0_decoder.rs":"4d469a4f798239bd62317e6de4d793a22d52b590f1fc365c475d8e4023f40775","src/huff0/mod.rs":"8672ad0b34989bfdf0829ae00dc509923f312fb5613c7de7e4602fce88cfbd5b","src/lib.rs":"2964f45f8c446adcd7dd061a51db6c7934453e70a66af8194451fc5c2d3df4d8","src/streaming_decoder.rs":"b3ea02cbd141b0ba98da382d9d994f005edda31ff16fcbcbe3122ac36ad3f71e","src/tests/bit_reader.rs":"c3c18ea9b06e61a61109b81ba04d6376e4b3633ea2f4b7b6c90aa0f91c96a136","src/tests/decode_corpus.rs":"7fea5ae3fb9b181c4d69c04345e925ebbadddfc348c296ac175d9c79f33bfba0","src/tests/dict_test.rs":"4a6d3dcbb8576b53d43bc829f3539b37838aa8bb4e170c32fec8cc99dcadf2b3","src/tests/fuzz_regressions.rs":"f1a75a6a894808b197e8256037ce04086b105d7deff86dbf7cdff4ef39f5f614","src/tests/mod.rs":"f9d536685b46c0bf68bd8a04ebe384683b3a7b697ab50ca9349793e08ac6b461"},"package":"9a15e661f0f9dac21f3494fe5d23a6338c0ac116a2d22c2b63010acd89467ffe"} \ No newline at end of file diff --git a/vendor/ruzstd/Cargo.lock b/vendor/ruzstd/Cargo.lock new file mode 100644 index 000000000..45aea2146 --- /dev/null +++ b/vendor/ruzstd/Cargo.lock 
@@ -0,0 +1,704 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "bitflags", + "textwrap", + "unicode-width", +] + +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" +dependencies = [ + "cfg-if", + "lazy_static", +] + +[[package]] 
+name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "getrandom" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "itertools" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "itoa" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" + 
+[[package]] +name = "js-sys" +version = "0.3.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "671a26f820db17c2a2750743f1dd03bafd15b98c9f30c7c2628c024c05d73397" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num-traits" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = 
"plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" + +[[package]] +name = "proc-macro2" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c54b25569025b7fc9651de43004ae593a75ad88543b17178aa5e1b9c4f15f56f" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1feb54ed693b93a84e14094943b84b7c4eae204c512b7ccb95ab0c66d278ad1" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rayon" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + +[[package]] +name = "regex" +version = "1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d83f127d94bdbcda4c8cc2e50f6f84f4b611f69c902699ca385a39c3a75f9ff1" +dependencies = [ + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-syntax" +version = "0.6.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "ruzstd" +version = "0.3.1" +dependencies = [ + "byteorder", + "criterion", + "rand", + "thiserror", + "twox-hash", +] + +[[package]] +name = "ryu" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" + +[[package]] +name = "same-file" 
+version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "semver" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cb243bdfdb5936c8dc3c45762a19d12ab4550cdc753bc247637d4ec35a040fd" + +[[package]] +name = "serde" +version = "1.0.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61ea8d54c77f8315140a05f4c7237403bf38b72704d031543aa1d16abbf517d1" + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f26faba0c3959972377d3b2d306ee9f71faee9714294e41bb777f83f88578be" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b7ce2b32a1aed03c558dc61a5cd328f15aff2dbc17daad8fb8af04d2100e15c" +dependencies = [ + "itoa 1.0.2", + "ryu", + "serde", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "syn" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbaf6116ab8924f39d52792136fb74fd60a80194cf1b1c6ffa6453eef1c3f942" +dependencies = [ + "proc-macro2", + 
"quote", + "unicode-ident", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5f6586b7f764adc0231f4c79be7b920e766bb2f3e51b3661cdb263828f19994" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12bafc5b54507e0149cdf1b145a5d80ab80a90bcd9275df43d4fff68460f6c21" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "unicode-ident" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d22af068fba1eb5edcb4aea19d382b2a3deb4c8f9d475c589b6ada9e0fd493ee" + +[[package]] +name = "unicode-width" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "wasi" +version = 
"0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "wasm-bindgen" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27370197c907c55e3f1a9fbe26f44e937fe6451368324e009cba39e139dc08ad" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53e04185bfa3a779273da532f5025e33398409573f348985af9a1cbf3774d3f4" +dependencies = [ + "bumpalo", + "lazy_static", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17cae7ff784d7e83a2fe7611cfe766ecf034111b49deb850a3dc7699c08251f5" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99ec0dc7a4756fffc231aab1b9f2f578d23cd391390ab27f952ae0c9b3ece20b" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d554b7f530dee5964d9a9468d95c1f8b8acae4f282807e7d27d4b03099a46744" + +[[package]] +name = "web-sys" +version = "0.3.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b17e741662c70c8bd24ac5c5b18de314a2c26c32bf8346ee1e6f53de919c283" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/vendor/ruzstd/Cargo.toml b/vendor/ruzstd/Cargo.toml new file mode 100644 index 000000000..b80aad59c --- /dev/null +++ b/vendor/ruzstd/Cargo.toml @@ -0,0 +1,46 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. 
+ +[package] +edition = "2018" +name = "ruzstd" +version = "0.3.1" +authors = ["Moritz Borcherding "] +exclude = [ + "decodecorpus_files/*", + "dict_tests/*", + "fuzz_decodecorpus/*", +] +description = "A decoder for the zstd compression format" +homepage = "https://github.com/KillingSpark/zstd-rs" +readme = "Readme.md" +license = "MIT" +repository = "https://github.com/KillingSpark/zstd-rs" + +[[bench]] +name = "reversedbitreader_bench" +harness = false + +[dependencies.byteorder] +version = "1.4" + +[dependencies.thiserror] +version = "1" + +[dependencies.twox-hash] +version = "1.6" +default-features = false + +[dev-dependencies.criterion] +version = "0.3" + +[dev-dependencies.rand] +version = "0.8.5" diff --git a/vendor/ruzstd/LICENSE b/vendor/ruzstd/LICENSE new file mode 100644 index 000000000..486a3351b --- /dev/null +++ b/vendor/ruzstd/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Moritz Borcherding + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/vendor/ruzstd/Readme.md b/vendor/ruzstd/Readme.md new file mode 100644 index 000000000..1a6edee99 --- /dev/null +++ b/vendor/ruzstd/Readme.md @@ -0,0 +1,94 @@ +# What is this + +A feature-complete decoder for the zstd compression format as defined in: [This document](https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md). + +It is NOT a compressor. I don't plan on implementing that part either, at least not in the near future. (If someone is motivated enough I will of course accept a pull-request!) + +This crate might look like it is not active, this is because there isn't really anything to do anymore, unless a bug is found or a new API feature is requested. I will of course respond to and look into issues! + +# Current Status + +[![Actions Status](https://github.com/KillingSpark/zstd-rs/workflows/CI/badge.svg)](https://github.com/KillingSpark/zstd-rs/actions?query=workflow%3A"CI") + +This started as a toy project but it is in a usable state now. + +In terms of speed it is still behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs). + +## Speed + +Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from aramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower. + +## Can do: + +1. Parse all files in /decodecorpus_files. These were generated with [decodecorpus](https://github.com/facebook/zstd/tree/dev/tests) by the original zstd developers +1. Decode all of them correctly into the output buffer +1. Decode all the decode_corpus files (1000+) I created locally +1. Calculate checksums +1. Act as a `zstd -c -d` dropin replacement + +## Cannot do + +This decoder is pretty much feature complete. If there are any wishes for new APIs or bug reports please file an issue, I will gladly take a look! 
+ +## Roadmap + +1. More Performance optimizations (targets would be sequence_decoding and reverse_bitreader::get_bits. Those account for about 50% of the whole time used) +1. More tests (especially unit-tests for the bitreaders and other lower-level parts) +1. Find more bugs + +## Testing + +Tests take two forms. + +1. Tests using well-formed files that have to decode correctly and are checked against their originals +1. Tests using malformed input that have been generated by the fuzzer. These don't have to decode (they are garbage) but they must not make the decoder panic + +## Fuzzing + +Fuzzing has been done with cargo fuzz. Each time it crashes the decoder I fixed the issue and added the offending input as a test. It's checked into the repo in the fuzz/artifacts/fuzz_target_1 directory. Those get tested in the fuzz_regressions.rs test. +At the time of writing the fuzzer was able to run for over 12 hours on the random input without finding new crashes. Obviously this doesn't mean there are no bugs but the common ones are probably fixed. + +Fuzzing has been done on + +1. Random input with no initial corpus +2. The \*.zst in /fuzz_decodecorpus + +### You wanna help fuzz? + +Use `cargo +nightly fuzz run decode` to run the fuzzer. It is seeded with files created with decodecorpus. + +If (when) the fuzzer finds a crash it will be saved to the artifacts dir by the fuzzer. Run `cargo test artifacts` to run the artifacts tests. +This will tell you where the decoder panics exactly. If you are able to fix the issue please feel free to do a pull request. If not please still submit the offending input and I will see how to fix it myself. + +# How can you use it? + +Additionally to the descriptions and the docs you can have a look at the zstd / zstd_streaming binaries. They showcase how this library can be used. + +## Easy + +The easiest is to wrap the io::Read into a StreamingDecoder which itself implements io::Read. 
It will decode blocks as necessary to fulfill the read requests + +``` +let mut f = File::open(path).unwrap(); +let mut decoder = StreamingDecoder::new(&mut f); + +let mut result = Vec::new(); +decoder.read_to_end(&mut buffer).unwrap(); +``` + +This might be a problem if you are accepting user provided data. Frames can be REALLY big when decoded. If this is the case you should either check how big the frame +actually is or use the memory efficient approach described below. + +## Memory efficient + +If memory is a concern you can decode frames partially. There are two ways to do this: + +#### Streaming decoder + +Use the StreamingDecoder and use a while loop to fill your buffer (see src/bin/zstd_stream.rs for an example). This is the +recommended approach. + +#### Use the lower level FrameDecoder + +For an example see the src/bin/zstd.rs file. Basically you can decode the frame until either a +given block count has been decoded or the decodebuffer has reached a certain size. Then you can collect no longer needed bytes from the buffer and do something with them, discard them and resume decoding the frame in a loop until the frame has been decoded completely. 
diff --git a/vendor/ruzstd/benches/reversedbitreader_bench.rs b/vendor/ruzstd/benches/reversedbitreader_bench.rs new file mode 100644 index 000000000..89b01e1f5 --- /dev/null +++ b/vendor/ruzstd/benches/reversedbitreader_bench.rs @@ -0,0 +1,38 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rand::Rng; +use ruzstd::decoding::bit_reader_reverse::BitReaderReversed; + +fn fibonacci(br: &mut BitReaderReversed, accesses: &[u8]) -> u64 { + let mut sum = 0; + for x in accesses { + sum += br.get_bits(*x).unwrap() as u64; + } + let _ = black_box(br); + sum +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut rng = rand::thread_rng(); + let mut rand_vec = vec![]; + for _ in 0..100000 { + rand_vec.push(rng.gen()); + } + + let mut access_vec = vec![]; + let mut br = BitReaderReversed::new(&rand_vec); + while br.bits_remaining() > 0 { + let x = rng.gen_range(1..20); + br.get_bits(x).unwrap(); + access_vec.push(x); + } + + c.bench_function("fib 20", |b| { + b.iter(|| { + br.reset(&rand_vec); + fibonacci(&mut br, &access_vec) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/vendor/ruzstd/optimizations.md b/vendor/ruzstd/optimizations.md new file mode 100644 index 000000000..dad6eba76 --- /dev/null +++ b/vendor/ruzstd/optimizations.md @@ -0,0 +1,34 @@ +# Optimizations +This document tracks which optimizations have been done after the initial implementation passed corpus tests and a good amount of fuzzing. + +## Introducing more unsafe code: +These optimizations introduced more unsafe code. These should yield significant improvements, or else they are not really worth it. 
+ +### Optimizing bitreader with byteorder which uses ptr::copy_nonoverlapping +* Reverse bitreader_reversed::get_bits was identified by linux perf tool using about 36% of the whole time +* Benchmark: decode enwik9 + +* Before: about 14.7 seconds +* After: about 12.2 seconds with about 25% of the time used for get_bits() + +### Optimizing decodebuffer::repeat with ptr::copy_nonoverlapping +* decodebuffer::repeate was identified by linux perf tool using about 28% of the whole time +* Benchmark: decode enwik9 + +* Before: about 9.9 seconds +* After: about 9.4 seconds + +### Use custom ringbuffer in the decodebuffer +The decode buffer must be able to do two things efficiently +* Collect bytes from the front +* Copy bytes from the contents to the end + +The stdlibs VecDequeu and Vec can each do one but not the other efficiently. So a custom implementation of a ringbuffer was written. + +## Introducing NO additional unsafe code +These are just nice to have + +### Even better bitreaders +Studying this material lead to a big improvement in bitreader speed +* https://fgiesen.wordpress.com/2018/02/19/reading-bits-in-far-too-many-ways-part-1/ +* https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/ diff --git a/vendor/ruzstd/src/bin/zstd.rs b/vendor/ruzstd/src/bin/zstd.rs new file mode 100644 index 000000000..54c5d19e0 --- /dev/null +++ b/vendor/ruzstd/src/bin/zstd.rs @@ -0,0 +1,140 @@ +extern crate ruzstd; +use std::fs::File; +use std::io::Read; +use std::io::Seek; +use std::io::SeekFrom; +use std::io::Write; + +use ruzstd::frame::ReadFrameHeaderError; +use ruzstd::frame_decoder::FrameDecoderError; + +struct StateTracker { + bytes_used: u64, + frames_used: usize, + valid_checksums: usize, + invalid_checksums: usize, + file_pos: u64, + file_size: u64, + old_percentage: i8, +} + +fn main() { + let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); + let flags: Vec<_> = std::env::args().filter(|f| 
f.starts_with('-')).collect(); + file_paths.remove(0); + + if !flags.contains(&"-d".to_owned()) { + eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag"); + return; + } + + if !flags.contains(&"-c".to_owned()) { + eprintln!("This zstd implementation only supports output on the stdout. Please add a \"-c\" flag and pipe the output into a file"); + return; + } + + if flags.len() != 2 { + eprintln!( + "No flags other than -d and -c are currently implemented. Flags used: {:?}", + flags + ); + return; + } + + let mut frame_dec = ruzstd::FrameDecoder::new(); + + for path in file_paths { + eprintln!("File: {}", path); + let mut f = File::open(path).unwrap(); + + let mut tracker = StateTracker { + bytes_used: 0, + frames_used: 0, + valid_checksums: 0, + invalid_checksums: 0, + file_size: f.metadata().unwrap().len(), + file_pos: 0, + old_percentage: -1, + }; + + let batch_size = 1024 * 1024 * 10; + let mut result = vec![0; batch_size]; + + while tracker.file_pos < tracker.file_size { + match frame_dec.reset(&mut f) { + Err(FrameDecoderError::ReadFrameHeaderError(ReadFrameHeaderError::SkipFrame( + magic_num, + skip_size, + ))) => { + eprintln!("Found a skippable frame with magic number: {magic_num} and size: {skip_size}"); + tracker.file_pos += skip_size as u64; + f.seek(SeekFrom::Current(skip_size as i64)).unwrap(); + continue; + } + other => other.unwrap(), + } + + tracker.frames_used += 1; + + while !frame_dec.is_finished() { + frame_dec + .decode_blocks(&mut f, ruzstd::BlockDecodingStrategy::UptoBytes(batch_size)) + .unwrap(); + + if frame_dec.can_collect() > batch_size { + let x = frame_dec.read(result.as_mut_slice()).unwrap(); + tracker.file_pos = f.stream_position().unwrap(); + do_something(&result[..x], &mut tracker); + } + } + + // handle the last chunk of data + while frame_dec.can_collect() > 0 { + let x = frame_dec.read(result.as_mut_slice()).unwrap(); + tracker.file_pos = f.stream_position().unwrap(); + 
do_something(&result[..x], &mut tracker); + } + + if let Some(chksum) = frame_dec.get_checksum_from_data() { + if frame_dec.get_calculated_checksum().unwrap() != chksum { + tracker.invalid_checksums += 1; + eprintln!( + "Checksum did not match in frame {}! From data: {}, calculated while decoding: {}", + tracker.frames_used, + chksum, + frame_dec.get_calculated_checksum().unwrap() + ); + } else { + tracker.valid_checksums += 1; + } + } + } + + eprintln!( + "\nDecoded frames: {} bytes: {}", + tracker.frames_used, tracker.bytes_used + ); + if tracker.valid_checksums == 0 && tracker.invalid_checksums == 0 { + eprintln!("No checksums to test"); + } else { + eprintln!( + "{} of {} checksums are ok!", + tracker.valid_checksums, + tracker.valid_checksums + tracker.invalid_checksums, + ); + } + } +} + +fn do_something(data: &[u8], s: &mut StateTracker) { + //Do something. Like writing it to a file or to stdout... + std::io::stdout().write_all(data).unwrap(); + s.bytes_used += data.len() as u64; + + let percentage = (s.file_pos * 100) / s.file_size; + if percentage as i8 != s.old_percentage { + eprint!("\r"); + eprint!("{} % done", percentage); + s.old_percentage = percentage as i8; + } +} diff --git a/vendor/ruzstd/src/bin/zstd_stream.rs b/vendor/ruzstd/src/bin/zstd_stream.rs new file mode 100644 index 000000000..cb2ebc76b --- /dev/null +++ b/vendor/ruzstd/src/bin/zstd_stream.rs @@ -0,0 +1,41 @@ +extern crate ruzstd; +use std::fs::File; +use std::io::{Read, Write}; + +fn main() { + let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect(); + let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect(); + file_paths.remove(0); + + if !flags.contains(&"-d".to_owned()) { + eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag"); + return; + } + + if !flags.contains(&"-c".to_owned()) { + eprintln!("This zstd implementation only supports output on the stdout. 
Please add a \"-c\" flag and pipe the output into a file"); + return; + } + + if flags.len() != 2 { + eprintln!( + "No flags other than -d and -c are currently implemented. Flags used: {:?}", + flags + ); + return; + } + + for path in file_paths { + eprintln!("File: {}", path); + let f = File::open(path).unwrap(); + let mut buf_read = std::io::BufReader::new(f); + + let mut decoder = ruzstd::StreamingDecoder::new(&mut buf_read).unwrap(); + let mut buf = [0u8; 1024 * 1024]; + let mut stdout = std::io::stdout(); + while !decoder.decoder.is_finished() || decoder.decoder.can_collect() > 0 { + let bytes = decoder.read(&mut buf[..]).unwrap(); + stdout.write_all(&buf[..bytes]).unwrap(); + } + } +} diff --git a/vendor/ruzstd/src/blocks/block.rs b/vendor/ruzstd/src/blocks/block.rs new file mode 100644 index 000000000..9c872ebf9 --- /dev/null +++ b/vendor/ruzstd/src/blocks/block.rs @@ -0,0 +1,25 @@ +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BlockType { + Raw, + RLE, + Compressed, + Reserved, +} + +impl std::fmt::Display for BlockType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + BlockType::Compressed => write!(f, "Compressed"), + BlockType::Raw => write!(f, "Raw"), + BlockType::RLE => write!(f, "RLE"), + BlockType::Reserved => write!(f, "Reserverd"), + } + } +} + +pub struct BlockHeader { + pub last_block: bool, + pub block_type: BlockType, + pub decompressed_size: u32, + pub content_size: u32, +} diff --git a/vendor/ruzstd/src/blocks/literals_section.rs b/vendor/ruzstd/src/blocks/literals_section.rs new file mode 100644 index 000000000..9d8307223 --- /dev/null +++ b/vendor/ruzstd/src/blocks/literals_section.rs @@ -0,0 +1,223 @@ +use super::super::decoding::bit_reader::{BitReader, GetBitsError}; + +pub struct LiteralsSection { + pub regenerated_size: u32, + pub compressed_size: Option, + pub num_streams: Option, + pub ls_type: LiteralsSectionType, +} + +pub enum LiteralsSectionType { + Raw, + RLE, + 
Compressed, + Treeless, +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum LiteralsSectionParseError { + #[error("Illegal literalssectiontype. Is: {got}, must be in: 0, 1, 2, 3")] + IllegalLiteralSectionType { got: u8 }, + #[error(transparent)] + GetBitsError(#[from] GetBitsError), + #[error("Not enough byte to parse the literals section header. Have: {have}, Need: {need}")] + NotEnoughBytes { have: usize, need: u8 }, +} + +impl std::fmt::Display for LiteralsSectionType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + LiteralsSectionType::Compressed => write!(f, "Compressed"), + LiteralsSectionType::Raw => write!(f, "Raw"), + LiteralsSectionType::RLE => write!(f, "RLE"), + LiteralsSectionType::Treeless => write!(f, "Treeless"), + } + } +} + +impl Default for LiteralsSection { + fn default() -> Self { + Self::new() + } +} + +impl LiteralsSection { + pub fn new() -> LiteralsSection { + LiteralsSection { + regenerated_size: 0, + compressed_size: None, + num_streams: None, + ls_type: LiteralsSectionType::Raw, + } + } + + pub fn header_bytes_needed(&self, first_byte: u8) -> Result { + let ls_type = Self::section_type(first_byte)?; + let size_format = (first_byte >> 2) & 0x3; + match ls_type { + LiteralsSectionType::RLE | LiteralsSectionType::Raw => { + match size_format { + 0 | 2 => { + //size_format actually only uses one bit + //regenerated_size uses 5 bits + Ok(1) + } + 1 => { + //size_format uses 2 bit + //regenerated_size uses 12 bits + Ok(2) + } + 3 => { + //size_format uses 2 bit + //regenerated_size uses 20 bits + Ok(3) + } + _ => panic!( + "This is a bug in the program. 
There should only be values between 0..3" + ), + } + } + LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => { + match size_format { + 0 | 1 => { + //Only differ in num_streams + //both regenerated and compressed sizes use 10 bit + Ok(3) + } + 2 => { + //both regenerated and compressed sizes use 14 bit + Ok(4) + } + 3 => { + //both regenerated and compressed sizes use 18 bit + Ok(5) + } + + _ => panic!( + "This is a bug in the program. There should only be values between 0..3" + ), + } + } + } + } + + pub fn parse_from_header(&mut self, raw: &[u8]) -> Result { + let mut br = BitReader::new(raw); + let t = br.get_bits(2)? as u8; + self.ls_type = Self::section_type(t)?; + let size_format = br.get_bits(2)? as u8; + + let byte_needed = self.header_bytes_needed(raw[0])?; + if raw.len() < byte_needed as usize { + return Err(LiteralsSectionParseError::NotEnoughBytes { + have: raw.len(), + need: byte_needed, + }); + } + + match self.ls_type { + LiteralsSectionType::RLE | LiteralsSectionType::Raw => { + self.compressed_size = None; + match size_format { + 0 | 2 => { + //size_format actually only uses one bit + //regenerated_size uses 5 bits + self.regenerated_size = u32::from(raw[0]) >> 3; + Ok(1) + } + 1 => { + //size_format uses 2 bit + //regenerated_size uses 12 bits + self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4); + Ok(2) + } + 3 => { + //size_format uses 2 bit + //regenerated_size uses 20 bits + self.regenerated_size = (u32::from(raw[0]) >> 4) + + (u32::from(raw[1]) << 4) + + (u32::from(raw[2]) << 12); + Ok(3) + } + _ => panic!( + "This is a bug in the program. There should only be values between 0..3" + ), + } + } + LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => { + match size_format { + 0 => { + self.num_streams = Some(1); + } + 1 | 2 | 3 => { + self.num_streams = Some(4); + } + _ => panic!( + "This is a bug in the program. 
There should only be values between 0..3" + ), + }; + + match size_format { + 0 | 1 => { + //Differ in num_streams see above + //both regenerated and compressed sizes use 10 bit + + //4 from the first, six from the second byte + self.regenerated_size = + (u32::from(raw[0]) >> 4) + ((u32::from(raw[1]) & 0x3f) << 4); + + // 2 from the second, full last byte + self.compressed_size = + Some(u32::from(raw[1] >> 6) + (u32::from(raw[2]) << 2)); + Ok(3) + } + 2 => { + //both regenerated and compressed sizes use 14 bit + + //4 from first, full second, 2 from the third byte + self.regenerated_size = (u32::from(raw[0]) >> 4) + + (u32::from(raw[1]) << 4) + + ((u32::from(raw[2]) & 0x3) << 12); + + //6 from the third, full last byte + self.compressed_size = + Some((u32::from(raw[2]) >> 2) + (u32::from(raw[3]) << 6)); + Ok(4) + } + 3 => { + //both regenerated and compressed sizes use 18 bit + + //4 from first, full second, six from third byte + self.regenerated_size = (u32::from(raw[0]) >> 4) + + (u32::from(raw[1]) << 4) + + ((u32::from(raw[2]) & 0x3F) << 12); + + //2 from third, full fourth, full fifth byte + self.compressed_size = Some( + (u32::from(raw[2]) >> 6) + + (u32::from(raw[3]) << 2) + + (u32::from(raw[4]) << 10), + ); + Ok(5) + } + + _ => panic!( + "This is a bug in the program. 
There should only be values between 0..3" + ), + } + } + } + } + + fn section_type(raw: u8) -> Result { + let t = raw & 0x3; + match t { + 0 => Ok(LiteralsSectionType::Raw), + 1 => Ok(LiteralsSectionType::RLE), + 2 => Ok(LiteralsSectionType::Compressed), + 3 => Ok(LiteralsSectionType::Treeless), + other => Err(LiteralsSectionParseError::IllegalLiteralSectionType { got: other }), + } + } +} diff --git a/vendor/ruzstd/src/blocks/mod.rs b/vendor/ruzstd/src/blocks/mod.rs new file mode 100644 index 000000000..d12a1866d --- /dev/null +++ b/vendor/ruzstd/src/blocks/mod.rs @@ -0,0 +1,3 @@ +pub mod block; +pub mod literals_section; +pub mod sequence_section; diff --git a/vendor/ruzstd/src/blocks/sequence_section.rs b/vendor/ruzstd/src/blocks/sequence_section.rs new file mode 100644 index 000000000..28b9efff2 --- /dev/null +++ b/vendor/ruzstd/src/blocks/sequence_section.rs @@ -0,0 +1,127 @@ +pub struct SequencesHeader { + pub num_sequences: u32, + pub modes: Option, +} + +#[derive(Clone, Copy)] +pub struct Sequence { + pub ll: u32, + pub ml: u32, + pub of: u32, +} + +impl std::fmt::Display for Sequence { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of) + } +} + +#[derive(Copy, Clone)] +pub struct CompressionModes(u8); +pub enum ModeType { + Predefined, + RLE, + FSECompressed, + Repeat, +} + +impl CompressionModes { + pub fn decode_mode(m: u8) -> ModeType { + match m { + 0 => ModeType::Predefined, + 1 => ModeType::RLE, + 2 => ModeType::FSECompressed, + 3 => ModeType::Repeat, + _ => panic!("This can never happen"), + } + } + + pub fn ll_mode(self) -> ModeType { + Self::decode_mode(self.0 >> 6) + } + + pub fn of_mode(self) -> ModeType { + Self::decode_mode((self.0 >> 4) & 0x3) + } + + pub fn ml_mode(self) -> ModeType { + Self::decode_mode((self.0 >> 2) & 0x3) + } +} + +impl Default for SequencesHeader { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug, 
thiserror::Error)] +#[non_exhaustive] +pub enum SequencesHeaderParseError { + #[error("source must have at least {need_at_least} bytes to parse header; got {got} bytes")] + NotEnoughBytes { need_at_least: u8, got: usize }, +} + +impl SequencesHeader { + pub fn new() -> SequencesHeader { + SequencesHeader { + num_sequences: 0, + modes: None, + } + } + + pub fn parse_from_header(&mut self, source: &[u8]) -> Result { + let mut bytes_read = 0; + if source.is_empty() { + return Err(SequencesHeaderParseError::NotEnoughBytes { + need_at_least: 1, + got: 0, + }); + } + + let source = match source[0] { + 0 => { + self.num_sequences = 0; + return Ok(1); + } + 1..=127 => { + if source.len() < 2 { + return Err(SequencesHeaderParseError::NotEnoughBytes { + need_at_least: 2, + got: source.len(), + }); + } + self.num_sequences = u32::from(source[0]); + bytes_read += 1; + &source[1..] + } + 128..=254 => { + if source.len() < 3 { + return Err(SequencesHeaderParseError::NotEnoughBytes { + need_at_least: 3, + got: source.len(), + }); + } + self.num_sequences = ((u32::from(source[0]) - 128) << 8) + u32::from(source[1]); + bytes_read += 2; + &source[2..] + } + 255 => { + if source.len() < 4 { + return Err(SequencesHeaderParseError::NotEnoughBytes { + need_at_least: 4, + got: source.len(), + }); + } + self.num_sequences = u32::from(source[1]) + (u32::from(source[2]) << 8) + 0x7F00; + bytes_read += 3; + &source[3..] + } + }; + + self.modes = Some(CompressionModes(source[0])); + bytes_read += 1; + + Ok(bytes_read) + } +} diff --git a/vendor/ruzstd/src/decoding/bit_reader.rs b/vendor/ruzstd/src/decoding/bit_reader.rs new file mode 100644 index 000000000..e07ac45c4 --- /dev/null +++ b/vendor/ruzstd/src/decoding/bit_reader.rs @@ -0,0 +1,107 @@ +pub struct BitReader<'s> { + idx: usize, //index counts bits already read + source: &'s [u8], +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum GetBitsError { + #[error("Cant serve this request. 
The reader is limited to {limit} bits, requested {num_requested_bits} bits")] + TooManyBits { + num_requested_bits: usize, + limit: u8, + }, + #[error("Can't read {requested} bits, only have {remaining} bits left")] + NotEnoughRemainingBits { requested: usize, remaining: usize }, +} + +impl<'s> BitReader<'s> { + pub fn new(source: &'s [u8]) -> BitReader<'_> { + BitReader { idx: 0, source } + } + + pub fn bits_left(&self) -> usize { + self.source.len() * 8 - self.idx + } + + pub fn bits_read(&self) -> usize { + self.idx + } + + pub fn return_bits(&mut self, n: usize) { + if n > self.idx { + panic!("Cant return this many bits"); + } + self.idx -= n; + } + + pub fn get_bits(&mut self, n: usize) -> Result { + if n > 64 { + return Err(GetBitsError::TooManyBits { + num_requested_bits: n, + limit: 64, + }); + } + if self.bits_left() < n { + return Err(GetBitsError::NotEnoughRemainingBits { + requested: n, + remaining: self.bits_left(), + }); + } + + let old_idx = self.idx; + + let bits_left_in_current_byte = 8 - (self.idx % 8); + let bits_not_needed_in_current_byte = 8 - bits_left_in_current_byte; + + //collect bits from the currently pointed to byte + let mut value = u64::from(self.source[self.idx / 8] >> bits_not_needed_in_current_byte); + + if bits_left_in_current_byte >= n { + //no need for fancy stuff + + //just mask all but the needed n bit + value &= (1 << n) - 1; + self.idx += n; + } else { + self.idx += bits_left_in_current_byte; + + //n spans over multiple bytes + let full_bytes_needed = (n - bits_left_in_current_byte) / 8; + let bits_in_last_byte_needed = n - bits_left_in_current_byte - full_bytes_needed * 8; + + assert!( + bits_left_in_current_byte + full_bytes_needed * 8 + bits_in_last_byte_needed == n + ); + + let mut bit_shift = bits_left_in_current_byte; //this many bits are already set in value + + assert!(self.idx % 8 == 0); + + //collect full bytes + for _ in 0..full_bytes_needed { + value |= u64::from(self.source[self.idx / 8]) << bit_shift; + self.idx 
+= 8; + bit_shift += 8; + } + + assert!(n - bit_shift == bits_in_last_byte_needed); + + if bits_in_last_byte_needed > 0 { + let val_las_byte = + u64::from(self.source[self.idx / 8]) & ((1 << bits_in_last_byte_needed) - 1); + value |= val_las_byte << bit_shift; + self.idx += bits_in_last_byte_needed; + } + } + + assert!(self.idx == old_idx + n); + + Ok(value) + } + + pub fn reset(&mut self, new_source: &'s [u8]) { + self.idx = 0; + self.source = new_source; + } +} diff --git a/vendor/ruzstd/src/decoding/bit_reader_reverse.rs b/vendor/ruzstd/src/decoding/bit_reader_reverse.rs new file mode 100644 index 000000000..5bc5a2a74 --- /dev/null +++ b/vendor/ruzstd/src/decoding/bit_reader_reverse.rs @@ -0,0 +1,258 @@ +pub use super::bit_reader::GetBitsError; +use byteorder::ByteOrder; +use byteorder::LittleEndian; + +pub struct BitReaderReversed<'s> { + idx: isize, //index counts bits already read + source: &'s [u8], + + bit_container: u64, + bits_in_container: u8, +} + +impl<'s> BitReaderReversed<'s> { + pub fn bits_remaining(&self) -> isize { + self.idx + self.bits_in_container as isize + } + + pub fn new(source: &'s [u8]) -> BitReaderReversed<'_> { + BitReaderReversed { + idx: source.len() as isize * 8, + source, + bit_container: 0, + bits_in_container: 0, + } + } + + /// We refill the container in full bytes, shifting the still unread portion to the left, and filling the lower bits with new data + #[inline(always)] + fn refill_container(&mut self) { + let byte_idx = self.byte_idx() as usize; + + let retain_bytes = (self.bits_in_container + 7) / 8; + let want_to_read_bits = 64 - (retain_bytes * 8); + + // if there are >= 8 byte left to read we go a fast path: + // The slice is looking something like this |U..UCCCCCCCCR..R| Where U are some unread bytes, C are the bytes in the container, and R are already read bytes + // What we do is, we shift the container by a few bytes to the left by just reading a u64 from the correct position, rereading the portion we did not yet 
return from the conainer. + // Technically this would still work for positions lower than 8 but this guarantees that enough bytes are in the source and generally makes for less edge cases + if byte_idx >= 8 { + self.refill_fast(byte_idx, retain_bytes, want_to_read_bits) + } else { + // In the slow path we just read however many bytes we can + self.refill_slow(byte_idx, want_to_read_bits) + } + } + + #[inline(always)] + fn refill_fast(&mut self, byte_idx: usize, retain_bytes: u8, want_to_read_bits: u8) { + let load_from_byte_idx = byte_idx - 7 + retain_bytes as usize; + let refill = LittleEndian::read_u64(&self.source[load_from_byte_idx..]); + self.bit_container = refill; + self.bits_in_container += want_to_read_bits; + self.idx -= want_to_read_bits as isize; + } + + #[cold] + fn refill_slow(&mut self, byte_idx: usize, want_to_read_bits: u8) { + let can_read_bits = isize::min(want_to_read_bits as isize, self.idx); + let can_read_bytes = can_read_bits / 8; + + match can_read_bytes { + 8 => { + self.bit_container = LittleEndian::read_u64(&self.source[byte_idx - 7..]); + self.bits_in_container += 64; + self.idx -= 64; + } + 6..=7 => { + self.bit_container <<= 48; + self.bits_in_container += 48; + self.bit_container |= LittleEndian::read_u48(&self.source[byte_idx - 5..]); + self.idx -= 48; + } + 4..=5 => { + self.bit_container <<= 32; + self.bits_in_container += 32; + self.bit_container |= + u64::from(LittleEndian::read_u32(&self.source[byte_idx - 3..])); + self.idx -= 32; + } + 2..=3 => { + self.bit_container <<= 16; + self.bits_in_container += 16; + self.bit_container |= + u64::from(LittleEndian::read_u16(&self.source[byte_idx - 1..])); + self.idx -= 16; + } + 1 => { + self.bit_container <<= 8; + self.bits_in_container += 8; + self.bit_container |= u64::from(self.source[byte_idx]); + self.idx -= 8; + } + _ => { + panic!("This cannot be reached"); + } + } + } + + /// Next byte that should be read into the container + /// Negative values mean that the source buffer as 
been read into the container completetly. + fn byte_idx(&self) -> isize { + (self.idx - 1) / 8 + } + + #[inline(always)] + pub fn get_bits(&mut self, n: u8) -> Result { + if n == 0 { + return Ok(0); + } + if self.bits_in_container >= n { + return Ok(self.get_bits_unchecked(n)); + } + + self.get_bits_cold(n) + } + + #[cold] + fn get_bits_cold(&mut self, n: u8) -> Result { + if n > 56 { + return Err(GetBitsError::TooManyBits { + num_requested_bits: usize::from(n), + limit: 56, + }); + } + + let signed_n = n as isize; + + if self.bits_remaining() <= 0 { + self.idx -= signed_n; + return Ok(0); + } + + if self.bits_remaining() < signed_n { + let emulated_read_shift = signed_n - self.bits_remaining(); + let v = self.get_bits(self.bits_remaining() as u8)?; + debug_assert!(self.idx == 0); + let value = v << emulated_read_shift; + self.idx -= emulated_read_shift; + return Ok(value); + } + + while (self.bits_in_container < n) && self.idx > 0 { + self.refill_container(); + } + + debug_assert!(self.bits_in_container >= n); + + //if we reach this point there are enough bits in the container + + Ok(self.get_bits_unchecked(n)) + } + + #[inline(always)] + pub fn get_bits_triple( + &mut self, + n1: u8, + n2: u8, + n3: u8, + ) -> Result<(u64, u64, u64), GetBitsError> { + let sum = n1 as usize + n2 as usize + n3 as usize; + if sum == 0 { + return Ok((0, 0, 0)); + } + if sum > 56 { + // try and get the values separatly + return Ok((self.get_bits(n1)?, self.get_bits(n2)?, self.get_bits(n3)?)); + } + let sum = sum as u8; + + if self.bits_in_container >= sum { + let v1 = if n1 == 0 { + 0 + } else { + self.get_bits_unchecked(n1) + }; + let v2 = if n2 == 0 { + 0 + } else { + self.get_bits_unchecked(n2) + }; + let v3 = if n3 == 0 { + 0 + } else { + self.get_bits_unchecked(n3) + }; + + return Ok((v1, v2, v3)); + } + + self.get_bits_triple_cold(n1, n2, n3, sum) + } + + #[cold] + fn get_bits_triple_cold( + &mut self, + n1: u8, + n2: u8, + n3: u8, + sum: u8, + ) -> Result<(u64, u64, u64), 
GetBitsError> { + let sum_signed = sum as isize; + + if self.bits_remaining() <= 0 { + self.idx -= sum_signed; + return Ok((0, 0, 0)); + } + + if self.bits_remaining() < sum_signed { + return Ok((self.get_bits(n1)?, self.get_bits(n2)?, self.get_bits(n3)?)); + } + + while (self.bits_in_container < sum) && self.idx > 0 { + self.refill_container(); + } + + debug_assert!(self.bits_in_container >= sum); + + //if we reach this point there are enough bits in the container + + let v1 = if n1 == 0 { + 0 + } else { + self.get_bits_unchecked(n1) + }; + let v2 = if n2 == 0 { + 0 + } else { + self.get_bits_unchecked(n2) + }; + let v3 = if n3 == 0 { + 0 + } else { + self.get_bits_unchecked(n3) + }; + + Ok((v1, v2, v3)) + } + + #[inline(always)] + fn get_bits_unchecked(&mut self, n: u8) -> u64 { + let shift_by = self.bits_in_container - n; + let mask = (1u64 << n) - 1u64; + + let value = self.bit_container >> shift_by; + self.bits_in_container -= n; + let value_masked = value & mask; + debug_assert!(value_masked < (1 << n)); + + value_masked + } + + pub fn reset(&mut self, new_source: &'s [u8]) { + self.idx = new_source.len() as isize * 8; + self.source = new_source; + self.bit_container = 0; + self.bits_in_container = 0; + } +} diff --git a/vendor/ruzstd/src/decoding/block_decoder.rs b/vendor/ruzstd/src/decoding/block_decoder.rs new file mode 100644 index 000000000..11a4c28c1 --- /dev/null +++ b/vendor/ruzstd/src/decoding/block_decoder.rs @@ -0,0 +1,378 @@ +use super::super::blocks::block::BlockHeader; +use super::super::blocks::block::BlockType; +use super::super::blocks::literals_section::LiteralsSection; +use super::super::blocks::literals_section::LiteralsSectionType; +use super::super::blocks::sequence_section::SequencesHeader; +use super::literals_section_decoder::{decode_literals, DecompressLiteralsError}; +use super::sequence_execution::ExecuteSequencesError; +use super::sequence_section_decoder::decode_sequences; +use 
super::sequence_section_decoder::DecodeSequenceError; +use crate::blocks::literals_section::LiteralsSectionParseError; +use crate::blocks::sequence_section::SequencesHeaderParseError; +use crate::decoding::scratch::DecoderScratch; +use crate::decoding::sequence_execution::execute_sequences; +use std::io::{self, Read}; + +pub struct BlockDecoder { + header_buffer: [u8; 3], + internal_state: DecoderState, +} + +enum DecoderState { + ReadyToDecodeNextHeader, + ReadyToDecodeNextBody, + #[allow(dead_code)] + Failed, //TODO put "self.internal_state = DecoderState::Failed;" everywhere an unresolvable error occurs +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum BlockHeaderReadError { + #[error("Error while reading the block header")] + ReadError(#[from] io::Error), + #[error("Reserved block occured. This is considered corruption by the documentation")] + FoundReservedBlock, + #[error("Error getting block type: {0}")] + BlockTypeError(#[from] BlockTypeError), + #[error("Error getting block content size: {0}")] + BlockSizeError(#[from] BlockSizeError), +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum BlockTypeError { + #[error( + "Invalid Blocktype number. Is: {num} Should be one of: 0, 1, 2, 3 (3 is reserved though" + )] + InvalidBlocktypeNumber { num: u8 }, +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum BlockSizeError { + #[error("Blocksize was bigger than the absolute maximum {ABSOLUTE_MAXIMUM_BLOCK_SIZE} (128kb). Is: {size}")] + BlockSizeTooLarge { size: u32 }, +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum DecompressBlockError { + #[error("Error while reading the block content: {0}")] + BlockContentReadError(#[from] io::Error), + #[error("Malformed section header. 
Says literals would be this long: {expected_len} but there are only {remaining_bytes} bytes left")] + MalformedSectionHeader { + expected_len: usize, + remaining_bytes: usize, + }, + #[error(transparent)] + DecompressLiteralsError(#[from] DecompressLiteralsError), + #[error(transparent)] + LiteralsSectionParseError(#[from] LiteralsSectionParseError), + #[error(transparent)] + SequencesHeaderParseError(#[from] SequencesHeaderParseError), + #[error(transparent)] + DecodeSequenceError(#[from] DecodeSequenceError), + #[error(transparent)] + ExecuteSequencesError(#[from] ExecuteSequencesError), +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum DecodeBlockContentError { + #[error("Can't decode next block if failed along the way. Results will be nonsense")] + DecoderStateIsFailed, + #[error("Cant decode next block body, while expecting to decode the header of the previous block. Results will be nonsense")] + ExpectedHeaderOfPreviousBlock, + #[error("Error while reading bytes for {step}: {source}")] + ReadError { + step: BlockType, + #[source] + source: io::Error, + }, + #[error(transparent)] + DecompressBlockError(#[from] DecompressBlockError), +} + +pub fn new() -> BlockDecoder { + BlockDecoder { + internal_state: DecoderState::ReadyToDecodeNextHeader, + header_buffer: [0u8; 3], + } +} + +const ABSOLUTE_MAXIMUM_BLOCK_SIZE: u32 = 128 * 1024; + +impl BlockDecoder { + pub fn decode_block_content( + &mut self, + header: &BlockHeader, + workspace: &mut DecoderScratch, //reuse this as often as possible. 
Not only if the trees are reused but also reuse the allocations when building new trees + mut source: impl Read, + ) -> Result { + match self.internal_state { + DecoderState::ReadyToDecodeNextBody => { /* Happy :) */ } + DecoderState::Failed => return Err(DecodeBlockContentError::DecoderStateIsFailed), + DecoderState::ReadyToDecodeNextHeader => { + return Err(DecodeBlockContentError::ExpectedHeaderOfPreviousBlock) + } + } + + let block_type = header.block_type; + match block_type { + BlockType::RLE => { + const BATCH_SIZE: usize = 512; + let mut buf = [0u8; BATCH_SIZE]; + let full_reads = header.decompressed_size / BATCH_SIZE as u32; + let single_read_size = header.decompressed_size % BATCH_SIZE as u32; + + source.read_exact(&mut buf[0..1]).map_err(|err| { + DecodeBlockContentError::ReadError { + step: block_type, + source: err, + } + })?; + self.internal_state = DecoderState::ReadyToDecodeNextHeader; + + for i in 1..BATCH_SIZE { + buf[i] = buf[0]; + } + + for _ in 0..full_reads { + workspace.buffer.push(&buf[..]); + } + let smaller = &mut buf[..single_read_size as usize]; + workspace.buffer.push(smaller); + + Ok(1) + } + BlockType::Raw => { + const BATCH_SIZE: usize = 128 * 1024; + let mut buf = [0u8; BATCH_SIZE]; + let full_reads = header.decompressed_size / BATCH_SIZE as u32; + let single_read_size = header.decompressed_size % BATCH_SIZE as u32; + + for _ in 0..full_reads { + source.read_exact(&mut buf[..]).map_err(|err| { + DecodeBlockContentError::ReadError { + step: block_type, + source: err, + } + })?; + workspace.buffer.push(&buf[..]); + } + + let smaller = &mut buf[..single_read_size as usize]; + source + .read_exact(smaller) + .map_err(|err| DecodeBlockContentError::ReadError { + step: block_type, + source: err, + })?; + workspace.buffer.push(smaller); + + self.internal_state = DecoderState::ReadyToDecodeNextHeader; + Ok(u64::from(header.decompressed_size)) + } + + BlockType::Reserved => { + panic!("How did you even get this. 
The decoder should error out if it detects a reserved-type block"); + } + + BlockType::Compressed => { + self.decompress_block(header, workspace, source)?; + + self.internal_state = DecoderState::ReadyToDecodeNextHeader; + Ok(u64::from(header.content_size)) + } + } + } + + fn decompress_block( + &mut self, + header: &BlockHeader, + workspace: &mut DecoderScratch, //reuse this as often as possible. Not only if the trees are reused but also reuse the allocations when building new trees + mut source: impl Read, + ) -> Result<(), DecompressBlockError> { + workspace + .block_content_buffer + .resize(header.content_size as usize, 0); + + source.read_exact(workspace.block_content_buffer.as_mut_slice())?; + let raw = workspace.block_content_buffer.as_slice(); + + let mut section = LiteralsSection::new(); + let bytes_in_literals_header = section.parse_from_header(raw)?; + let raw = &raw[bytes_in_literals_header as usize..]; + if crate::VERBOSE { + println!( + "Found {} literalssection with regenerated size: {}, and compressed size: {:?}", + section.ls_type, section.regenerated_size, section.compressed_size + ); + } + + let upper_limit_for_literals = match section.compressed_size { + Some(x) => x as usize, + None => match section.ls_type { + LiteralsSectionType::RLE => 1, + LiteralsSectionType::Raw => section.regenerated_size as usize, + _ => panic!("Bug in this library"), + }, + }; + + if raw.len() < upper_limit_for_literals { + return Err(DecompressBlockError::MalformedSectionHeader { + expected_len: upper_limit_for_literals, + remaining_bytes: raw.len(), + }); + } + + let raw_literals = &raw[..upper_limit_for_literals]; + if crate::VERBOSE { + println!("Slice for literals: {}", raw_literals.len()); + } + + workspace.literals_buffer.clear(); //all literals of the previous block must have been used in the sequence execution anyways. 
just be defensive here + let bytes_used_in_literals_section = decode_literals( + §ion, + &mut workspace.huf, + raw_literals, + &mut workspace.literals_buffer, + )?; + assert!( + section.regenerated_size == workspace.literals_buffer.len() as u32, + "Wrong number of literals: {}, Should have been: {}", + workspace.literals_buffer.len(), + section.regenerated_size + ); + assert!(bytes_used_in_literals_section == upper_limit_for_literals as u32); + + let raw = &raw[upper_limit_for_literals..]; + if crate::VERBOSE { + println!("Slice for sequences with headers: {}", raw.len()); + } + + let mut seq_section = SequencesHeader::new(); + let bytes_in_sequence_header = seq_section.parse_from_header(raw)?; + let raw = &raw[bytes_in_sequence_header as usize..]; + if crate::VERBOSE { + println!( + "Found sequencessection with sequences: {} and size: {}", + seq_section.num_sequences, + raw.len() + ); + } + + assert!( + u32::from(bytes_in_literals_header) + + bytes_used_in_literals_section + + u32::from(bytes_in_sequence_header) + + raw.len() as u32 + == header.content_size + ); + if crate::VERBOSE { + println!("Slice for sequences: {}", raw.len()); + } + + if seq_section.num_sequences != 0 { + decode_sequences( + &seq_section, + raw, + &mut workspace.fse, + &mut workspace.sequences, + )?; + if crate::VERBOSE { + println!("Executing sequences"); + } + execute_sequences(workspace)?; + } else { + workspace.buffer.push(&workspace.literals_buffer); + workspace.sequences.clear(); + } + + Ok(()) + } + + pub fn read_block_header( + &mut self, + mut r: impl Read, + ) -> Result<(BlockHeader, u8), BlockHeaderReadError> { + //match self.internal_state { + // DecoderState::ReadyToDecodeNextHeader => {/* Happy :) */}, + // DecoderState::Failed => return Err(format!("Cant decode next block if failed along the way. Results will be nonsense")), + // DecoderState::ReadyToDecodeNextBody => return Err(format!("Cant decode next block header, while expecting to decode the body of the previous block. 
Results will be nonsense")), + //} + + r.read_exact(&mut self.header_buffer[0..3])?; + + let btype = self.block_type()?; + if let BlockType::Reserved = btype { + return Err(BlockHeaderReadError::FoundReservedBlock); + } + + let block_size = self.block_content_size()?; + let decompressed_size = match btype { + BlockType::Raw => block_size, + BlockType::RLE => block_size, + BlockType::Reserved => 0, //should be catched above, this is an error state + BlockType::Compressed => 0, //unknown but will be smaller than 128kb (or window_size if that is smaller than 128kb) + }; + let content_size = match btype { + BlockType::Raw => block_size, + BlockType::Compressed => block_size, + BlockType::RLE => 1, + BlockType::Reserved => 0, //should be catched above, this is an error state + }; + + let last_block = self.is_last(); + + self.reset_buffer(); + self.internal_state = DecoderState::ReadyToDecodeNextBody; + + //just return 3. Blockheaders always take 3 bytes + Ok(( + BlockHeader { + last_block, + block_type: btype, + decompressed_size, + content_size, + }, + 3, + )) + } + + fn reset_buffer(&mut self) { + self.header_buffer[0] = 0; + self.header_buffer[1] = 0; + self.header_buffer[2] = 0; + } + + fn is_last(&self) -> bool { + self.header_buffer[0] & 0x1 == 1 + } + + fn block_type(&self) -> Result { + let t = (self.header_buffer[0] >> 1) & 0x3; + match t { + 0 => Ok(BlockType::Raw), + 1 => Ok(BlockType::RLE), + 2 => Ok(BlockType::Compressed), + 3 => Ok(BlockType::Reserved), + other => Err(BlockTypeError::InvalidBlocktypeNumber { num: other }), + } + } + + fn block_content_size(&self) -> Result { + let val = self.block_content_size_unchecked(); + if val > ABSOLUTE_MAXIMUM_BLOCK_SIZE { + Err(BlockSizeError::BlockSizeTooLarge { size: val }) + } else { + Ok(val) + } + } + + fn block_content_size_unchecked(&self) -> u32 { + u32::from(self.header_buffer[0] >> 3) //push out type and last_block flags. 
Retain 5 bit + | (u32::from(self.header_buffer[1]) << 5) + | (u32::from(self.header_buffer[2]) << 13) + } +} diff --git a/vendor/ruzstd/src/decoding/decodebuffer.rs b/vendor/ruzstd/src/decoding/decodebuffer.rs new file mode 100644 index 000000000..0dea7015e --- /dev/null +++ b/vendor/ruzstd/src/decoding/decodebuffer.rs @@ -0,0 +1,423 @@ +use std::hash::Hasher; +use std::io; + +use twox_hash::XxHash64; + +use super::ringbuffer::RingBuffer; + +pub struct Decodebuffer { + buffer: RingBuffer, + pub dict_content: Vec, + + pub window_size: usize, + total_output_counter: u64, + pub hash: XxHash64, +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum DecodebufferError { + #[error("Need {need} bytes from the dictionary but it is only {got} bytes long")] + NotEnoughBytesInDictionary { got: usize, need: usize }, + #[error("offset: {offset} bigger than buffer: {buf_len}")] + OffsetTooBig { offset: usize, buf_len: usize }, +} + +impl io::Read for Decodebuffer { + fn read(&mut self, target: &mut [u8]) -> io::Result { + let max_amount = self.can_drain_to_window_size().unwrap_or(0); + let amount = max_amount.min(target.len()); + + let mut written = 0; + self.drain_to(amount, |buf| { + target[written..][..buf.len()].copy_from_slice(buf); + written += buf.len(); + (buf.len(), Ok(())) + })?; + Ok(amount) + } +} + +impl Decodebuffer { + pub fn new(window_size: usize) -> Decodebuffer { + Decodebuffer { + buffer: RingBuffer::new(), + dict_content: Vec::new(), + window_size, + total_output_counter: 0, + hash: XxHash64::with_seed(0), + } + } + + pub fn reset(&mut self, window_size: usize) { + self.window_size = window_size; + self.buffer.clear(); + self.buffer.reserve(self.window_size); + self.dict_content.clear(); + self.total_output_counter = 0; + self.hash = XxHash64::with_seed(0); + } + + pub fn len(&self) -> usize { + self.buffer.len() + } + + pub fn is_empty(&self) -> bool { + self.buffer.is_empty() + } + + pub fn push(&mut self, data: &[u8]) { + 
self.buffer.extend(data); + self.total_output_counter += data.len() as u64; + } + + pub fn repeat(&mut self, offset: usize, match_length: usize) -> Result<(), DecodebufferError> { + if offset > self.buffer.len() { + if self.total_output_counter <= self.window_size as u64 { + // at least part of that repeat is from the dictionary content + let bytes_from_dict = offset - self.buffer.len(); + + if bytes_from_dict > self.dict_content.len() { + return Err(DecodebufferError::NotEnoughBytesInDictionary { + got: self.dict_content.len(), + need: bytes_from_dict, + }); + } + + if bytes_from_dict < match_length { + let dict_slice = + &self.dict_content[self.dict_content.len() - bytes_from_dict..]; + self.buffer.extend(dict_slice); + + self.total_output_counter += bytes_from_dict as u64; + return self.repeat(self.buffer.len(), match_length - bytes_from_dict); + } else { + let low = self.dict_content.len() - bytes_from_dict; + let high = low + match_length; + let dict_slice = &self.dict_content[low..high]; + self.buffer.extend(dict_slice); + } + } else { + return Err(DecodebufferError::OffsetTooBig { + offset, + buf_len: self.buffer.len(), + }); + } + } else { + let buf_len = self.buffer.len(); + let start_idx = buf_len - offset; + let end_idx = start_idx + match_length; + + self.buffer.reserve(match_length); + if end_idx > buf_len { + // We need to copy in chunks. + // We have at max offset bytes in one chunk, the last one can be smaller + let mut start_idx = start_idx; + let mut copied_counter_left = match_length; + // TODO this can be optimized further I think. 
+ // Each time we copy a chunk we have a repetiton of length 'offset', so we can copy offset * iteration many bytes from start_idx + while copied_counter_left > 0 { + let chunksize = usize::min(offset, copied_counter_left); + + // SAFETY: + // we know that start_idx <= buf_len and start_idx + offset == buf_len and we reserverd match_length space + unsafe { + self.buffer + .extend_from_within_unchecked(start_idx, chunksize) + }; + copied_counter_left -= chunksize; + start_idx += chunksize; + } + } else { + // can just copy parts of the existing buffer + // SAFETY: + // we know that start_idx and end_idx <= buf_len and we reserverd match_length space + unsafe { + self.buffer + .extend_from_within_unchecked(start_idx, match_length) + }; + } + + self.total_output_counter += match_length as u64; + } + + Ok(()) + } + + // Check if and how many bytes can currently be drawn from the buffer + pub fn can_drain_to_window_size(&self) -> Option { + if self.buffer.len() > self.window_size { + Some(self.buffer.len() - self.window_size) + } else { + None + } + } + + //How many bytes can be drained if the window_size does not have to be maintained + pub fn can_drain(&self) -> usize { + self.buffer.len() + } + + //drain as much as possible while retaining enough so that decoding si still possible with the required window_size + //At best call only if can_drain_to_window_size reports a 'high' number of bytes to reduce allocations + pub fn drain_to_window_size(&mut self) -> Option> { + //TODO investigate if it is possible to return the std::vec::Drain iterator directly without collecting here + match self.can_drain_to_window_size() { + None => None, + Some(can_drain) => { + let mut vec = Vec::with_capacity(can_drain); + self.drain_to(can_drain, |buf| { + vec.extend_from_slice(buf); + (buf.len(), Ok(())) + }) + .ok()?; + Some(vec) + } + } + } + + pub fn drain_to_window_size_writer(&mut self, mut sink: impl io::Write) -> io::Result { + match self.can_drain_to_window_size() { + None => 
Ok(0), + Some(can_drain) => { + self.drain_to(can_drain, |buf| write_all_bytes(&mut sink, buf))?; + Ok(can_drain) + } + } + } + + //drain the buffer completely + pub fn drain(&mut self) -> Vec { + let (slice1, slice2) = self.buffer.as_slices(); + self.hash.write(slice1); + self.hash.write(slice2); + + let mut vec = Vec::with_capacity(slice1.len() + slice2.len()); + vec.extend_from_slice(slice1); + vec.extend_from_slice(slice2); + self.buffer.clear(); + vec + } + + pub fn drain_to_writer(&mut self, mut sink: impl io::Write) -> io::Result { + let len = self.buffer.len(); + self.drain_to(len, |buf| write_all_bytes(&mut sink, buf))?; + + Ok(len) + } + + pub fn read_all(&mut self, target: &mut [u8]) -> io::Result { + let amount = self.buffer.len().min(target.len()); + + let mut written = 0; + self.drain_to(amount, |buf| { + target[written..][..buf.len()].copy_from_slice(buf); + written += buf.len(); + (buf.len(), Ok(())) + })?; + Ok(amount) + } + + /// Semantics of write_bytes: + /// Should dump as many of the provided bytes as possible to whatever sink until no bytes are left or an error is encountered + /// Return how many bytes have actually been dumped to the sink. 
+ fn drain_to( + &mut self, + amount: usize, + mut write_bytes: impl FnMut(&[u8]) -> (usize, io::Result<()>), + ) -> io::Result<()> { + if amount == 0 { + return Ok(()); + } + + struct DrainGuard<'a> { + buffer: &'a mut RingBuffer, + amount: usize, + } + + impl<'a> Drop for DrainGuard<'a> { + fn drop(&mut self) { + if self.amount != 0 { + self.buffer.drop_first_n(self.amount); + } + } + } + + let mut drain_guard = DrainGuard { + buffer: &mut self.buffer, + amount: 0, + }; + + let (slice1, slice2) = drain_guard.buffer.as_slices(); + let n1 = slice1.len().min(amount); + let n2 = slice2.len().min(amount - n1); + + if n1 != 0 { + let (written1, res1) = write_bytes(&slice1[..n1]); + self.hash.write(&slice1[..written1]); + drain_guard.amount += written1; + + // Apparently this is what clippy thinks is the best way of expressing this + res1?; + + // Only if the first call to write_bytes was not a partial write we can continue with slice2 + // Partial writes SHOULD never happen without res1 being an error, but lets just protect against it anyways. + if written1 == n1 && n2 != 0 { + let (written2, res2) = write_bytes(&slice2[..n2]); + self.hash.write(&slice2[..written2]); + drain_guard.amount += written2; + + // Apparently this is what clippy thinks is the best way of expressing this + res2?; + } + } + + // Make sure we don't accidentally drop `DrainGuard` earlier. 
+ drop(drain_guard); + + Ok(()) + } +} + +/// Like Write::write_all but returns partial write length even on error +fn write_all_bytes(mut sink: impl io::Write, buf: &[u8]) -> (usize, io::Result<()>) { + let mut written = 0; + while written < buf.len() { + match sink.write(&buf[written..]) { + Ok(w) => written += w, + Err(e) => return (written, Err(e)), + } + } + (written, Ok(())) +} + +#[cfg(test)] +mod tests { + use super::Decodebuffer; + use std::io::Write; + + #[test] + fn short_writer() { + struct ShortWriter { + buf: Vec, + write_len: usize, + } + + impl Write for ShortWriter { + fn write(&mut self, buf: &[u8]) -> std::result::Result { + if buf.len() > self.write_len { + self.buf.extend_from_slice(&buf[..self.write_len]); + Ok(self.write_len) + } else { + self.buf.extend_from_slice(buf); + Ok(buf.len()) + } + } + + fn flush(&mut self) -> std::result::Result<(), std::io::Error> { + Ok(()) + } + } + + let mut short_writer = ShortWriter { + buf: vec![], + write_len: 10, + }; + + let mut decode_buf = Decodebuffer::new(100); + decode_buf.push(b"0123456789"); + decode_buf.repeat(10, 90).unwrap(); + let repeats = 1000; + for _ in 0..repeats { + assert_eq!(decode_buf.len(), 100); + decode_buf.repeat(10, 50).unwrap(); + assert_eq!(decode_buf.len(), 150); + decode_buf + .drain_to_window_size_writer(&mut short_writer) + .unwrap(); + assert_eq!(decode_buf.len(), 100); + } + + assert_eq!(short_writer.buf.len(), repeats * 50); + decode_buf.drain_to_writer(&mut short_writer).unwrap(); + assert_eq!(short_writer.buf.len(), repeats * 50 + 100); + } + + #[test] + fn wouldblock_writer() { + struct WouldblockWriter { + buf: Vec, + last_blocked: usize, + block_every: usize, + } + + impl Write for WouldblockWriter { + fn write(&mut self, buf: &[u8]) -> std::result::Result { + if self.last_blocked < self.block_every { + self.buf.extend_from_slice(buf); + self.last_blocked += 1; + Ok(buf.len()) + } else { + self.last_blocked = 0; + 
Err(std::io::Error::from(std::io::ErrorKind::WouldBlock)) + } + } + + fn flush(&mut self) -> std::result::Result<(), std::io::Error> { + Ok(()) + } + } + + let mut short_writer = WouldblockWriter { + buf: vec![], + last_blocked: 0, + block_every: 5, + }; + + let mut decode_buf = Decodebuffer::new(100); + decode_buf.push(b"0123456789"); + decode_buf.repeat(10, 90).unwrap(); + let repeats = 1000; + for _ in 0..repeats { + assert_eq!(decode_buf.len(), 100); + decode_buf.repeat(10, 50).unwrap(); + assert_eq!(decode_buf.len(), 150); + loop { + match decode_buf.drain_to_window_size_writer(&mut short_writer) { + Ok(written) => { + if written == 0 { + break; + } + } + Err(e) => { + if e.kind() == std::io::ErrorKind::WouldBlock { + continue; + } else { + panic!("Unexpected error {:?}", e); + } + } + } + } + assert_eq!(decode_buf.len(), 100); + } + + assert_eq!(short_writer.buf.len(), repeats * 50); + loop { + match decode_buf.drain_to_writer(&mut short_writer) { + Ok(written) => { + if written == 0 { + break; + } + } + Err(e) => { + if e.kind() == std::io::ErrorKind::WouldBlock { + continue; + } else { + panic!("Unexpected error {:?}", e); + } + } + } + } + assert_eq!(short_writer.buf.len(), repeats * 50 + 100); + } +} diff --git a/vendor/ruzstd/src/decoding/dictionary.rs b/vendor/ruzstd/src/decoding/dictionary.rs new file mode 100644 index 000000000..51fbcdf0b --- /dev/null +++ b/vendor/ruzstd/src/decoding/dictionary.rs @@ -0,0 +1,93 @@ +use std::convert::TryInto; + +use crate::decoding::scratch::FSEScratch; +use crate::decoding::scratch::HuffmanScratch; +use crate::fse::FSETableError; +use crate::huff0::HuffmanTableError; + +pub struct Dictionary { + pub id: u32, + pub fse: FSEScratch, + pub huf: HuffmanScratch, + pub dict_content: Vec, + pub offset_hist: [u32; 3], +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum DictionaryDecodeError { + #[error( + "Bad magic_num at start of the dictionary; Got: {got:#04X?}, Expected: {MAGIC_NUM:#04x?}" + )] + 
BadMagicNum { got: [u8; 4] }, + #[error(transparent)] + FSETableError(#[from] FSETableError), + #[error(transparent)] + HuffmanTableError(#[from] HuffmanTableError), +} + +pub const MAGIC_NUM: [u8; 4] = [0x37, 0xA4, 0x30, 0xEC]; + +impl Dictionary { + /// parses the dictionary and set the tables + /// it returns the dict_id for checking with the frame's dict_id + pub fn decode_dict(raw: &[u8]) -> Result { + let mut new_dict = Dictionary { + id: 0, + fse: FSEScratch::new(), + huf: HuffmanScratch::new(), + dict_content: Vec::new(), + offset_hist: [2, 4, 8], + }; + + let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away"); + if magic_num != MAGIC_NUM { + return Err(DictionaryDecodeError::BadMagicNum { got: magic_num }); + } + + let dict_id = raw[4..8].try_into().expect("optimized away"); + let dict_id = u32::from_le_bytes(dict_id); + new_dict.id = dict_id; + + let raw_tables = &raw[8..]; + + let huf_size = new_dict.huf.table.build_decoder(raw_tables)?; + let raw_tables = &raw_tables[huf_size as usize..]; + + let of_size = new_dict.fse.offsets.build_decoder( + raw_tables, + crate::decoding::sequence_section_decoder::OF_MAX_LOG, + )?; + let raw_tables = &raw_tables[of_size..]; + + let ml_size = new_dict.fse.match_lengths.build_decoder( + raw_tables, + crate::decoding::sequence_section_decoder::ML_MAX_LOG, + )?; + let raw_tables = &raw_tables[ml_size..]; + + let ll_size = new_dict.fse.literal_lengths.build_decoder( + raw_tables, + crate::decoding::sequence_section_decoder::LL_MAX_LOG, + )?; + let raw_tables = &raw_tables[ll_size..]; + + let offset1 = raw_tables[0..4].try_into().expect("optimized away"); + let offset1 = u32::from_le_bytes(offset1); + + let offset2 = raw_tables[4..8].try_into().expect("optimized away"); + let offset2 = u32::from_le_bytes(offset2); + + let offset3 = raw_tables[8..12].try_into().expect("optimized away"); + let offset3 = u32::from_le_bytes(offset3); + + new_dict.offset_hist[0] = offset1; + new_dict.offset_hist[1] = offset2; + 
new_dict.offset_hist[2] = offset3; + + let raw_content = &raw_tables[12..]; + new_dict.dict_content.extend(raw_content); + + Ok(new_dict) + } +} diff --git a/vendor/ruzstd/src/decoding/literals_section_decoder.rs b/vendor/ruzstd/src/decoding/literals_section_decoder.rs new file mode 100644 index 000000000..d947f87eb --- /dev/null +++ b/vendor/ruzstd/src/decoding/literals_section_decoder.rs @@ -0,0 +1,180 @@ +use super::super::blocks::literals_section::{LiteralsSection, LiteralsSectionType}; +use super::bit_reader_reverse::{BitReaderReversed, GetBitsError}; +use super::scratch::HuffmanScratch; +use crate::huff0::{HuffmanDecoder, HuffmanDecoderError, HuffmanTableError}; + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum DecompressLiteralsError { + #[error( + "compressed size was none even though it must be set to something for compressed literals" + )] + MissingCompressedSize, + #[error("num_streams was none even though it must be set to something (1 or 4) for compressed literals")] + MissingNumStreams, + #[error(transparent)] + GetBitsError(#[from] GetBitsError), + #[error(transparent)] + HuffmanTableError(#[from] HuffmanTableError), + #[error(transparent)] + HuffmanDecoderError(#[from] HuffmanDecoderError), + #[error("Tried to reuse huffman table but it was never initialized")] + UninitializedHuffmanTable, + #[error("Need 6 bytes to decode jump header, got {got} bytes")] + MissingBytesForJumpHeader { got: usize }, + #[error("Need at least {needed} bytes to decode literals. Have: {got} bytes")] + MissingBytesForLiterals { got: usize, needed: usize }, + #[error("Padding at the end of the sequence_section was more than a byte long: {skipped_bits} bits. 
Probably caused by data corruption")] + ExtraPadding { skipped_bits: i32 }, + #[error("Bitstream was read till: {read_til}, should have been: {expected}")] + BitstreamReadMismatch { read_til: isize, expected: isize }, + #[error("Did not decode enough literals: {decoded}, Should have been: {expected}")] + DecodedLiteralCountMismatch { decoded: usize, expected: usize }, +} + +pub fn decode_literals( + section: &LiteralsSection, + scratch: &mut HuffmanScratch, + source: &[u8], + target: &mut Vec, +) -> Result { + match section.ls_type { + LiteralsSectionType::Raw => { + target.extend(&source[0..section.regenerated_size as usize]); + Ok(section.regenerated_size) + } + LiteralsSectionType::RLE => { + target.resize(target.len() + section.regenerated_size as usize, source[0]); + Ok(1) + } + LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => { + let bytes_read = decompress_literals(section, scratch, source, target)?; + + //return sum of used bytes + Ok(bytes_read) + } + } +} + +fn decompress_literals( + section: &LiteralsSection, + scratch: &mut HuffmanScratch, + source: &[u8], + target: &mut Vec, +) -> Result { + use DecompressLiteralsError as err; + + let compressed_size = section.compressed_size.ok_or(err::MissingCompressedSize)? 
as usize; + let num_streams = section.num_streams.ok_or(err::MissingNumStreams)?; + + target.reserve(section.regenerated_size as usize); + let source = &source[0..compressed_size]; + let mut bytes_read = 0; + + match section.ls_type { + LiteralsSectionType::Compressed => { + //read Huffman tree description + bytes_read += scratch.table.build_decoder(source)?; + if crate::VERBOSE { + println!("Built huffman table using {} bytes", bytes_read); + } + } + LiteralsSectionType::Treeless => { + if scratch.table.max_num_bits == 0 { + return Err(err::UninitializedHuffmanTable); + } + } + _ => { /* nothing to do, huffman tree has been provided by previous block */ } + } + + let source = &source[bytes_read as usize..]; + + if num_streams == 4 { + //build jumptable + if source.len() < 6 { + return Err(err::MissingBytesForJumpHeader { got: source.len() }); + } + let jump1 = source[0] as usize + ((source[1] as usize) << 8); + let jump2 = jump1 + source[2] as usize + ((source[3] as usize) << 8); + let jump3 = jump2 + source[4] as usize + ((source[5] as usize) << 8); + bytes_read += 6; + let source = &source[6..]; + + if source.len() < jump3 { + return Err(err::MissingBytesForLiterals { + got: source.len(), + needed: jump3, + }); + } + + //decode 4 streams + let stream1 = &source[..jump1]; + let stream2 = &source[jump1..jump2]; + let stream3 = &source[jump2..jump3]; + let stream4 = &source[jump3..]; + + for stream in &[stream1, stream2, stream3, stream4] { + let mut decoder = HuffmanDecoder::new(&scratch.table); + let mut br = BitReaderReversed::new(stream); + //skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found + let mut skipped_bits = 0; + loop { + let val = br.get_bits(1)?; + skipped_bits += 1; + if val == 1 || skipped_bits > 8 { + break; + } + } + if skipped_bits > 8 { + //if more than 7 bits are 0, this is not the correct end of the bitstream. 
Either a bug or corrupted data + return Err(DecompressLiteralsError::ExtraPadding { skipped_bits }); + } + decoder.init_state(&mut br)?; + + while br.bits_remaining() > -(scratch.table.max_num_bits as isize) { + target.push(decoder.decode_symbol()); + decoder.next_state(&mut br)?; + } + if br.bits_remaining() != -(scratch.table.max_num_bits as isize) { + return Err(DecompressLiteralsError::BitstreamReadMismatch { + read_til: br.bits_remaining(), + expected: -(scratch.table.max_num_bits as isize), + }); + } + } + + bytes_read += source.len() as u32; + } else { + //just decode the one stream + assert!(num_streams == 1); + let mut decoder = HuffmanDecoder::new(&scratch.table); + let mut br = BitReaderReversed::new(source); + let mut skipped_bits = 0; + loop { + let val = br.get_bits(1)?; + skipped_bits += 1; + if val == 1 || skipped_bits > 8 { + break; + } + } + if skipped_bits > 8 { + //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data + return Err(DecompressLiteralsError::ExtraPadding { skipped_bits }); + } + decoder.init_state(&mut br)?; + while br.bits_remaining() > -(scratch.table.max_num_bits as isize) { + target.push(decoder.decode_symbol()); + decoder.next_state(&mut br)?; + } + bytes_read += source.len() as u32; + } + + if target.len() != section.regenerated_size as usize { + return Err(DecompressLiteralsError::DecodedLiteralCountMismatch { + decoded: target.len(), + expected: section.regenerated_size as usize, + }); + } + + Ok(bytes_read) +} diff --git a/vendor/ruzstd/src/decoding/mod.rs b/vendor/ruzstd/src/decoding/mod.rs new file mode 100644 index 000000000..b89df3510 --- /dev/null +++ b/vendor/ruzstd/src/decoding/mod.rs @@ -0,0 +1,11 @@ +pub mod bit_reader; +pub mod bit_reader_reverse; +pub mod block_decoder; +pub mod decodebuffer; +pub mod dictionary; +pub mod literals_section_decoder; +mod ringbuffer; +#[allow(dead_code)] +pub mod scratch; +pub mod sequence_execution; +pub mod 
sequence_section_decoder; diff --git a/vendor/ruzstd/src/decoding/ringbuffer.rs b/vendor/ruzstd/src/decoding/ringbuffer.rs new file mode 100644 index 000000000..9e3e9ba5e --- /dev/null +++ b/vendor/ruzstd/src/decoding/ringbuffer.rs @@ -0,0 +1,632 @@ +use std::{alloc::Layout, ptr::NonNull, slice}; + +pub struct RingBuffer { + buf: NonNull, + cap: usize, + head: usize, + tail: usize, +} + +impl RingBuffer { + pub fn new() -> Self { + RingBuffer { + buf: NonNull::dangling(), + cap: 0, + head: 0, + tail: 0, + } + } + + pub fn len(&self) -> usize { + let (x, y) = self.data_slice_lengths(); + x + y + } + + pub fn free(&self) -> usize { + let (x, y) = self.free_slice_lengths(); + (x + y).saturating_sub(1) + } + + pub fn clear(&mut self) { + self.head = 0; + self.tail = 0; + } + + pub fn is_empty(&self) -> bool { + self.head == self.tail + } + + pub fn reserve(&mut self, amount: usize) { + let free = self.free(); + if free >= amount { + return; + } + + self.reserve_amortized(amount - free); + } + + #[inline(never)] + #[cold] + fn reserve_amortized(&mut self, amount: usize) { + // SAFETY: if we were succesfully able to construct this layout when we allocated then it's also valid do so now + let current_layout = unsafe { Layout::array::(self.cap).unwrap_unchecked() }; + + // Always have at least 1 unused element as the sentinel. 
+ let new_cap = usize::max( + self.cap.next_power_of_two(), + (self.cap + amount).next_power_of_two(), + ) + 1; + + // Check that the capacity isn't bigger than isize::MAX, which is the max allowed by LLVM, or that + // we are on a >= 64 bit system which will never allow that much memory to be allocated + #[allow(clippy::assertions_on_constants)] + { + debug_assert!(usize::BITS >= 64 || new_cap < isize::MAX as usize); + } + + let new_layout = Layout::array::(new_cap) + .unwrap_or_else(|_| panic!("Could not create layout for u8 array of size {}", new_cap)); + + // alloc the new memory region and panic if alloc fails + // TODO maybe rework this to generate an error? + let new_buf = unsafe { + let new_buf = std::alloc::alloc(new_layout); + + NonNull::new(new_buf).expect("Allocating new space for the ringbuffer failed") + }; + + // If we had data before, copy it over to the newly alloced memory region + if self.cap > 0 { + let ((s1_ptr, s1_len), (s2_ptr, s2_len)) = self.data_slice_parts(); + + unsafe { + new_buf.as_ptr().copy_from_nonoverlapping(s1_ptr, s1_len); + new_buf + .as_ptr() + .add(s1_len) + .copy_from_nonoverlapping(s2_ptr, s2_len); + std::alloc::dealloc(self.buf.as_ptr(), current_layout); + } + + self.tail = s1_len + s2_len; + self.head = 0; + } + self.buf = new_buf; + self.cap = new_cap; + } + + #[allow(dead_code)] + pub fn push_back(&mut self, byte: u8) { + self.reserve(1); + + unsafe { self.buf.as_ptr().add(self.tail).write(byte) }; + self.tail = (self.tail + 1) % self.cap; + } + + #[allow(dead_code)] + pub fn get(&self, idx: usize) -> Option { + if idx < self.len() { + let idx = (self.head + idx) % self.cap; + Some(unsafe { self.buf.as_ptr().add(idx).read() }) + } else { + None + } + } + + pub fn extend(&mut self, data: &[u8]) { + let len = data.len(); + let ptr = data.as_ptr(); + if len == 0 { + return; + } + + self.reserve(len); + + debug_assert!(self.len() + len <= self.cap - 1); + debug_assert!(self.free() >= len, "free: {} len: {}", self.free(), 
len); + + let ((f1_ptr, f1_len), (f2_ptr, f2_len)) = self.free_slice_parts(); + debug_assert!(f1_len + f2_len >= len, "{} + {} < {}", f1_len, f2_len, len); + + let in_f1 = usize::min(len, f1_len); + let in_f2 = len - in_f1; + + debug_assert!(in_f1 + in_f2 == len); + + unsafe { + if in_f1 > 0 { + f1_ptr.copy_from_nonoverlapping(ptr, in_f1); + } + if in_f2 > 0 { + f2_ptr.copy_from_nonoverlapping(ptr.add(in_f1), in_f2); + } + } + self.tail = (self.tail + len) % self.cap; + } + + pub fn drop_first_n(&mut self, amount: usize) { + debug_assert!(amount <= self.len()); + let amount = usize::min(amount, self.len()); + self.head = (self.head + amount) % self.cap; + } + + fn data_slice_lengths(&self) -> (usize, usize) { + let len_after_head; + let len_to_tail; + + // TODO can we do this branchless? + if self.tail >= self.head { + len_after_head = self.tail - self.head; + len_to_tail = 0; + } else { + len_after_head = self.cap - self.head; + len_to_tail = self.tail; + } + (len_after_head, len_to_tail) + } + + fn data_slice_parts(&self) -> ((*const u8, usize), (*const u8, usize)) { + let (len_after_head, len_to_tail) = self.data_slice_lengths(); + + ( + (unsafe { self.buf.as_ptr().add(self.head) }, len_after_head), + (self.buf.as_ptr(), len_to_tail), + ) + } + pub fn as_slices(&self) -> (&[u8], &[u8]) { + let (s1, s2) = self.data_slice_parts(); + unsafe { + let s1 = slice::from_raw_parts(s1.0, s1.1); + let s2 = slice::from_raw_parts(s2.0, s2.1); + (s1, s2) + } + } + + fn free_slice_lengths(&self) -> (usize, usize) { + let len_to_head; + let len_after_tail; + + // TODO can we do this branchless? 
+ if self.tail < self.head { + len_after_tail = self.head - self.tail; + len_to_head = 0; + } else { + len_after_tail = self.cap - self.tail; + len_to_head = self.head; + } + (len_to_head, len_after_tail) + } + + fn free_slice_parts(&self) -> ((*mut u8, usize), (*mut u8, usize)) { + let (len_to_head, len_after_tail) = self.free_slice_lengths(); + + ( + (unsafe { self.buf.as_ptr().add(self.tail) }, len_after_tail), + (self.buf.as_ptr(), len_to_head), + ) + } + + #[allow(dead_code)] + pub fn extend_from_within(&mut self, start: usize, len: usize) { + if start + len > self.len() { + panic!( + "Calls to this functions must respect start ({}) + len ({}) <= self.len() ({})!", + start, + len, + self.len() + ); + } + + self.reserve(len); + unsafe { self.extend_from_within_unchecked(start, len) } + } + + /// SAFETY: + /// Needs start + len <= self.len() + /// And more then len reserved space + #[warn(unsafe_op_in_unsafe_fn)] + pub unsafe fn extend_from_within_unchecked(&mut self, start: usize, len: usize) { + debug_assert!(!self.buf.as_ptr().is_null()); + + if self.head < self.tail { + // continous data slice |____HDDDDDDDT_____| + let after_tail = usize::min(len, self.cap - self.tail); + unsafe { + self.buf + .as_ptr() + .add(self.tail) + .copy_from_nonoverlapping(self.buf.as_ptr().add(self.head + start), after_tail); + if after_tail < len { + self.buf.as_ptr().copy_from_nonoverlapping( + self.buf.as_ptr().add(self.head + start + after_tail), + len - after_tail, + ); + } + } + } else { + // continous free slice |DDDT_________HDDDD| + if self.head + start > self.cap { + let start = (self.head + start) % self.cap; + unsafe { + self.buf + .as_ptr() + .add(self.tail) + .copy_from_nonoverlapping(self.buf.as_ptr().add(start), len) + } + } else { + let after_start = usize::min(len, self.cap - self.head - start); + unsafe { + self.buf.as_ptr().add(self.tail).copy_from_nonoverlapping( + self.buf.as_ptr().add(self.head + start), + after_start, + ); + if after_start < len { + 
self.buf + .as_ptr() + .add(self.tail + after_start) + .copy_from_nonoverlapping(self.buf.as_ptr(), len - after_start); + } + } + } + } + + self.tail = (self.tail + len) % self.cap; + } + + #[allow(dead_code)] + /// SAFETY: + /// Needs start + len <= self.len() + /// And more then len reserved space + pub unsafe fn extend_from_within_unchecked_branchless(&mut self, start: usize, len: usize) { + // data slices in raw parts + let ((s1_ptr, s1_len), (s2_ptr, s2_len)) = self.data_slice_parts(); + + debug_assert!(len <= s1_len + s2_len, "{} > {} + {}", len, s1_len, s2_len); + + // calc the actually wanted slices in raw parts + let start_in_s1 = usize::min(s1_len, start); + let end_in_s1 = usize::min(s1_len, start + len); + let m1_ptr = s1_ptr.add(start_in_s1); + let m1_len = end_in_s1 - start_in_s1; + + debug_assert!(end_in_s1 <= s1_len); + debug_assert!(start_in_s1 <= s1_len); + + let start_in_s2 = start.saturating_sub(s1_len); + let end_in_s2 = start_in_s2 + (len - m1_len); + let m2_ptr = s2_ptr.add(start_in_s2); + let m2_len = end_in_s2 - start_in_s2; + + debug_assert!(start_in_s2 <= s2_len); + debug_assert!(end_in_s2 <= s2_len); + + debug_assert_eq!(len, m1_len + m2_len); + + // the free slices, must hold: f1_len + f2_len >= m1_len + m2_len + let ((f1_ptr, f1_len), (f2_ptr, f2_len)) = self.free_slice_parts(); + + debug_assert!(f1_len + f2_len >= m1_len + m2_len); + + // calc how many from where bytes go where + let m1_in_f1 = usize::min(m1_len, f1_len); + let m1_in_f2 = m1_len - m1_in_f1; + let m2_in_f1 = usize::min(f1_len - m1_in_f1, m2_len); + let m2_in_f2 = m2_len - m2_in_f1; + + debug_assert_eq!(m1_len, m1_in_f1 + m1_in_f2); + debug_assert_eq!(m2_len, m2_in_f1 + m2_in_f2); + debug_assert!(f1_len >= m1_in_f1 + m2_in_f1); + debug_assert!(f2_len >= m1_in_f2 + m2_in_f2); + debug_assert_eq!(len, m1_in_f1 + m2_in_f1 + m1_in_f2 + m2_in_f2); + + debug_assert!(self.buf.as_ptr().add(self.cap) > f1_ptr.add(m1_in_f1 + m2_in_f1)); + 
debug_assert!(self.buf.as_ptr().add(self.cap) > f2_ptr.add(m1_in_f2 + m2_in_f2)); + + debug_assert!((m1_in_f2 > 0) ^ (m2_in_f1 > 0) || (m1_in_f2 == 0 && m2_in_f1 == 0)); + + copy_with_checks( + m1_ptr, m2_ptr, f1_ptr, f2_ptr, m1_in_f1, m2_in_f1, m1_in_f2, m2_in_f2, + ); + self.tail = (self.tail + len) % self.cap; + } +} + +impl Drop for RingBuffer { + fn drop(&mut self) { + if self.cap == 0 { + return; + } + + // SAFETY: is we were succesfully able to construct this layout when we allocated then it's also valid do so now + let current_layout = unsafe { Layout::array::(self.cap).unwrap_unchecked() }; + + unsafe { + std::alloc::dealloc(self.buf.as_ptr(), current_layout); + } + } +} + +#[allow(dead_code)] +#[inline(always)] +#[allow(clippy::too_many_arguments)] +unsafe fn copy_without_checks( + m1_ptr: *const u8, + m2_ptr: *const u8, + f1_ptr: *mut u8, + f2_ptr: *mut u8, + m1_in_f1: usize, + m2_in_f1: usize, + m1_in_f2: usize, + m2_in_f2: usize, +) { + f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); + f1_ptr + .add(m1_in_f1) + .copy_from_nonoverlapping(m2_ptr, m2_in_f1); + + f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2); + f2_ptr + .add(m1_in_f2) + .copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2); +} + +#[allow(dead_code)] +#[inline(always)] +#[allow(clippy::too_many_arguments)] +unsafe fn copy_with_checks( + m1_ptr: *const u8, + m2_ptr: *const u8, + f1_ptr: *mut u8, + f2_ptr: *mut u8, + m1_in_f1: usize, + m2_in_f1: usize, + m1_in_f2: usize, + m2_in_f2: usize, +) { + if m1_in_f1 != 0 { + f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); + } + if m2_in_f1 != 0 { + f1_ptr + .add(m1_in_f1) + .copy_from_nonoverlapping(m2_ptr, m2_in_f1); + } + + if m1_in_f2 != 0 { + f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2); + } + if m2_in_f2 != 0 { + f2_ptr + .add(m1_in_f2) + .copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2); + } +} + +#[allow(dead_code)] +#[inline(always)] +#[allow(clippy::too_many_arguments)] +unsafe fn 
copy_with_nobranch_check( + m1_ptr: *const u8, + m2_ptr: *const u8, + f1_ptr: *mut u8, + f2_ptr: *mut u8, + m1_in_f1: usize, + m2_in_f1: usize, + m1_in_f2: usize, + m2_in_f2: usize, +) { + let case = (m1_in_f1 > 0) as usize + | (((m2_in_f1 > 0) as usize) << 1) + | (((m1_in_f2 > 0) as usize) << 2) + | (((m2_in_f2 > 0) as usize) << 3); + + match case { + 0 => {} + + // one bit set + 1 => { + f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); + } + 2 => { + f1_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f1); + } + 4 => { + f2_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f2); + } + 8 => { + f2_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f2); + } + + // two bit set + 3 => { + f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); + f1_ptr + .add(m1_in_f1) + .copy_from_nonoverlapping(m2_ptr, m2_in_f1); + } + 5 => { + f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); + f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2); + } + 6 => std::hint::unreachable_unchecked(), + 7 => std::hint::unreachable_unchecked(), + 9 => { + f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); + f2_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f2); + } + 10 => { + f1_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f1); + f2_ptr.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2); + } + 12 => { + f2_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f2); + f2_ptr + .add(m1_in_f2) + .copy_from_nonoverlapping(m2_ptr, m2_in_f2); + } + + // three bit set + 11 => { + f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); + f1_ptr + .add(m1_in_f1) + .copy_from_nonoverlapping(m2_ptr, m2_in_f1); + f2_ptr.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2); + } + 13 => { + f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1); + f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2); + f2_ptr + .add(m1_in_f2) + .copy_from_nonoverlapping(m2_ptr, m2_in_f2); + } + 14 => std::hint::unreachable_unchecked(), + 15 => std::hint::unreachable_unchecked(), + _ => std::hint::unreachable_unchecked(), + } +} + 
+#[cfg(test)] +mod tests { + use super::RingBuffer; + + #[test] + fn smoke() { + let mut rb = RingBuffer::new(); + + rb.reserve(15); + assert_eq!(17, rb.cap); + + rb.extend(b"0123456789"); + assert_eq!(rb.len(), 10); + assert_eq!(rb.as_slices().0, b"0123456789"); + assert_eq!(rb.as_slices().1, b""); + + rb.drop_first_n(5); + assert_eq!(rb.len(), 5); + assert_eq!(rb.as_slices().0, b"56789"); + assert_eq!(rb.as_slices().1, b""); + + rb.extend_from_within(2, 3); + assert_eq!(rb.len(), 8); + assert_eq!(rb.as_slices().0, b"56789789"); + assert_eq!(rb.as_slices().1, b""); + + rb.extend_from_within(0, 3); + assert_eq!(rb.len(), 11); + assert_eq!(rb.as_slices().0, b"56789789567"); + assert_eq!(rb.as_slices().1, b""); + + rb.extend_from_within(0, 2); + assert_eq!(rb.len(), 13); + assert_eq!(rb.as_slices().0, b"567897895675"); + assert_eq!(rb.as_slices().1, b"6"); + + rb.drop_first_n(11); + assert_eq!(rb.len(), 2); + assert_eq!(rb.as_slices().0, b"5"); + assert_eq!(rb.as_slices().1, b"6"); + + rb.extend(b"0123456789"); + assert_eq!(rb.len(), 12); + assert_eq!(rb.as_slices().0, b"5"); + assert_eq!(rb.as_slices().1, b"60123456789"); + + rb.drop_first_n(11); + assert_eq!(rb.len(), 1); + assert_eq!(rb.as_slices().0, b"9"); + assert_eq!(rb.as_slices().1, b""); + + rb.extend(b"0123456789"); + assert_eq!(rb.len(), 11); + assert_eq!(rb.as_slices().0, b"9012345"); + assert_eq!(rb.as_slices().1, b"6789"); + } + + #[test] + fn edge_cases() { + // Fill exactly, then empty then fill again + let mut rb = RingBuffer::new(); + rb.reserve(16); + assert_eq!(17, rb.cap); + rb.extend(b"0123456789012345"); + assert_eq!(17, rb.cap); + assert_eq!(16, rb.len()); + assert_eq!(0, rb.free()); + rb.drop_first_n(16); + assert_eq!(0, rb.len()); + assert_eq!(16, rb.free()); + rb.extend(b"0123456789012345"); + assert_eq!(16, rb.len()); + assert_eq!(0, rb.free()); + assert_eq!(17, rb.cap); + assert_eq!(1, rb.as_slices().0.len()); + assert_eq!(15, rb.as_slices().1.len()); + + rb.clear(); + + // data in both 
slices and then reserve + rb.extend(b"0123456789012345"); + rb.drop_first_n(8); + rb.extend(b"67890123"); + assert_eq!(16, rb.len()); + assert_eq!(0, rb.free()); + assert_eq!(17, rb.cap); + assert_eq!(9, rb.as_slices().0.len()); + assert_eq!(7, rb.as_slices().1.len()); + rb.reserve(1); + assert_eq!(16, rb.len()); + assert_eq!(16, rb.free()); + assert_eq!(33, rb.cap); + assert_eq!(16, rb.as_slices().0.len()); + assert_eq!(0, rb.as_slices().1.len()); + + rb.clear(); + + // fill exactly, then extend from within + rb.extend(b"0123456789012345"); + rb.extend_from_within(0, 16); + assert_eq!(32, rb.len()); + assert_eq!(0, rb.free()); + assert_eq!(33, rb.cap); + assert_eq!(32, rb.as_slices().0.len()); + assert_eq!(0, rb.as_slices().1.len()); + + // extend from within cases + let mut rb = RingBuffer::new(); + rb.reserve(8); + rb.extend(b"01234567"); + rb.drop_first_n(5); + rb.extend_from_within(0, 3); + assert_eq!(4, rb.as_slices().0.len()); + assert_eq!(2, rb.as_slices().1.len()); + + rb.drop_first_n(2); + assert_eq!(2, rb.as_slices().0.len()); + assert_eq!(2, rb.as_slices().1.len()); + rb.extend_from_within(0, 4); + assert_eq!(2, rb.as_slices().0.len()); + assert_eq!(6, rb.as_slices().1.len()); + + rb.drop_first_n(2); + assert_eq!(6, rb.as_slices().0.len()); + assert_eq!(0, rb.as_slices().1.len()); + rb.drop_first_n(2); + assert_eq!(4, rb.as_slices().0.len()); + assert_eq!(0, rb.as_slices().1.len()); + rb.extend_from_within(0, 4); + assert_eq!(7, rb.as_slices().0.len()); + assert_eq!(1, rb.as_slices().1.len()); + + let mut rb = RingBuffer::new(); + rb.reserve(8); + rb.extend(b"11111111"); + rb.drop_first_n(7); + rb.extend(b"111"); + assert_eq!(2, rb.as_slices().0.len()); + assert_eq!(2, rb.as_slices().1.len()); + rb.extend_from_within(0, 4); + assert_eq!(b"11", rb.as_slices().0); + assert_eq!(b"111111", rb.as_slices().1); + } +} diff --git a/vendor/ruzstd/src/decoding/scratch.rs b/vendor/ruzstd/src/decoding/scratch.rs new file mode 100644 index 000000000..2bd753bde --- 
/dev/null +++ b/vendor/ruzstd/src/decoding/scratch.rs @@ -0,0 +1,128 @@ +use super::super::blocks::sequence_section::Sequence; +use super::decodebuffer::Decodebuffer; +use crate::decoding::dictionary::Dictionary; +use crate::fse::FSETable; +use crate::huff0::HuffmanTable; + +pub struct DecoderScratch { + pub huf: HuffmanScratch, + pub fse: FSEScratch, + pub buffer: Decodebuffer, + pub offset_hist: [u32; 3], + + pub literals_buffer: Vec, + pub sequences: Vec, + pub block_content_buffer: Vec, +} + +impl DecoderScratch { + pub fn new(window_size: usize) -> DecoderScratch { + DecoderScratch { + huf: HuffmanScratch { + table: HuffmanTable::new(), + }, + fse: FSEScratch { + offsets: FSETable::new(), + of_rle: None, + literal_lengths: FSETable::new(), + ll_rle: None, + match_lengths: FSETable::new(), + ml_rle: None, + }, + buffer: Decodebuffer::new(window_size), + offset_hist: [1, 4, 8], + + block_content_buffer: Vec::new(), + literals_buffer: Vec::new(), + sequences: Vec::new(), + } + } + + pub fn reset(&mut self, window_size: usize) { + self.offset_hist = [1, 4, 8]; + self.literals_buffer.clear(); + self.sequences.clear(); + self.block_content_buffer.clear(); + + self.buffer.reset(window_size); + + self.fse.literal_lengths.reset(); + self.fse.match_lengths.reset(); + self.fse.offsets.reset(); + self.fse.ll_rle = None; + self.fse.ml_rle = None; + self.fse.of_rle = None; + + self.huf.table.reset(); + } + + pub fn use_dict(&mut self, dict: &Dictionary) { + self.fse = dict.fse.clone(); + self.huf = dict.huf.clone(); + self.offset_hist = dict.offset_hist; + self.buffer.dict_content = dict.dict_content.clone(); + } + + /// parses the dictionary and set the tables + /// it returns the dict_id for checking with the frame's dict_id + pub fn load_dict( + &mut self, + raw: &[u8], + ) -> Result { + let dict = super::dictionary::Dictionary::decode_dict(raw)?; + + self.huf = dict.huf.clone(); + self.fse = dict.fse.clone(); + self.offset_hist = dict.offset_hist; + 
self.buffer.dict_content = dict.dict_content.clone(); + Ok(dict.id) + } +} + +#[derive(Clone)] +pub struct HuffmanScratch { + pub table: HuffmanTable, +} + +impl HuffmanScratch { + pub fn new() -> HuffmanScratch { + HuffmanScratch { + table: HuffmanTable::new(), + } + } +} + +impl Default for HuffmanScratch { + fn default() -> Self { + Self::new() + } +} + +#[derive(Clone)] +pub struct FSEScratch { + pub offsets: FSETable, + pub of_rle: Option, + pub literal_lengths: FSETable, + pub ll_rle: Option, + pub match_lengths: FSETable, + pub ml_rle: Option, +} + +impl FSEScratch { + pub fn new() -> FSEScratch { + FSEScratch { + offsets: FSETable::new(), + of_rle: None, + literal_lengths: FSETable::new(), + ll_rle: None, + match_lengths: FSETable::new(), + ml_rle: None, + } + } +} + +impl Default for FSEScratch { + fn default() -> Self { + Self::new() + } +} diff --git a/vendor/ruzstd/src/decoding/sequence_execution.rs b/vendor/ruzstd/src/decoding/sequence_execution.rs new file mode 100644 index 000000000..19247ec62 --- /dev/null +++ b/vendor/ruzstd/src/decoding/sequence_execution.rs @@ -0,0 +1,123 @@ +use super::{decodebuffer::DecodebufferError, scratch::DecoderScratch}; + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum ExecuteSequencesError { + #[error(transparent)] + DecodebufferError(#[from] DecodebufferError), + #[error("Sequence wants to copy up to byte {wanted}. 
Bytes in literalsbuffer: {have}")] + NotEnoughBytesForSequence { wanted: usize, have: usize }, + #[error("Illegal offset: 0 found")] + ZeroOffset, +} + +pub fn execute_sequences(scratch: &mut DecoderScratch) -> Result<(), ExecuteSequencesError> { + let mut literals_copy_counter = 0; + let old_buffer_size = scratch.buffer.len(); + let mut seq_sum = 0; + + for idx in 0..scratch.sequences.len() { + let seq = scratch.sequences[idx]; + if crate::VERBOSE {} + //println!("{}: {}", idx, seq); + + if seq.ll > 0 { + let high = literals_copy_counter + seq.ll as usize; + if high > scratch.literals_buffer.len() { + return Err(ExecuteSequencesError::NotEnoughBytesForSequence { + wanted: high, + have: scratch.literals_buffer.len(), + }); + } + let literals = &scratch.literals_buffer[literals_copy_counter..high]; + literals_copy_counter += seq.ll as usize; + + scratch.buffer.push(literals); + } + + let actual_offset = do_offset_history(seq.of, seq.ll, &mut scratch.offset_hist); + if actual_offset == 0 { + return Err(ExecuteSequencesError::ZeroOffset); + } + if seq.ml > 0 { + scratch + .buffer + .repeat(actual_offset as usize, seq.ml as usize)?; + } + + seq_sum += seq.ml; + seq_sum += seq.ll; + } + if literals_copy_counter < scratch.literals_buffer.len() { + let rest_literals = &scratch.literals_buffer[literals_copy_counter..]; + scratch.buffer.push(rest_literals); + seq_sum += rest_literals.len() as u32; + } + + let diff = scratch.buffer.len() - old_buffer_size; + assert!( + seq_sum as usize == diff, + "Seq_sum: {} is different from the difference in buffersize: {}", + seq_sum, + diff + ); + Ok(()) +} + +fn do_offset_history(offset_value: u32, lit_len: u32, scratch: &mut [u32; 3]) -> u32 { + let actual_offset = if lit_len > 0 { + match offset_value { + 1..=3 => scratch[offset_value as usize - 1], + _ => { + //new offset + offset_value - 3 + } + } + } else { + match offset_value { + 1..=2 => scratch[offset_value as usize], + 3 => scratch[0] - 1, + _ => { + //new offset + 
offset_value - 3 + } + } + }; + + //update history + if lit_len > 0 { + match offset_value { + 1 => { + //nothing + } + 2 => { + scratch[1] = scratch[0]; + scratch[0] = actual_offset; + } + _ => { + scratch[2] = scratch[1]; + scratch[1] = scratch[0]; + scratch[0] = actual_offset; + } + } + } else { + match offset_value { + 1 => { + scratch[1] = scratch[0]; + scratch[0] = actual_offset; + } + 2 => { + scratch[2] = scratch[1]; + scratch[1] = scratch[0]; + scratch[0] = actual_offset; + } + _ => { + scratch[2] = scratch[1]; + scratch[1] = scratch[0]; + scratch[0] = actual_offset; + } + } + } + + actual_offset +} diff --git a/vendor/ruzstd/src/decoding/sequence_section_decoder.rs b/vendor/ruzstd/src/decoding/sequence_section_decoder.rs new file mode 100644 index 000000000..3d5990c05 --- /dev/null +++ b/vendor/ruzstd/src/decoding/sequence_section_decoder.rs @@ -0,0 +1,495 @@ +use super::super::blocks::sequence_section::ModeType; +use super::super::blocks::sequence_section::Sequence; +use super::super::blocks::sequence_section::SequencesHeader; +use super::bit_reader_reverse::{BitReaderReversed, GetBitsError}; +use super::scratch::FSEScratch; +use crate::fse::{FSEDecoder, FSEDecoderError, FSETableError}; + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum DecodeSequenceError { + #[error(transparent)] + GetBitsError(#[from] GetBitsError), + #[error(transparent)] + FSEDecoderError(#[from] FSEDecoderError), + #[error(transparent)] + FSETableError(#[from] FSETableError), + #[error("Padding at the end of the sequence_section was more than a byte long: {skipped_bits} bits. Probably caused by data corruption")] + ExtraPadding { skipped_bits: i32 }, + #[error("Do not support offsets bigger than 1<<32; got: {offset_code}")] + UnsupportedOffset { offset_code: u8 }, + #[error("Read an offset == 0. 
That is an illegal value for offsets")] + ZeroOffset, + #[error("Bytestream did not contain enough bytes to decode num_sequences")] + NotEnoughBytesForNumSequences, + #[error("Did not use full bitstream. Bits left: {bits_remaining} ({} bytes)", bits_remaining / 8)] + ExtraBits { bits_remaining: isize }, + #[error("compression modes are none but they must be set to something")] + MissingCompressionMode, + #[error("Need a byte to read for RLE ll table")] + MissingByteForRleLlTable, + #[error("Need a byte to read for RLE of table")] + MissingByteForRleOfTable, + #[error("Need a byte to read for RLE ml table")] + MissingByteForRleMlTable, +} + +pub fn decode_sequences( + section: &SequencesHeader, + source: &[u8], + scratch: &mut FSEScratch, + target: &mut Vec, +) -> Result<(), DecodeSequenceError> { + let bytes_read = maybe_update_fse_tables(section, source, scratch)?; + + if crate::VERBOSE { + println!("Updating tables used {} bytes", bytes_read); + } + + let bit_stream = &source[bytes_read..]; + + let mut br = BitReaderReversed::new(bit_stream); + + //skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found + let mut skipped_bits = 0; + loop { + let val = br.get_bits(1)?; + skipped_bits += 1; + if val == 1 || skipped_bits > 8 { + break; + } + } + if skipped_bits > 8 { + //if more than 7 bits are 0, this is not the correct end of the bitstream. 
Either a bug or corrupted data + return Err(DecodeSequenceError::ExtraPadding { skipped_bits }); + } + + if scratch.ll_rle.is_some() || scratch.ml_rle.is_some() || scratch.of_rle.is_some() { + decode_sequences_with_rle(section, &mut br, scratch, target) + } else { + decode_sequences_without_rle(section, &mut br, scratch, target) + } +} + +fn decode_sequences_with_rle( + section: &SequencesHeader, + br: &mut BitReaderReversed<'_>, + scratch: &mut FSEScratch, + target: &mut Vec, +) -> Result<(), DecodeSequenceError> { + let mut ll_dec = FSEDecoder::new(&scratch.literal_lengths); + let mut ml_dec = FSEDecoder::new(&scratch.match_lengths); + let mut of_dec = FSEDecoder::new(&scratch.offsets); + + if scratch.ll_rle.is_none() { + ll_dec.init_state(br)?; + } + if scratch.of_rle.is_none() { + of_dec.init_state(br)?; + } + if scratch.ml_rle.is_none() { + ml_dec.init_state(br)?; + } + + target.clear(); + target.reserve(section.num_sequences as usize); + + for _seq_idx in 0..section.num_sequences { + //get the codes from either the RLE byte or from the decoder + let ll_code = if scratch.ll_rle.is_some() { + scratch.ll_rle.unwrap() + } else { + ll_dec.decode_symbol() + }; + let ml_code = if scratch.ml_rle.is_some() { + scratch.ml_rle.unwrap() + } else { + ml_dec.decode_symbol() + }; + let of_code = if scratch.of_rle.is_some() { + scratch.of_rle.unwrap() + } else { + of_dec.decode_symbol() + }; + + let (ll_value, ll_num_bits) = lookup_ll_code(ll_code); + let (ml_value, ml_num_bits) = lookup_ml_code(ml_code); + + //println!("Sequence: {}", i); + //println!("of stat: {}", of_dec.state); + //println!("of Code: {}", of_code); + //println!("ll stat: {}", ll_dec.state); + //println!("ll bits: {}", ll_num_bits); + //println!("ll Code: {}", ll_value); + //println!("ml stat: {}", ml_dec.state); + //println!("ml bits: {}", ml_num_bits); + //println!("ml Code: {}", ml_value); + //println!(""); + + if of_code >= 32 { + return Err(DecodeSequenceError::UnsupportedOffset { + offset_code: 
of_code, + }); + } + + let (obits, ml_add, ll_add) = br.get_bits_triple(of_code, ml_num_bits, ll_num_bits)?; + let offset = obits as u32 + (1u32 << of_code); + + if offset == 0 { + return Err(DecodeSequenceError::ZeroOffset); + } + + target.push(Sequence { + ll: ll_value + ll_add as u32, + ml: ml_value + ml_add as u32, + of: offset, + }); + + if target.len() < section.num_sequences as usize { + //println!( + // "Bits left: {} ({} bytes)", + // br.bits_remaining(), + // br.bits_remaining() / 8, + //); + if scratch.ll_rle.is_none() { + ll_dec.update_state(br)?; + } + if scratch.ml_rle.is_none() { + ml_dec.update_state(br)?; + } + if scratch.of_rle.is_none() { + of_dec.update_state(br)?; + } + } + + if br.bits_remaining() < 0 { + return Err(DecodeSequenceError::NotEnoughBytesForNumSequences); + } + } + + if br.bits_remaining() > 0 { + Err(DecodeSequenceError::ExtraBits { + bits_remaining: br.bits_remaining(), + }) + } else { + Ok(()) + } +} + +fn decode_sequences_without_rle( + section: &SequencesHeader, + br: &mut BitReaderReversed<'_>, + scratch: &mut FSEScratch, + target: &mut Vec, +) -> Result<(), DecodeSequenceError> { + let mut ll_dec = FSEDecoder::new(&scratch.literal_lengths); + let mut ml_dec = FSEDecoder::new(&scratch.match_lengths); + let mut of_dec = FSEDecoder::new(&scratch.offsets); + + ll_dec.init_state(br)?; + of_dec.init_state(br)?; + ml_dec.init_state(br)?; + + target.clear(); + target.reserve(section.num_sequences as usize); + + for _seq_idx in 0..section.num_sequences { + let ll_code = ll_dec.decode_symbol(); + let ml_code = ml_dec.decode_symbol(); + let of_code = of_dec.decode_symbol(); + + let (ll_value, ll_num_bits) = lookup_ll_code(ll_code); + let (ml_value, ml_num_bits) = lookup_ml_code(ml_code); + + if of_code >= 32 { + return Err(DecodeSequenceError::UnsupportedOffset { + offset_code: of_code, + }); + } + + let (obits, ml_add, ll_add) = br.get_bits_triple(of_code, ml_num_bits, ll_num_bits)?; + let offset = obits as u32 + (1u32 << of_code); + 
+ if offset == 0 { + return Err(DecodeSequenceError::ZeroOffset); + } + + target.push(Sequence { + ll: ll_value + ll_add as u32, + ml: ml_value + ml_add as u32, + of: offset, + }); + + if target.len() < section.num_sequences as usize { + //println!( + // "Bits left: {} ({} bytes)", + // br.bits_remaining(), + // br.bits_remaining() / 8, + //); + ll_dec.update_state(br)?; + ml_dec.update_state(br)?; + of_dec.update_state(br)?; + } + + if br.bits_remaining() < 0 { + return Err(DecodeSequenceError::NotEnoughBytesForNumSequences); + } + } + + if br.bits_remaining() > 0 { + Err(DecodeSequenceError::ExtraBits { + bits_remaining: br.bits_remaining(), + }) + } else { + Ok(()) + } +} + +fn lookup_ll_code(code: u8) -> (u32, u8) { + match code { + 0..=15 => (u32::from(code), 0), + 16 => (16, 1), + 17 => (18, 1), + 18 => (20, 1), + 19 => (22, 1), + 20 => (24, 2), + 21 => (28, 2), + 22 => (32, 3), + 23 => (40, 3), + 24 => (48, 4), + 25 => (64, 6), + 26 => (128, 7), + 27 => (256, 8), + 28 => (512, 9), + 29 => (1024, 10), + 30 => (2048, 11), + 31 => (4096, 12), + 32 => (8192, 13), + 33 => (16384, 14), + 34 => (32768, 15), + 35 => (65536, 16), + _ => (0, 255), + } +} + +fn lookup_ml_code(code: u8) -> (u32, u8) { + match code { + 0..=31 => (u32::from(code) + 3, 0), + 32 => (35, 1), + 33 => (37, 1), + 34 => (39, 1), + 35 => (41, 1), + 36 => (43, 2), + 37 => (47, 2), + 38 => (51, 3), + 39 => (59, 3), + 40 => (67, 4), + 41 => (83, 4), + 42 => (99, 5), + 43 => (131, 7), + 44 => (259, 8), + 45 => (515, 9), + 46 => (1027, 10), + 47 => (2051, 11), + 48 => (4099, 12), + 49 => (8195, 13), + 50 => (16387, 14), + 51 => (32771, 15), + 52 => (65539, 16), + _ => (0, 255), + } +} + +pub const LL_MAX_LOG: u8 = 9; +pub const ML_MAX_LOG: u8 = 9; +pub const OF_MAX_LOG: u8 = 8; + +fn maybe_update_fse_tables( + section: &SequencesHeader, + source: &[u8], + scratch: &mut FSEScratch, +) -> Result { + let modes = section + .modes + .ok_or(DecodeSequenceError::MissingCompressionMode)?; + + let mut 
bytes_read = 0; + + match modes.ll_mode() { + ModeType::FSECompressed => { + let bytes = scratch.literal_lengths.build_decoder(source, LL_MAX_LOG)?; + bytes_read += bytes; + if crate::VERBOSE { + println!("Updating ll table"); + println!("Used bytes: {}", bytes); + } + scratch.ll_rle = None; + } + ModeType::RLE => { + if crate::VERBOSE { + println!("Use RLE ll table"); + } + if source.is_empty() { + return Err(DecodeSequenceError::MissingByteForRleLlTable); + } + bytes_read += 1; + scratch.ll_rle = Some(source[0]); + } + ModeType::Predefined => { + if crate::VERBOSE { + println!("Use predefined ll table"); + } + scratch.literal_lengths.build_from_probabilities( + LL_DEFAULT_ACC_LOG, + &Vec::from(&LITERALS_LENGTH_DEFAULT_DISTRIBUTION[..]), + )?; + scratch.ll_rle = None; + } + ModeType::Repeat => { + if crate::VERBOSE { + println!("Repeat ll table"); + } + /* Nothing to do */ + } + }; + + let of_source = &source[bytes_read..]; + + match modes.of_mode() { + ModeType::FSECompressed => { + let bytes = scratch.offsets.build_decoder(of_source, OF_MAX_LOG)?; + if crate::VERBOSE { + println!("Updating of table"); + println!("Used bytes: {}", bytes); + } + bytes_read += bytes; + scratch.of_rle = None; + } + ModeType::RLE => { + if crate::VERBOSE { + println!("Use RLE of table"); + } + if of_source.is_empty() { + return Err(DecodeSequenceError::MissingByteForRleOfTable); + } + bytes_read += 1; + scratch.of_rle = Some(of_source[0]); + } + ModeType::Predefined => { + if crate::VERBOSE { + println!("Use predefined of table"); + } + scratch.offsets.build_from_probabilities( + OF_DEFAULT_ACC_LOG, + &Vec::from(&OFFSET_DEFAULT_DISTRIBUTION[..]), + )?; + scratch.of_rle = None; + } + ModeType::Repeat => { + if crate::VERBOSE { + println!("Repeat of table"); + } + /* Nothing to do */ + } + }; + + let ml_source = &source[bytes_read..]; + + match modes.ml_mode() { + ModeType::FSECompressed => { + let bytes = scratch.match_lengths.build_decoder(ml_source, ML_MAX_LOG)?; + bytes_read += 
bytes; + if crate::VERBOSE { + println!("Updating ml table"); + println!("Used bytes: {}", bytes); + } + scratch.ml_rle = None; + } + ModeType::RLE => { + if crate::VERBOSE { + println!("Use RLE ml table"); + } + if ml_source.is_empty() { + return Err(DecodeSequenceError::MissingByteForRleMlTable); + } + bytes_read += 1; + scratch.ml_rle = Some(ml_source[0]); + } + ModeType::Predefined => { + if crate::VERBOSE { + println!("Use predefined ml table"); + } + scratch.match_lengths.build_from_probabilities( + ML_DEFAULT_ACC_LOG, + &Vec::from(&MATCH_LENGTH_DEFAULT_DISTRIBUTION[..]), + )?; + scratch.ml_rle = None; + } + ModeType::Repeat => { + if crate::VERBOSE { + println!("Repeat ml table"); + } + /* Nothing to do */ + } + }; + + Ok(bytes_read) +} + +const LL_DEFAULT_ACC_LOG: u8 = 6; +const LITERALS_LENGTH_DEFAULT_DISTRIBUTION: [i32; 36] = [ + 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, + -1, -1, -1, -1, +]; + +const ML_DEFAULT_ACC_LOG: u8 = 6; +const MATCH_LENGTH_DEFAULT_DISTRIBUTION: [i32; 53] = [ + 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, +]; + +const OF_DEFAULT_ACC_LOG: u8 = 5; +const OFFSET_DEFAULT_DISTRIBUTION: [i32; 29] = [ + 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, +]; + +#[test] +fn test_ll_default() { + let mut table = crate::fse::FSETable::new(); + table + .build_from_probabilities( + LL_DEFAULT_ACC_LOG, + &Vec::from(&LITERALS_LENGTH_DEFAULT_DISTRIBUTION[..]), + ) + .unwrap(); + + for idx in 0..table.decode.len() { + println!( + "{:3}: {:3} {:3} {:3}", + idx, table.decode[idx].symbol, table.decode[idx].num_bits, table.decode[idx].base_line + ); + } + + assert!(table.decode.len() == 64); + + //just test a few values. 
TODO test all values + assert!(table.decode[0].symbol == 0); + assert!(table.decode[0].num_bits == 4); + assert!(table.decode[0].base_line == 0); + + assert!(table.decode[19].symbol == 27); + assert!(table.decode[19].num_bits == 6); + assert!(table.decode[19].base_line == 0); + + assert!(table.decode[39].symbol == 25); + assert!(table.decode[39].num_bits == 4); + assert!(table.decode[39].base_line == 16); + + assert!(table.decode[60].symbol == 35); + assert!(table.decode[60].num_bits == 6); + assert!(table.decode[60].base_line == 0); + + assert!(table.decode[59].symbol == 24); + assert!(table.decode[59].num_bits == 5); + assert!(table.decode[59].base_line == 32); +} diff --git a/vendor/ruzstd/src/frame.rs b/vendor/ruzstd/src/frame.rs new file mode 100644 index 000000000..1eb3f8911 --- /dev/null +++ b/vendor/ruzstd/src/frame.rs @@ -0,0 +1,288 @@ +use std::convert::TryInto; +use std::io; + +pub const MAGIC_NUM: u32 = 0xFD2F_B528; +pub const MIN_WINDOW_SIZE: u64 = 1024; +pub const MAX_WINDOW_SIZE: u64 = (1 << 41) + 7 * (1 << 38); + +pub struct Frame { + magic_num: u32, + pub header: FrameHeader, +} + +pub struct FrameHeader { + pub descriptor: FrameDescriptor, + window_descriptor: u8, + dict_id: Vec, + frame_content_size: Vec, +} + +pub struct FrameDescriptor(u8); + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum FrameDescriptorError { + #[error("Invalid Frame_Content_Size_Flag; Is: {got}, Should be one of: 0, 1, 2, 3")] + InvalidFrameContentSizeFlag { got: u8 }, +} + +impl FrameDescriptor { + pub fn frame_content_size_flag(&self) -> u8 { + self.0 >> 6 + } + + pub fn reserved_flag(&self) -> bool { + ((self.0 >> 3) & 0x1) == 1 + } + + pub fn single_segment_flag(&self) -> bool { + ((self.0 >> 5) & 0x1) == 1 + } + + pub fn content_checksum_flag(&self) -> bool { + ((self.0 >> 2) & 0x1) == 1 + } + + pub fn dict_id_flag(&self) -> u8 { + self.0 & 0x3 + } + + // Deriving info from the flags + pub fn frame_content_size_bytes(&self) -> Result { + match 
self.frame_content_size_flag() { + 0 => { + if self.single_segment_flag() { + Ok(1) + } else { + Ok(0) + } + } + 1 => Ok(2), + 2 => Ok(4), + 3 => Ok(8), + other => Err(FrameDescriptorError::InvalidFrameContentSizeFlag { got: other }), + } + } + + pub fn dictionary_id_bytes(&self) -> Result { + match self.dict_id_flag() { + 0 => Ok(0), + 1 => Ok(1), + 2 => Ok(2), + 3 => Ok(4), + other => Err(FrameDescriptorError::InvalidFrameContentSizeFlag { got: other }), + } + } +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum FrameHeaderError { + #[error("window_size bigger than allowed maximum. Is: {got}, Should be lower than: {MAX_WINDOW_SIZE}")] + WindowTooBig { got: u64 }, + #[error("window_size smaller than allowed minimum. Is: {got}, Should be greater than: {MIN_WINDOW_SIZE}")] + WindowTooSmall { got: u64 }, + #[error(transparent)] + FrameDescriptorError(#[from] FrameDescriptorError), + #[error("Not enough bytes in dict_id. Is: {got}, Should be: {expected}")] + DictIdTooSmall { got: usize, expected: usize }, + #[error("frame_content_size does not have the right length. Is: {got}, Should be: {expected}")] + MismatchedFrameSize { got: usize, expected: u8 }, + #[error("frame_content_size was zero")] + FrameSizeIsZero, + #[error("Invalid frame_content_size. 
Is: {got}, Should be one of 1, 2, 4, 8 bytes")] + InvalidFrameSize { got: u8 }, +} + +impl FrameHeader { + pub fn window_size(&self) -> Result { + if self.descriptor.single_segment_flag() { + self.frame_content_size() + } else { + let exp = self.window_descriptor >> 3; + let mantissa = self.window_descriptor & 0x7; + + let window_log = 10 + u64::from(exp); + let window_base = 1 << window_log; + let window_add = (window_base / 8) * u64::from(mantissa); + + let window_size = window_base + window_add; + + if window_size >= MIN_WINDOW_SIZE { + if window_size < MAX_WINDOW_SIZE { + Ok(window_size) + } else { + Err(FrameHeaderError::WindowTooBig { got: window_size }) + } + } else { + Err(FrameHeaderError::WindowTooSmall { got: window_size }) + } + } + } + + pub fn dictionary_id(&self) -> Result, FrameHeaderError> { + if self.descriptor.dict_id_flag() == 0 { + Ok(None) + } else { + let bytes = self.descriptor.dictionary_id_bytes()?; + if self.dict_id.len() != bytes as usize { + Err(FrameHeaderError::DictIdTooSmall { + got: self.dict_id.len(), + expected: bytes as usize, + }) + } else { + let mut value: u32 = 0; + let mut shift = 0; + for x in &self.dict_id { + value |= u32::from(*x) << shift; + shift += 8; + } + + Ok(Some(value)) + } + } + } + + pub fn frame_content_size(&self) -> Result { + let bytes = self.descriptor.frame_content_size_bytes()?; + + if self.frame_content_size.len() != (bytes as usize) { + return Err(FrameHeaderError::MismatchedFrameSize { + got: self.frame_content_size.len(), + expected: bytes, + }); + } + + match bytes { + 0 => Err(FrameHeaderError::FrameSizeIsZero), + 1 => Ok(u64::from(self.frame_content_size[0])), + 2 => { + let val = (u64::from(self.frame_content_size[1]) << 8) + + (u64::from(self.frame_content_size[0])); + Ok(val + 256) //this weird offset is from the documentation. 
Only if bytes == 2 + } + 4 => { + let val = self.frame_content_size[..4] + .try_into() + .expect("optimized away"); + let val = u32::from_le_bytes(val); + Ok(u64::from(val)) + } + 8 => { + let val = self.frame_content_size[..8] + .try_into() + .expect("optimized away"); + let val = u64::from_le_bytes(val); + Ok(val) + } + other => Err(FrameHeaderError::InvalidFrameSize { got: other }), + } + } +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum FrameCheckError { + #[error("magic_num wrong. Is: {got}. Should be: {MAGIC_NUM}")] + WrongMagicNum { got: u32 }, + #[error("Reserved Flag set. Must be zero")] + ReservedFlagSet, + #[error(transparent)] + FrameHeaderError(#[from] FrameHeaderError), +} + +impl Frame { + pub fn check_valid(&self) -> Result<(), FrameCheckError> { + if self.magic_num != MAGIC_NUM { + Err(FrameCheckError::WrongMagicNum { + got: self.magic_num, + }) + } else if self.header.descriptor.reserved_flag() { + Err(FrameCheckError::ReservedFlagSet) + } else { + self.header.dictionary_id()?; + self.header.window_size()?; + if self.header.descriptor.single_segment_flag() { + self.header.frame_content_size()?; + } + Ok(()) + } + } +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum ReadFrameHeaderError { + #[error("Error while reading magic number: {0}")] + MagicNumberReadError(#[source] io::Error), + #[error("Error while reading frame descriptor: {0}")] + FrameDescriptorReadError(#[source] io::Error), + #[error(transparent)] + InvalidFrameDescriptor(#[from] FrameDescriptorError), + #[error("Error while reading window descriptor: {0}")] + WindowDescriptorReadError(#[source] io::Error), + #[error("Error while reading dictionary id: {0}")] + DictionaryIdReadError(#[source] io::Error), + #[error("Error while reading frame content size: {0}")] + FrameContentSizeReadError(#[source] io::Error), + #[error("SkippableFrame encountered with MagicNumber 0x{0:X} and length {1} bytes")] + SkipFrame(u32, u32), +} + +use std::io::Read; 
+pub fn read_frame_header(mut r: impl Read) -> Result<(Frame, u8), ReadFrameHeaderError> { + use ReadFrameHeaderError as err; + let mut buf = [0u8; 4]; + + r.read_exact(&mut buf).map_err(err::MagicNumberReadError)?; + let magic_num = u32::from_le_bytes(buf); + + // Skippable frames have a magic number in this interval + if (0x184D2A50..=0x184D2A5F).contains(&magic_num) { + r.read_exact(&mut buf) + .map_err(err::FrameDescriptorReadError)?; + let skip_size = u32::from_le_bytes(buf); + return Err(ReadFrameHeaderError::SkipFrame(magic_num, skip_size)); + } + + let mut bytes_read = 4; + + r.read_exact(&mut buf[0..1]) + .map_err(err::FrameDescriptorReadError)?; + let desc = FrameDescriptor(buf[0]); + + bytes_read += 1; + + let mut frame_header = FrameHeader { + descriptor: FrameDescriptor(desc.0), + dict_id: vec![0; desc.dictionary_id_bytes()? as usize], + frame_content_size: vec![0; desc.frame_content_size_bytes()? as usize], + window_descriptor: 0, + }; + + if !desc.single_segment_flag() { + r.read_exact(&mut buf[0..1]) + .map_err(err::WindowDescriptorReadError)?; + frame_header.window_descriptor = buf[0]; + bytes_read += 1; + } + + if !frame_header.dict_id.is_empty() { + r.read_exact(frame_header.dict_id.as_mut_slice()) + .map_err(err::DictionaryIdReadError)?; + bytes_read += frame_header.dict_id.len(); + } + + if !frame_header.frame_content_size.is_empty() { + r.read_exact(frame_header.frame_content_size.as_mut_slice()) + .map_err(err::FrameContentSizeReadError)?; + bytes_read += frame_header.frame_content_size.len(); + } + + let frame: Frame = Frame { + magic_num, + header: frame_header, + }; + + Ok((frame, bytes_read as u8)) +} diff --git a/vendor/ruzstd/src/frame_decoder.rs b/vendor/ruzstd/src/frame_decoder.rs new file mode 100644 index 000000000..560e82810 --- /dev/null +++ b/vendor/ruzstd/src/frame_decoder.rs @@ -0,0 +1,569 @@ +use super::frame; +use crate::decoding::dictionary::Dictionary; +use crate::decoding::scratch::DecoderScratch; +use 
crate::decoding::{self, dictionary}; +use std::collections::HashMap; +use std::convert::TryInto; +use std::hash::Hasher; +use std::io::{self, Read}; + +/// This implements a decoder for zstd frames. This decoder is able to decode frames only partially and gives control +/// over how many bytes/blocks will be decoded at a time (so you don't have to decode a 10GB file into memory all at once). +/// It reads bytes as needed from a provided source and can be read from to collect partial results. +/// +/// If you want to just read the whole frame with an io::Read without having to deal with manually calling decode_blocks +/// you can use the provided StreamingDecoder with wraps this FrameDecoder +/// +/// Workflow is as follows: +/// ``` +/// use ruzstd::frame_decoder::BlockDecodingStrategy; +/// use std::io::Read; +/// use std::io::Write; +/// +/// +/// fn decode_this(mut file: impl std::io::Read) { +/// //Create a new decoder +/// let mut frame_dec = ruzstd::FrameDecoder::new(); +/// let mut result = Vec::new(); +/// +/// // Use reset or init to make the decoder ready to decode the frame from the io::Read +/// frame_dec.reset(&mut file).unwrap(); +/// +/// // Loop until the frame has been decoded completely +/// while !frame_dec.is_finished() { +/// // decode (roughly) batch_size many bytes +/// frame_dec.decode_blocks(&mut file, BlockDecodingStrategy::UptoBytes(1024)).unwrap(); +/// +/// // read from the decoder to collect bytes from the internal buffer +/// let bytes_read = frame_dec.read(result.as_mut_slice()).unwrap(); +/// +/// // then do something with it +/// do_something(&result[0..bytes_read]); +/// } +/// +/// // handle the last chunk of data +/// while frame_dec.can_collect() > 0 { +/// let x = frame_dec.read(result.as_mut_slice()).unwrap(); +/// +/// do_something(&result[0..x]); +/// } +/// } +/// +/// fn do_something(data: &[u8]) { +/// std::io::stdout().write_all(data).unwrap(); +/// } +/// ``` +pub struct FrameDecoder { + state: Option, + dicts: 
HashMap, +} + +struct FrameDecoderState { + pub frame: frame::Frame, + decoder_scratch: DecoderScratch, + frame_finished: bool, + block_counter: usize, + bytes_read_counter: u64, + check_sum: Option, + using_dict: Option, +} + +pub enum BlockDecodingStrategy { + All, + UptoBlocks(usize), + UptoBytes(usize), +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum FrameDecoderError { + #[error(transparent)] + ReadFrameHeaderError(#[from] frame::ReadFrameHeaderError), + #[error(transparent)] + FrameHeaderError(#[from] frame::FrameHeaderError), + #[error(transparent)] + FrameCheckError(#[from] frame::FrameCheckError), + #[error("Specified window_size is too big; Requested: {requested}, Max: {MAX_WINDOW_SIZE}")] + WindowSizeTooBig { requested: u64 }, + #[error(transparent)] + DictionaryDecodeError(#[from] dictionary::DictionaryDecodeError), + #[error("Failed to parse/decode block body: {0}")] + FailedToReadBlockHeader(#[from] decoding::block_decoder::BlockHeaderReadError), + #[error("Failed to parse block header: {0}")] + FailedToReadBlockBody(decoding::block_decoder::DecodeBlockContentError), + #[error("Failed to read checksum: {0}")] + FailedToReadChecksum(#[source] io::Error), + #[error("Decoder must initialized or reset before using it")] + NotYetInitialized, + #[error("Decoder encountered error while initializing: {0}")] + FailedToInitialize(frame::FrameHeaderError), + #[error("Decoder encountered error while draining the decodebuffer: {0}")] + FailedToDrainDecodebuffer(#[source] io::Error), + #[error("Target must have at least as many bytes as the contentsize of the frame reports")] + TargetTooSmall, + #[error("Frame header specified dictionary id that wasnt provided by add_dict() or reset_with_dict()")] + DictNotProvided, +} + +const MAX_WINDOW_SIZE: u64 = 1024 * 1024 * 100; + +impl FrameDecoderState { + pub fn new(source: impl Read) -> Result { + let (frame, header_size) = frame::read_frame_header(source)?; + let window_size = 
frame.header.window_size()?; + frame.check_valid()?; + Ok(FrameDecoderState { + frame, + frame_finished: false, + block_counter: 0, + decoder_scratch: DecoderScratch::new(window_size as usize), + bytes_read_counter: u64::from(header_size), + check_sum: None, + using_dict: None, + }) + } + + pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> { + let (frame, header_size) = frame::read_frame_header(source)?; + let window_size = frame.header.window_size()?; + frame.check_valid()?; + + if window_size > MAX_WINDOW_SIZE { + return Err(FrameDecoderError::WindowSizeTooBig { + requested: window_size, + }); + } + + self.frame = frame; + self.frame_finished = false; + self.block_counter = 0; + self.decoder_scratch.reset(window_size as usize); + self.bytes_read_counter = u64::from(header_size); + self.check_sum = None; + self.using_dict = None; + Ok(()) + } +} + +impl Default for FrameDecoder { + fn default() -> Self { + Self::new() + } +} + +impl FrameDecoder { + /// This will create a new decoder without allocating anything yet. + /// init()/reset() will allocate all needed buffers if it is the first time this decoder is used + /// else they just reset these buffers with not further allocations + pub fn new() -> FrameDecoder { + FrameDecoder { + state: None, + dicts: HashMap::new(), + } + } + + /// init() will allocate all needed buffers if it is the first time this decoder is used + /// else they just reset these buffers with not further allocations + /// + /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. 
Collect them with collect()/collect_to_writer() + /// + /// equivalent to reset() + pub fn init(&mut self, source: impl Read) -> Result<(), FrameDecoderError> { + self.reset(source) + } + /// Like init but provides the dict to use for the next frame + pub fn init_with_dict( + &mut self, + source: impl Read, + dict: &[u8], + ) -> Result<(), FrameDecoderError> { + self.reset_with_dict(source, dict) + } + + /// reset() will allocate all needed buffers if it is the first time this decoder is used + /// else they just reset these buffers with not further allocations + /// + /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer() + /// + /// equivalent to init() + pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> { + match &mut self.state { + Some(s) => s.reset(source), + None => { + self.state = Some(FrameDecoderState::new(source)?); + Ok(()) + } + } + } + + /// Like reset but provides the dict to use for the next frame + pub fn reset_with_dict( + &mut self, + source: impl Read, + dict: &[u8], + ) -> Result<(), FrameDecoderError> { + self.reset(source)?; + if let Some(state) = &mut self.state { + let id = state.decoder_scratch.load_dict(dict)?; + state.using_dict = Some(id); + }; + Ok(()) + } + + /// Add a dict to the FrameDecoder that can be used when needed. The FrameDecoder uses the appropriate one dynamically + pub fn add_dict(&mut self, raw_dict: &[u8]) -> Result<(), FrameDecoderError> { + let dict = Dictionary::decode_dict(raw_dict)?; + self.dicts.insert(dict.id, dict); + Ok(()) + } + + /// Returns how many bytes the frame contains after decompression + pub fn content_size(&self) -> Option { + let state = match &self.state { + None => return Some(0), + Some(s) => s, + }; + + match state.frame.header.frame_content_size() { + Err(_) => None, + Ok(x) => Some(x), + } + } + + /// Returns the checksum that was read from the data. 
Only available after all bytes have been read. It is the last 4 bytes of a zstd-frame + pub fn get_checksum_from_data(&self) -> Option { + let state = match &self.state { + None => return None, + Some(s) => s, + }; + + state.check_sum + } + + /// Returns the checksum that was calculated while decoding. + /// Only a sensible value after all decoded bytes have been collected/read from the FrameDecoder + pub fn get_calculated_checksum(&self) -> Option { + let state = match &self.state { + None => return None, + Some(s) => s, + }; + let cksum_64bit = state.decoder_scratch.buffer.hash.finish(); + //truncate to lower 32bit because reasons... + Some(cksum_64bit as u32) + } + + /// Counter for how many bytes have been consumed while decoding the frame + pub fn bytes_read_from_source(&self) -> u64 { + let state = match &self.state { + None => return 0, + Some(s) => s, + }; + state.bytes_read_counter + } + + /// Whether the current frames last block has been decoded yet + /// If this returns true you can call the drain* functions to get all content + /// (the read() function will drain automatically if this returns true) + pub fn is_finished(&self) -> bool { + let state = match &self.state { + None => return true, + Some(s) => s, + }; + if state.frame.header.descriptor.content_checksum_flag() { + state.frame_finished && state.check_sum.is_some() + } else { + state.frame_finished + } + } + + /// Counter for how many blocks have already been decoded + pub fn blocks_decoded(&self) -> usize { + let state = match &self.state { + None => return 0, + Some(s) => s, + }; + state.block_counter + } + + /// Decodes blocks from a reader. It requires that the framedecoder has been initialized first. + /// The Strategy influences how many blocks will be decoded before the function returns + /// This is important if you want to manage memory consumption carefully. 
If you don't care + /// about that you can just choose the strategy "All" and have all blocks of the frame decoded into the buffer + pub fn decode_blocks( + &mut self, + mut source: impl Read, + strat: BlockDecodingStrategy, + ) -> Result { + use FrameDecoderError as err; + let state = self.state.as_mut().ok_or(err::NotYetInitialized)?; + + if let Some(id) = state.frame.header.dictionary_id().map_err( + //should never happen we check this directly after decoding the frame header + err::FailedToInitialize, + )? { + match state.using_dict { + Some(using_id) => { + //happy + debug_assert!(id == using_id); + } + None => { + let dict = self.dicts.get(&id).ok_or(err::DictNotProvided)?; + state.decoder_scratch.use_dict(dict); + state.using_dict = Some(id); + } + } + } + + let mut block_dec = decoding::block_decoder::new(); + + let buffer_size_before = state.decoder_scratch.buffer.len(); + let block_counter_before = state.block_counter; + loop { + if crate::VERBOSE { + println!("################"); + println!("Next Block: {}", state.block_counter); + println!("################"); + } + let (block_header, block_header_size) = block_dec + .read_block_header(&mut source) + .map_err(err::FailedToReadBlockHeader)?; + state.bytes_read_counter += u64::from(block_header_size); + + if crate::VERBOSE { + println!(); + println!( + "Found {} block with size: {}, which will be of size: {}", + block_header.block_type, + block_header.content_size, + block_header.decompressed_size + ); + } + + let bytes_read_in_block_body = block_dec + .decode_block_content(&block_header, &mut state.decoder_scratch, &mut source) + .map_err(err::FailedToReadBlockBody)?; + state.bytes_read_counter += bytes_read_in_block_body; + + state.block_counter += 1; + + if crate::VERBOSE { + println!("Output: {}", state.decoder_scratch.buffer.len()); + } + + if block_header.last_block { + state.frame_finished = true; + if state.frame.header.descriptor.content_checksum_flag() { + let mut chksum = [0u8; 4]; + source + 
.read_exact(&mut chksum) + .map_err(err::FailedToReadChecksum)?; + state.bytes_read_counter += 4; + let chksum = u32::from_le_bytes(chksum); + state.check_sum = Some(chksum); + } + break; + } + + match strat { + BlockDecodingStrategy::All => { /* keep going */ } + BlockDecodingStrategy::UptoBlocks(n) => { + if state.block_counter - block_counter_before >= n { + break; + } + } + BlockDecodingStrategy::UptoBytes(n) => { + if state.decoder_scratch.buffer.len() - buffer_size_before >= n { + break; + } + } + } + } + + Ok(state.frame_finished) + } + + /// Collect bytes and retain window_size bytes while decoding is still going on. + /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes + pub fn collect(&mut self) -> Option> { + let finished = self.is_finished(); + let state = self.state.as_mut()?; + if finished { + Some(state.decoder_scratch.buffer.drain()) + } else { + state.decoder_scratch.buffer.drain_to_window_size() + } + } + + /// Collect bytes and retain window_size bytes while decoding is still going on. + /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes + pub fn collect_to_writer(&mut self, w: impl std::io::Write) -> Result { + let finished = self.is_finished(); + let state = match &mut self.state { + None => return Ok(0), + Some(s) => s, + }; + if finished { + state.decoder_scratch.buffer.drain_to_writer(w) + } else { + state.decoder_scratch.buffer.drain_to_window_size_writer(w) + } + } + + /// How many bytes can currently be collected from the decodebuffer, while decoding is going on this will be lower than the actual decodbuffer size + /// because window_size bytes need to be retained for decoding. 
+ /// After decoding of the frame (is_finished() == true) has finished it will report all remaining bytes + pub fn can_collect(&self) -> usize { + let finished = self.is_finished(); + let state = match &self.state { + None => return 0, + Some(s) => s, + }; + if finished { + state.decoder_scratch.buffer.can_drain() + } else { + state + .decoder_scratch + .buffer + .can_drain_to_window_size() + .unwrap_or(0) + } + } + + /// Decodes as many blocks as possible from the source slice and reads from the decodebuffer into the target slice + /// The source slice may contain only parts of a frame but must contain at least one full block to make progress + /// + /// By all means use decode_blocks if you have a io.Reader available. This is just for compatibility with other decompressors + /// which try to serve an old-style c api + /// + /// Returns (read, written), if read == 0 then the source did not contain a full block and further calls with the same + /// input will not make any progress! + /// + /// Note that no kind of block can be bigger than 128kb. + /// So to be safe use at least 128*1024 (max block content size) + 3 (block_header size) + 18 (max frame_header size) bytes as your source buffer + /// + /// You may call this function with an empty source after all bytes have been decoded. 
This is equivalent to just call decoder.read(&mut target) + pub fn decode_from_to( + &mut self, + source: &[u8], + target: &mut [u8], + ) -> Result<(usize, usize), FrameDecoderError> { + use FrameDecoderError as err; + let bytes_read_at_start = match &self.state { + Some(s) => s.bytes_read_counter, + None => 0, + }; + + if !self.is_finished() || self.state.is_none() { + let mut mt_source = source; + + if self.state.is_none() { + self.init(&mut mt_source)?; + } + + //pseudo block to scope "state" so we can borrow self again after the block + { + let mut state = match &mut self.state { + Some(s) => s, + None => panic!("Bug in library"), + }; + let mut block_dec = decoding::block_decoder::new(); + + if state.frame.header.descriptor.content_checksum_flag() + && state.frame_finished + && state.check_sum.is_none() + { + //this block is needed if the checksum were the only 4 bytes that were not included in the last decode_from_to call for a frame + if mt_source.len() >= 4 { + let chksum = mt_source[..4].try_into().expect("optimized away"); + state.bytes_read_counter += 4; + let chksum = u32::from_le_bytes(chksum); + state.check_sum = Some(chksum); + } + return Ok((4, 0)); + } + + if let Some(id) = state.frame.header.dictionary_id().map_err( + //should never happen we check this directly after decoding the frame header + err::FailedToInitialize, + )? { + match state.using_dict { + Some(using_id) => { + //happy + debug_assert!(id == using_id); + } + None => { + let dict = self.dicts.get(&id).ok_or(err::DictNotProvided)?; + state.decoder_scratch.use_dict(dict); + state.using_dict = Some(id); + } + } + } + + loop { + //check if there are enough bytes for the next header + if mt_source.len() < 3 { + break; + } + let (block_header, block_header_size) = block_dec + .read_block_header(&mut mt_source) + .map_err(err::FailedToReadBlockHeader)?; + + // check the needed size for the block before updating counters. 
+ // If not enough bytes are in the source, the header will have to be read again, so act like we never read it in the first place + if mt_source.len() < block_header.content_size as usize { + break; + } + state.bytes_read_counter += u64::from(block_header_size); + + let bytes_read_in_block_body = block_dec + .decode_block_content( + &block_header, + &mut state.decoder_scratch, + &mut mt_source, + ) + .map_err(err::FailedToReadBlockBody)?; + state.bytes_read_counter += bytes_read_in_block_body; + state.block_counter += 1; + + if block_header.last_block { + state.frame_finished = true; + if state.frame.header.descriptor.content_checksum_flag() { + //if there are enough bytes handle this here. Else the block at the start of this function will handle it at the next call + if mt_source.len() >= 4 { + let chksum = mt_source[..4].try_into().expect("optimized away"); + state.bytes_read_counter += 4; + let chksum = u32::from_le_bytes(chksum); + state.check_sum = Some(chksum); + } + } + break; + } + } + } + } + + let result_len = self.read(target).map_err(err::FailedToDrainDecodebuffer)?; + let bytes_read_at_end = match &mut self.state { + Some(s) => s.bytes_read_counter, + None => panic!("Bug in library"), + }; + let read_len = bytes_read_at_end - bytes_read_at_start; + Ok((read_len as usize, result_len)) + } +} + +/// Read bytes from the decode_buffer that are no longer needed. 
While the frame is not yet finished +/// this will retain window_size bytes, else it will drain it completely +impl std::io::Read for FrameDecoder { + fn read(&mut self, target: &mut [u8]) -> std::result::Result { + let state = match &mut self.state { + None => return Ok(0), + Some(s) => s, + }; + if state.frame_finished { + state.decoder_scratch.buffer.read_all(target) + } else { + state.decoder_scratch.buffer.read(target) + } + } +} diff --git a/vendor/ruzstd/src/fse/fse_decoder.rs b/vendor/ruzstd/src/fse/fse_decoder.rs new file mode 100644 index 000000000..1847da728 --- /dev/null +++ b/vendor/ruzstd/src/fse/fse_decoder.rs @@ -0,0 +1,336 @@ +use crate::decoding::bit_reader::BitReader; +use crate::decoding::bit_reader_reverse::{BitReaderReversed, GetBitsError}; + +#[derive(Clone)] +pub struct FSETable { + pub decode: Vec, //used to decode symbols, and calculate the next state + + pub accuracy_log: u8, + pub symbol_probabilities: Vec, //used while building the decode Vector + symbol_counter: Vec, +} + +impl Default for FSETable { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum FSETableError { + #[error("Acclog must be at least 1")] + AccLogIsZero, + #[error("Found FSE acc_log: {got} bigger than allowed maximum in this case: {max}")] + AccLogTooBig { got: u8, max: u8 }, + #[error(transparent)] + GetBitsError(#[from] GetBitsError), + #[error("The counter ({got}) exceeded the expected sum: {expected_sum}. This means an error or corrupted data \n {symbol_probabilities:?}")] + ProbabilityCounterMismatch { + got: u32, + expected_sum: u32, + symbol_probabilities: Vec, + }, + #[error("There are too many symbols in this distribution: {got}. 
Max: 256")] + TooManySymbols { got: usize }, +} + +pub struct FSEDecoder<'table> { + pub state: Entry, + table: &'table FSETable, +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum FSEDecoderError { + #[error(transparent)] + GetBitsError(#[from] GetBitsError), + #[error("Tried to use an uninitialized table!")] + TableIsUninitialized, +} + +#[derive(Copy, Clone)] +pub struct Entry { + pub base_line: u32, + pub num_bits: u8, + pub symbol: u8, +} + +const ACC_LOG_OFFSET: u8 = 5; + +fn highest_bit_set(x: u32) -> u32 { + assert!(x > 0); + u32::BITS - x.leading_zeros() +} + +impl<'t> FSEDecoder<'t> { + pub fn new(table: &'t FSETable) -> FSEDecoder<'_> { + FSEDecoder { + state: table.decode.get(0).copied().unwrap_or(Entry { + base_line: 0, + num_bits: 0, + symbol: 0, + }), + table, + } + } + + pub fn decode_symbol(&self) -> u8 { + self.state.symbol + } + + pub fn init_state(&mut self, bits: &mut BitReaderReversed<'_>) -> Result<(), FSEDecoderError> { + if self.table.accuracy_log == 0 { + return Err(FSEDecoderError::TableIsUninitialized); + } + self.state = self.table.decode[bits.get_bits(self.table.accuracy_log)? as usize]; + + Ok(()) + } + + pub fn update_state( + &mut self, + bits: &mut BitReaderReversed<'_>, + ) -> Result<(), FSEDecoderError> { + let num_bits = self.state.num_bits; + let add = bits.get_bits(num_bits)?; + let base_line = self.state.base_line; + let new_state = base_line + add as u32; + self.state = self.table.decode[new_state as usize]; + + //println!("Update: {}, {} -> {}", base_line, add, self.state); + Ok(()) + } +} + +impl FSETable { + pub fn new() -> FSETable { + FSETable { + symbol_probabilities: Vec::with_capacity(256), //will never be more than 256 symbols because u8 + symbol_counter: Vec::with_capacity(256), //will never be more than 256 symbols because u8 + decode: Vec::new(), //depending on acc_log. 
+ accuracy_log: 0, + } + } + + pub fn reset(&mut self) { + self.symbol_counter.clear(); + self.symbol_probabilities.clear(); + self.decode.clear(); + self.accuracy_log = 0; + } + + //returns how many BYTEs (not bits) were read while building the decoder + pub fn build_decoder(&mut self, source: &[u8], max_log: u8) -> Result { + self.accuracy_log = 0; + + let bytes_read = self.read_probabilities(source, max_log)?; + self.build_decoding_table(); + + Ok(bytes_read) + } + + pub fn build_from_probabilities( + &mut self, + acc_log: u8, + probs: &[i32], + ) -> Result<(), FSETableError> { + if acc_log == 0 { + return Err(FSETableError::AccLogIsZero); + } + self.symbol_probabilities = probs.to_vec(); + self.accuracy_log = acc_log; + self.build_decoding_table(); + Ok(()) + } + + fn build_decoding_table(&mut self) { + self.decode.clear(); + + let table_size = 1 << self.accuracy_log; + if self.decode.len() < table_size { + self.decode.reserve(table_size - self.decode.len()); + } + //fill with dummy entries + self.decode.resize( + table_size, + Entry { + base_line: 0, + num_bits: 0, + symbol: 0, + }, + ); + + let mut negative_idx = table_size; //will point to the highest index with is already occupied by a negative-probability-symbol + + //first scan for all -1 probabilities and place them at the top of the table + for symbol in 0..self.symbol_probabilities.len() { + if self.symbol_probabilities[symbol] == -1 { + negative_idx -= 1; + let entry = &mut self.decode[negative_idx]; + entry.symbol = symbol as u8; + entry.base_line = 0; + entry.num_bits = self.accuracy_log; + } + } + + //then place in a semi-random order all of the other symbols + let mut position = 0; + for idx in 0..self.symbol_probabilities.len() { + let symbol = idx as u8; + if self.symbol_probabilities[idx] <= 0 { + continue; + } + + //for each probability point the symbol gets on slot + let prob = self.symbol_probabilities[idx]; + for _ in 0..prob { + let entry = &mut self.decode[position]; + entry.symbol = 
symbol; + + position = next_position(position, table_size); + while position >= negative_idx { + position = next_position(position, table_size); + //everything above negative_idx is already taken + } + } + } + + // baselines and num_bits can only be calculated when all symbols have been spread + self.symbol_counter.clear(); + self.symbol_counter + .resize(self.symbol_probabilities.len(), 0); + for idx in 0..negative_idx { + let entry = &mut self.decode[idx]; + let symbol = entry.symbol; + let prob = self.symbol_probabilities[symbol as usize]; + + let symbol_count = self.symbol_counter[symbol as usize]; + let (bl, nb) = calc_baseline_and_numbits(table_size as u32, prob as u32, symbol_count); + + //println!("symbol: {:2}, table: {}, prob: {:3}, count: {:3}, bl: {:3}, nb: {:2}", symbol, table_size, prob, symbol_count, bl, nb); + + assert!(nb <= self.accuracy_log); + self.symbol_counter[symbol as usize] += 1; + + entry.base_line = bl; + entry.num_bits = nb; + } + } + + fn read_probabilities(&mut self, source: &[u8], max_log: u8) -> Result { + self.symbol_probabilities.clear(); //just clear, we will fill a probability for each entry anyways. No need to force new allocs here + + let mut br = BitReader::new(source); + self.accuracy_log = ACC_LOG_OFFSET + (br.get_bits(4)? as u8); + if self.accuracy_log > max_log { + return Err(FSETableError::AccLogTooBig { + got: self.accuracy_log, + max: max_log, + }); + } + if self.accuracy_log == 0 { + return Err(FSETableError::AccLogIsZero); + } + + let probablility_sum = 1 << self.accuracy_log; + let mut probability_counter = 0; + + while probability_counter < probablility_sum { + let max_remaining_value = probablility_sum - probability_counter + 1; + let bits_to_read = highest_bit_set(max_remaining_value); + + let unchecked_value = br.get_bits(bits_to_read as usize)? 
as u32; + + let low_threshold = ((1 << bits_to_read) - 1) - (max_remaining_value); + let mask = (1 << (bits_to_read - 1)) - 1; + let small_value = unchecked_value & mask; + + let value = if small_value < low_threshold { + br.return_bits(1); + small_value + } else if unchecked_value > mask { + unchecked_value - low_threshold + } else { + unchecked_value + }; + //println!("{}, {}, {}", self.symbol_probablilities.len(), unchecked_value, value); + + let prob = (value as i32) - 1; + + self.symbol_probabilities.push(prob); + if prob != 0 { + if prob > 0 { + probability_counter += prob as u32; + } else { + // probability -1 counts as 1 + assert!(prob == -1); + probability_counter += 1; + } + } else { + //fast skip further zero probabilities + loop { + let skip_amount = br.get_bits(2)? as usize; + + self.symbol_probabilities + .resize(self.symbol_probabilities.len() + skip_amount, 0); + if skip_amount != 3 { + break; + } + } + } + } + + if probability_counter != probablility_sum { + return Err(FSETableError::ProbabilityCounterMismatch { + got: probability_counter, + expected_sum: probablility_sum, + symbol_probabilities: self.symbol_probabilities.clone(), + }); + } + if self.symbol_probabilities.len() > 256 { + return Err(FSETableError::TooManySymbols { + got: self.symbol_probabilities.len(), + }); + } + + let bytes_read = if br.bits_read() % 8 == 0 { + br.bits_read() / 8 + } else { + (br.bits_read() / 8) + 1 + }; + Ok(bytes_read) + } +} + +//utility functions for building the decoding table from probabilities +fn next_position(mut p: usize, table_size: usize) -> usize { + p += (table_size >> 1) + (table_size >> 3) + 3; + p &= table_size - 1; + p +} + +fn calc_baseline_and_numbits( + num_states_total: u32, + num_states_symbol: u32, + state_number: u32, +) -> (u32, u8) { + let num_state_slices = if 1 << (highest_bit_set(num_states_symbol) - 1) == num_states_symbol { + num_states_symbol + } else { + 1 << (highest_bit_set(num_states_symbol)) + }; //always power of two + + let 
num_double_width_state_slices = num_state_slices - num_states_symbol; //leftovers to the power of two need to be distributed + let num_single_width_state_slices = num_states_symbol - num_double_width_state_slices; //these will not receive a double width slice of states + let slice_width = num_states_total / num_state_slices; //size of a single width slice of states + let num_bits = highest_bit_set(slice_width) - 1; //number of bits needed to read for one slice + + if state_number < num_double_width_state_slices { + let baseline = num_single_width_state_slices * slice_width + state_number * slice_width * 2; + (baseline, num_bits as u8 + 1) + } else { + let index_shifted = state_number - num_double_width_state_slices; + ((index_shifted * slice_width), num_bits as u8) + } +} diff --git a/vendor/ruzstd/src/fse/mod.rs b/vendor/ruzstd/src/fse/mod.rs new file mode 100644 index 000000000..ba4beb511 --- /dev/null +++ b/vendor/ruzstd/src/fse/mod.rs @@ -0,0 +1,2 @@ +mod fse_decoder; +pub use fse_decoder::*; diff --git a/vendor/ruzstd/src/huff0/huff0_decoder.rs b/vendor/ruzstd/src/huff0/huff0_decoder.rs new file mode 100644 index 000000000..831ddd69c --- /dev/null +++ b/vendor/ruzstd/src/huff0/huff0_decoder.rs @@ -0,0 +1,388 @@ +use crate::decoding::bit_reader_reverse::{BitReaderReversed, GetBitsError}; +use crate::fse::{FSEDecoder, FSEDecoderError, FSETable, FSETableError}; + +#[derive(Clone)] +pub struct HuffmanTable { + decode: Vec, + + weights: Vec, + pub max_num_bits: u8, + bits: Vec, + bit_ranks: Vec, + rank_indexes: Vec, + + fse_table: FSETable, +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum HuffmanTableError { + #[error(transparent)] + GetBitsError(#[from] GetBitsError), + #[error(transparent)] + FSEDecoderError(#[from] FSEDecoderError), + #[error(transparent)] + FSETableError(#[from] FSETableError), + #[error("Source needs to have at least one byte")] + SourceIsEmpty, + #[error("Header says there should be {expected_bytes} bytes for the weights 
but there are only {got_bytes} bytes in the stream")] + NotEnoughBytesForWeights { + got_bytes: usize, + expected_bytes: u8, + }, + #[error("Padding at the end of the sequence_section was more than a byte long: {skipped_bits} bits. Probably caused by data corruption")] + ExtraPadding { skipped_bits: i32 }, + #[error("More than 255 weights decoded (got {got} weights). Stream is probably corrupted")] + TooManyWeights { got: usize }, + #[error("Can't build huffman table without any weights")] + MissingWeights, + #[error("Leftover must be power of two but is: {got}")] + LeftoverIsNotAPowerOf2 { got: u32 }, + #[error("Not enough bytes in stream to decompress weights. Is: {have}, Should be: {need}")] + NotEnoughBytesToDecompressWeights { have: usize, need: usize }, + #[error("FSE table used more bytes: {used} than were meant to be used for the whole stream of huffman weights ({available_bytes})")] + FSETableUsedTooManyBytes { used: usize, available_bytes: u8 }, + #[error("Source needs to have at least {need} bytes, got: {got}")] + NotEnoughBytesInSource { got: usize, need: usize }, + #[error("Cant have weight: {got} bigger than max_num_bits: {MAX_MAX_NUM_BITS}")] + WeightBiggerThanMaxNumBits { got: u8 }, + #[error("max_bits derived from weights is: {got} should be lower than: {MAX_MAX_NUM_BITS}")] + MaxBitsTooHigh { got: u8 }, +} + +pub struct HuffmanDecoder<'table> { + table: &'table HuffmanTable, + pub state: u64, +} + +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum HuffmanDecoderError { + #[error(transparent)] + GetBitsError(#[from] GetBitsError), +} + +#[derive(Copy, Clone)] +pub struct Entry { + symbol: u8, + num_bits: u8, +} + +const MAX_MAX_NUM_BITS: u8 = 11; + +fn highest_bit_set(x: u32) -> u32 { + assert!(x > 0); + u32::BITS - x.leading_zeros() +} + +impl<'t> HuffmanDecoder<'t> { + pub fn new(table: &'t HuffmanTable) -> HuffmanDecoder<'t> { + HuffmanDecoder { table, state: 0 } + } + + pub fn reset(mut self, new_table: Option<&'t HuffmanTable>) { 
+ self.state = 0; + if let Some(next_table) = new_table { + self.table = next_table; + } + } + + pub fn decode_symbol(&mut self) -> u8 { + self.table.decode[self.state as usize].symbol + } + + pub fn init_state( + &mut self, + br: &mut BitReaderReversed<'_>, + ) -> Result { + let num_bits = self.table.max_num_bits; + let new_bits = br.get_bits(num_bits)?; + self.state = new_bits; + Ok(num_bits) + } + + pub fn next_state( + &mut self, + br: &mut BitReaderReversed<'_>, + ) -> Result { + let num_bits = self.table.decode[self.state as usize].num_bits; + let new_bits = br.get_bits(num_bits)?; + self.state <<= num_bits; + self.state &= self.table.decode.len() as u64 - 1; + self.state |= new_bits; + Ok(num_bits) + } +} + +impl Default for HuffmanTable { + fn default() -> Self { + Self::new() + } +} + +impl HuffmanTable { + pub fn new() -> HuffmanTable { + HuffmanTable { + decode: Vec::new(), + + weights: Vec::with_capacity(256), + max_num_bits: 0, + bits: Vec::with_capacity(256), + bit_ranks: Vec::with_capacity(11), + rank_indexes: Vec::with_capacity(11), + fse_table: FSETable::new(), + } + } + + pub fn reset(&mut self) { + self.decode.clear(); + self.weights.clear(); + self.max_num_bits = 0; + self.bits.clear(); + self.bit_ranks.clear(); + self.rank_indexes.clear(); + self.fse_table.reset(); + } + + pub fn build_decoder(&mut self, source: &[u8]) -> Result { + self.decode.clear(); + + let bytes_used = self.read_weights(source)?; + self.build_table_from_weights()?; + Ok(bytes_used) + } + + fn read_weights(&mut self, source: &[u8]) -> Result { + use HuffmanTableError as err; + + if source.is_empty() { + return Err(err::SourceIsEmpty); + } + let header = source[0]; + let mut bits_read = 8; + + match header { + 0..=127 => { + let fse_stream = &source[1..]; + if header as usize > fse_stream.len() { + return Err(err::NotEnoughBytesForWeights { + got_bytes: fse_stream.len(), + expected_bytes: header, + }); + } + //fse decompress weights + let bytes_used_by_fse_header = self + 
.fse_table + .build_decoder(fse_stream, /*TODO find actual max*/ 100)?; + + if bytes_used_by_fse_header > header as usize { + return Err(err::FSETableUsedTooManyBytes { + used: bytes_used_by_fse_header, + available_bytes: header, + }); + } + + if crate::VERBOSE { + println!( + "Building fse table for huffman weights used: {}", + bytes_used_by_fse_header + ); + } + let mut dec1 = FSEDecoder::new(&self.fse_table); + let mut dec2 = FSEDecoder::new(&self.fse_table); + + let compressed_start = bytes_used_by_fse_header; + let compressed_length = header as usize - bytes_used_by_fse_header; + + let compressed_weights = &fse_stream[compressed_start..]; + if compressed_weights.len() < compressed_length { + return Err(err::NotEnoughBytesToDecompressWeights { + have: compressed_weights.len(), + need: compressed_length, + }); + } + let compressed_weights = &compressed_weights[..compressed_length]; + let mut br = BitReaderReversed::new(compressed_weights); + + bits_read += (bytes_used_by_fse_header + compressed_length) * 8; + + //skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found + let mut skipped_bits = 0; + loop { + let val = br.get_bits(1)?; + skipped_bits += 1; + if val == 1 || skipped_bits > 8 { + break; + } + } + if skipped_bits > 8 { + //if more than 7 bits are 0, this is not the correct end of the bitstream. 
Either a bug or corrupted data + return Err(err::ExtraPadding { skipped_bits }); + } + + dec1.init_state(&mut br)?; + dec2.init_state(&mut br)?; + + self.weights.clear(); + + loop { + let w = dec1.decode_symbol(); + self.weights.push(w); + dec1.update_state(&mut br)?; + + if br.bits_remaining() <= -1 { + //collect final states + self.weights.push(dec2.decode_symbol()); + break; + } + + let w = dec2.decode_symbol(); + self.weights.push(w); + dec2.update_state(&mut br)?; + + if br.bits_remaining() <= -1 { + //collect final states + self.weights.push(dec1.decode_symbol()); + break; + } + //maximum number of weights is 255 because we use u8 symbols and the last weight is inferred from the sum of all others + if self.weights.len() > 255 { + return Err(err::TooManyWeights { + got: self.weights.len(), + }); + } + } + } + _ => { + // weights are directly encoded + let weights_raw = &source[1..]; + let num_weights = header - 127; + self.weights.resize(num_weights as usize, 0); + + let bytes_needed = if num_weights % 2 == 0 { + num_weights as usize / 2 + } else { + (num_weights as usize / 2) + 1 + }; + + if weights_raw.len() < bytes_needed { + return Err(err::NotEnoughBytesInSource { + got: weights_raw.len(), + need: bytes_needed, + }); + } + + for idx in 0..num_weights { + if idx % 2 == 0 { + self.weights[idx as usize] = weights_raw[idx as usize / 2] >> 4; + } else { + self.weights[idx as usize] = weights_raw[idx as usize / 2] & 0xF; + } + bits_read += 4; + } + } + } + + let bytes_read = if bits_read % 8 == 0 { + bits_read / 8 + } else { + (bits_read / 8) + 1 + }; + Ok(bytes_read as u32) + } + + fn build_table_from_weights(&mut self) -> Result<(), HuffmanTableError> { + use HuffmanTableError as err; + + self.bits.clear(); + self.bits.resize(self.weights.len() + 1, 0); + + let mut weight_sum: u32 = 0; + for w in &self.weights { + if *w > MAX_MAX_NUM_BITS { + return Err(err::WeightBiggerThanMaxNumBits { got: *w }); + } + weight_sum += if *w > 0 { 1_u32 << (*w - 1) } else { 0 
}; + } + + if weight_sum == 0 { + return Err(err::MissingWeights); + } + + let max_bits = highest_bit_set(weight_sum) as u8; + let left_over = (1 << max_bits) - weight_sum; + + //left_over must be power of two + if !left_over.is_power_of_two() { + return Err(err::LeftoverIsNotAPowerOf2 { got: left_over }); + } + + let last_weight = highest_bit_set(left_over) as u8; + + for symbol in 0..self.weights.len() { + let bits = if self.weights[symbol] > 0 { + max_bits + 1 - self.weights[symbol] + } else { + 0 + }; + self.bits[symbol] = bits; + } + + self.bits[self.weights.len()] = max_bits + 1 - last_weight; + self.max_num_bits = max_bits; + + if max_bits > MAX_MAX_NUM_BITS { + return Err(err::MaxBitsTooHigh { got: max_bits }); + } + + self.bit_ranks.clear(); + self.bit_ranks.resize((max_bits + 1) as usize, 0); + for num_bits in &self.bits { + self.bit_ranks[(*num_bits) as usize] += 1; + } + + //fill with dummy symbols + self.decode.resize( + 1 << self.max_num_bits, + Entry { + symbol: 0, + num_bits: 0, + }, + ); + + //starting codes for each rank + self.rank_indexes.clear(); + self.rank_indexes.resize((max_bits + 1) as usize, 0); + + self.rank_indexes[max_bits as usize] = 0; + for bits in (1..self.rank_indexes.len() as u8).rev() { + self.rank_indexes[bits as usize - 1] = self.rank_indexes[bits as usize] + + self.bit_ranks[bits as usize] as usize * (1 << (max_bits - bits)); + } + + assert!( + self.rank_indexes[0] == self.decode.len(), + "rank_idx[0]: {} should be: {}", + self.rank_indexes[0], + self.decode.len() + ); + + for symbol in 0..self.bits.len() { + let bits_for_symbol = self.bits[symbol]; + if bits_for_symbol != 0 { + // allocate code for the symbol and set in the table + // a code ignores all max_bits - bits[symbol] bits, so it gets + // a range that spans all of those in the decoding table + let base_idx = self.rank_indexes[bits_for_symbol as usize]; + let len = 1 << (max_bits - bits_for_symbol); + self.rank_indexes[bits_for_symbol as usize] += len; + for idx in 
0..len { + self.decode[base_idx + idx].symbol = symbol as u8; + self.decode[base_idx + idx].num_bits = bits_for_symbol; + } + } + } + + Ok(()) + } +} diff --git a/vendor/ruzstd/src/huff0/mod.rs b/vendor/ruzstd/src/huff0/mod.rs new file mode 100644 index 000000000..445c7fab4 --- /dev/null +++ b/vendor/ruzstd/src/huff0/mod.rs @@ -0,0 +1,2 @@ +mod huff0_decoder; +pub use huff0_decoder::*; diff --git a/vendor/ruzstd/src/lib.rs b/vendor/ruzstd/src/lib.rs new file mode 100644 index 000000000..e079ad31b --- /dev/null +++ b/vendor/ruzstd/src/lib.rs @@ -0,0 +1,15 @@ +#![deny(trivial_casts, trivial_numeric_casts, rust_2018_idioms)] + +pub mod blocks; +pub mod decoding; +pub mod frame; +pub mod frame_decoder; +pub mod fse; +pub mod huff0; +pub mod streaming_decoder; +mod tests; + +pub const VERBOSE: bool = false; +pub use frame_decoder::BlockDecodingStrategy; +pub use frame_decoder::FrameDecoder; +pub use streaming_decoder::StreamingDecoder; diff --git a/vendor/ruzstd/src/streaming_decoder.rs b/vendor/ruzstd/src/streaming_decoder.rs new file mode 100644 index 000000000..2613a363b --- /dev/null +++ b/vendor/ruzstd/src/streaming_decoder.rs @@ -0,0 +1,66 @@ +use crate::frame_decoder::{BlockDecodingStrategy, FrameDecoder, FrameDecoderError}; +use std::io::Read; + +/// High level decoder that implements an io::Read that can be used with +/// io::Read::read_to_end / io::Read::read_exact or passing this to another library / module as a source for the decoded content +/// +/// The lower level FrameDecoder by comparison allows for finer grained control but needs to have its decode_blocks method called continuously +/// to decode the zstd-frame.
+pub struct StreamingDecoder { + pub decoder: FrameDecoder, + source: READ, +} + +impl StreamingDecoder { + pub fn new(mut source: READ) -> Result, FrameDecoderError> { + let mut decoder = FrameDecoder::new(); + decoder.init(&mut source)?; + Ok(StreamingDecoder { decoder, source }) + } + + pub fn new_with_decoder( + mut source: READ, + mut decoder: FrameDecoder, + ) -> Result, FrameDecoderError> { + decoder.init(&mut source)?; + Ok(StreamingDecoder { decoder, source }) + } + + pub fn inner(self) -> FrameDecoder { + self.decoder + } +} + +impl Read for StreamingDecoder { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.decoder.is_finished() && self.decoder.can_collect() == 0 { + //No more bytes can ever be decoded + return Ok(0); + } + + // need to loop. The UpToBytes strategy doesn't take any effort to actually reach that limit. + // The first few calls can result in just filling the decode buffer but these bytes can not be collected. + // So we need to call this until we can actually collect enough bytes + + // TODO add BlockDecodingStrategy::UntilCollectable(usize) that pushes this logic into the decode_blocks function + while self.decoder.can_collect() < buf.len() && !self.decoder.is_finished() { + //More bytes can be decoded + let additional_bytes_needed = buf.len() - self.decoder.can_collect(); + match self.decoder.decode_blocks( + &mut self.source, + BlockDecodingStrategy::UptoBytes(additional_bytes_needed), + ) { + Ok(_) => { /*Nothing to do*/ } + Err(e) => { + let err = std::io::Error::new( + std::io::ErrorKind::Other, + format!("Error in the zstd decoder: {:?}", e), + ); + return Err(err); + } + } + } + + self.decoder.read(buf) + } +} diff --git a/vendor/ruzstd/src/tests/bit_reader.rs b/vendor/ruzstd/src/tests/bit_reader.rs new file mode 100644 index 000000000..84f5ad5f6 --- /dev/null +++ b/vendor/ruzstd/src/tests/bit_reader.rs @@ -0,0 +1,79 @@ +#[test] +fn test_bitreader_reversed() { + use 
crate::decoding::bit_reader_reverse::BitReaderReversed; + + let encoded: [u8; 16] = [ + 0xC1, 0x41, 0x08, 0x00, 0x00, 0xEC, 0xC8, 0x96, 0x42, 0x79, 0xD4, 0xBC, 0xF7, 0x2C, 0xD5, + 0x48, + ]; + //just the u128 in encoded + let num_rev: u128 = 0x48_D5_2C_F7_BC_D4_79_42_96_C8_EC_00_00_08_41_C1; + + let mut br = BitReaderReversed::new(&encoded[..]); + let mut accumulator = 0; + let mut bits_read = 0; + let mut x = 0; + + loop { + x += 3; + //semi random access pattern + let mut num_bits = x % 16; + if bits_read > 128 - num_bits { + num_bits = 128 - bits_read; + } + + let bits = br.get_bits(num_bits).unwrap(); + bits_read += num_bits; + accumulator |= u128::from(bits) << (128 - bits_read); + if bits_read >= 128 { + break; + } + } + + if accumulator != num_rev { + panic!( + "Bitreader failed somewhere. Accumulated bits: {:?}, Should be: {:?}", + accumulator, num_rev + ); + } +} + +#[test] +fn test_bitreader_normal() { + use crate::decoding::bit_reader::BitReader; + + let encoded: [u8; 16] = [ + 0xC1, 0x41, 0x08, 0x00, 0x00, 0xEC, 0xC8, 0x96, 0x42, 0x79, 0xD4, 0xBC, 0xF7, 0x2C, 0xD5, + 0x48, + ]; + //just the u128 in encoded + let num: u128 = 0x48_D5_2C_F7_BC_D4_79_42_96_C8_EC_00_00_08_41_C1; + + let mut br = BitReader::new(&encoded[..]); + let mut accumulator = 0; + let mut bits_read = 0; + let mut x = 0; + + loop { + x += 3; + //semi random access pattern + let mut num_bits = x % 16; + if bits_read > 128 - num_bits { + num_bits = 128 - bits_read; + } + + let bits = br.get_bits(num_bits).unwrap(); + accumulator |= u128::from(bits) << bits_read; + bits_read += num_bits; + if bits_read >= 128 { + break; + } + } + + if accumulator != num { + panic!( + "Bitreader failed somewhere. 
Accumulated bits: {:?}, Should be: {:?}", + accumulator, num + ); + } +} diff --git a/vendor/ruzstd/src/tests/decode_corpus.rs b/vendor/ruzstd/src/tests/decode_corpus.rs new file mode 100644 index 000000000..ea1a723a3 --- /dev/null +++ b/vendor/ruzstd/src/tests/decode_corpus.rs @@ -0,0 +1,181 @@ +#[test] +fn test_decode_corpus_files() { + use crate::frame_decoder; + use std::fs; + use std::io::Read; + + let mut success_counter = 0; + let mut fail_counter_diff = 0; + let mut fail_counter_size = 0; + let mut fail_counter_bytes_read = 0; + let mut fail_counter_chksum = 0; + let mut total_counter = 0; + let mut failed: Vec = Vec::new(); + + let mut speeds = Vec::new(); + let mut speeds_read = Vec::new(); + + let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect(); + if fs::read_dir("./local_corpus_files").is_ok() { + files.extend(fs::read_dir("./local_corpus_files").unwrap()); + } + + files.sort_by_key(|x| match x { + Err(_) => "".to_owned(), + Ok(entry) => entry.path().to_str().unwrap().to_owned(), + }); + + let mut frame_dec = frame_decoder::FrameDecoder::new(); + + for file in files { + let f = file.unwrap(); + let metadata = f.metadata().unwrap(); + let file_size = metadata.len(); + + let p = String::from(f.path().to_str().unwrap()); + if !p.ends_with(".zst") { + continue; + } + println!("Trying file: {}", p); + + let mut content = fs::File::open(f.path()).unwrap(); + + frame_dec.reset(&mut content).unwrap(); + + let start_time = std::time::Instant::now(); + /////DECODING + frame_dec + .decode_blocks(&mut content, frame_decoder::BlockDecodingStrategy::All) + .unwrap(); + let result = frame_dec.collect().unwrap(); + let end_time = start_time.elapsed(); + + match frame_dec.get_checksum_from_data() { + Some(chksum) => { + if frame_dec.get_calculated_checksum().unwrap() != chksum { + println!( + "Checksum did not match! 
From data: {}, calculated while decoding: {}\n", + chksum, + frame_dec.get_calculated_checksum().unwrap() + ); + fail_counter_chksum += 1; + failed.push(p.clone().to_string()); + } else { + println!("Checksums are ok!\n"); + } + } + None => println!("No checksums to test\n"), + } + + let mut original_p = p.clone(); + original_p.truncate(original_p.len() - 4); + let original_f = fs::File::open(original_p).unwrap(); + let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); + + println!("Results for file: {}", p.clone()); + let mut success = true; + + if original.len() != result.len() { + println!( + "Result has wrong length: {}, should be: {}", + result.len(), + original.len() + ); + success = false; + fail_counter_size += 1; + } + + if frame_dec.bytes_read_from_source() != file_size { + println!( + "Framedecoder counted wrong amount of bytes: {}, should be: {}", + frame_dec.bytes_read_from_source(), + file_size + ); + success = false; + fail_counter_bytes_read += 1; + } + + let mut counter = 0; + let min = if original.len() < result.len() { + original.len() + } else { + result.len() + }; + for idx in 0..min { + if original[idx] != result[idx] { + counter += 1; + //println!( + // "Original {} not equal to result {} at byte: {}", + // original[idx], result[idx], idx, + //); + } + } + + if counter > 0 { + println!("Result differs in at least {} bytes from original", counter); + success = false; + fail_counter_diff += 1; + } + + if success { + success_counter += 1; + } else { + failed.push(p.clone().to_string()); + } + total_counter += 1; + + let dur = end_time.as_micros() as usize; + let speed = result.len() / if dur == 0 { 1 } else { dur }; + let speed_read = file_size as usize / if dur == 0 { 1 } else { dur }; + println!("SPEED: {}", speed); + println!("SPEED_read: {}", speed_read); + speeds.push(speed); + speeds_read.push(speed_read); + } + + println!("###################"); + println!("Summary:"); + println!("###################"); + println!( + 
"Total: {}, Success: {}, WrongSize: {}, WrongBytecount: {}, WrongChecksum: {}, Diffs: {}", + total_counter, + success_counter, + fail_counter_size, + fail_counter_bytes_read, + fail_counter_chksum, + fail_counter_diff + ); + println!("Failed files: "); + for f in &failed { + println!("{}", f); + } + + let speed_len = speeds.len(); + let sum_speed: usize = speeds.into_iter().sum(); + let avg_speed = sum_speed / speed_len; + let avg_speed_bps = avg_speed * 1_000_000; + if avg_speed_bps < 1000 { + println!("Average speed: {} B/s", avg_speed_bps); + } else if avg_speed_bps < 1_000_000 { + println!("Average speed: {} KB/s", avg_speed_bps / 1000); + } else { + println!("Average speed: {} MB/s", avg_speed_bps / 1_000_000); + } + + let speed_read_len = speeds_read.len(); + let sum_speed_read: usize = speeds_read.into_iter().sum(); + let avg_speed_read = sum_speed_read / speed_read_len; + let avg_speed_read_bps = avg_speed_read * 1_000_000; + if avg_speed_read_bps < 1000 { + println!("Average speed reading: {} B/s", avg_speed_read_bps); + } else if avg_speed_bps < 1_000_000 { + println!("Average speed reading: {} KB/s", avg_speed_read_bps / 1000); + } else { + println!( + "Average speed reading: {} MB/s", + avg_speed_read_bps / 1_000_000 + ); + } + + assert!(failed.is_empty()); +} diff --git a/vendor/ruzstd/src/tests/dict_test.rs b/vendor/ruzstd/src/tests/dict_test.rs new file mode 100644 index 000000000..a8a5fd46b --- /dev/null +++ b/vendor/ruzstd/src/tests/dict_test.rs @@ -0,0 +1,252 @@ +#[test] +fn test_dict_parsing() { + use crate::decoding::dictionary::Dictionary; + let mut raw = vec![0u8; 8]; + + // correct magic num + raw[0] = 0x37; + raw[1] = 0xA4; + raw[2] = 0x30; + raw[3] = 0xEC; + + //dict-id + let dict_id = 0x47232101; + raw[4] = 0x01; + raw[5] = 0x21; + raw[6] = 0x23; + raw[7] = 0x47; + + // tables copied from ./dict_tests/dictionary + let raw_tables = &[ + 54, 16, 192, 155, 4, 0, 207, 59, 239, 121, 158, 116, 220, 93, 114, 229, 110, 41, 249, 95, + 165, 255, 83, 
202, 254, 68, 74, 159, 63, 161, 100, 151, 137, 21, 184, 183, 189, 100, 235, + 209, 251, 174, 91, 75, 91, 185, 19, 39, 75, 146, 98, 177, 249, 14, 4, 35, 0, 0, 0, 40, 40, + 20, 10, 12, 204, 37, 196, 1, 173, 122, 0, 4, 0, 128, 1, 2, 2, 25, 32, 27, 27, 22, 24, 26, + 18, 12, 12, 15, 16, 11, 69, 37, 225, 48, 20, 12, 6, 2, 161, 80, 40, 20, 44, 137, 145, 204, + 46, 0, 0, 0, 0, 0, 116, 253, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + raw.extend(&raw_tables[..]); + + //offset history 3,10,0x00ABCDEF + raw.extend(vec![3, 0, 0, 0]); + raw.extend(vec![10, 0, 0, 0]); + raw.extend(vec![0xEF, 0xCD, 0xAB, 0]); + + //just some random bytes + let raw_content = vec![ + 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 123, 3, 234, 23, 234, 34, 23, 234, 34, 34, 234, 234, + ]; + raw.extend(&raw_content); + + let dict = Dictionary::decode_dict(&raw).unwrap(); + + if dict.id != dict_id { + panic!( + "Dict-id did not get parsed correctly. Is: {}, Should be: {}", + dict.id, dict_id + ); + } + + if !dict.dict_content.eq(&raw_content) { + panic!( + "dict content did not get parsed correctly. Is: {:?}, Should be: {:?}", + dict.dict_content, raw_content + ); + } + + if !dict.offset_hist.eq(&[3, 10, 0x00ABCDEF]) { + panic!( + "offset history did not get parsed correctly. 
Is: {:?}, Should be: {:?}", + dict.offset_hist, + [3, 10, 0x00ABCDEF] + ); + } + + // test magic num checking + raw[0] = 1; + raw[1] = 1; + raw[2] = 1; + raw[3] = 1; + match Dictionary::decode_dict(&raw) { + Ok(_) => panic!("The dict got decoded but the magic num was incorrect!"), + Err(_) => { /* This is what should happen*/ } + } +} + +#[test] +fn test_dict_decoding() { + use crate::frame_decoder; + use std::fs; + use std::io::Read; + + let mut success_counter = 0; + let mut fail_counter_diff = 0; + let mut fail_counter_size = 0; + let mut fail_counter_bytes_read = 0; + let mut total_counter = 0; + let mut failed: Vec = Vec::new(); + + let mut speeds = Vec::new(); + let mut speeds_read = Vec::new(); + + let mut files: Vec<_> = fs::read_dir("./dict_tests/files").unwrap().collect(); + let dict = fs::File::open("./dict_tests/dictionary").unwrap(); + let dict: Vec = dict.bytes().map(|x| x.unwrap()).collect(); + + files.sort_by_key(|x| match x { + Err(_) => "".to_owned(), + Ok(entry) => entry.path().to_str().unwrap().to_owned(), + }); + + let mut frame_dec = frame_decoder::FrameDecoder::new(); + frame_dec.add_dict(&dict).unwrap(); + + for file in files { + let f = file.unwrap(); + let metadata = f.metadata().unwrap(); + let file_size = metadata.len(); + + let p = String::from(f.path().to_str().unwrap()); + if !p.ends_with(".zst") { + continue; + } + println!("Trying file: {}", p); + + let mut content = fs::File::open(f.path()).unwrap(); + + frame_dec.reset(&mut content).unwrap(); + + let start_time = std::time::Instant::now(); + /////DECODING + frame_dec + .decode_blocks(&mut content, frame_decoder::BlockDecodingStrategy::All) + .unwrap(); + let result = frame_dec.collect().unwrap(); + let end_time = start_time.elapsed(); + + match frame_dec.get_checksum_from_data() { + Some(chksum) => { + if frame_dec.get_calculated_checksum().unwrap() != chksum { + println!( + "Checksum did not match! 
From data: {}, calculated while decoding: {}\n", + chksum, + frame_dec.get_calculated_checksum().unwrap() + ); + } else { + println!("Checksums are ok!\n"); + } + } + None => println!("No checksums to test\n"), + } + + let mut original_p = p.clone(); + original_p.truncate(original_p.len() - 4); + let original_f = fs::File::open(original_p).unwrap(); + let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); + + println!("Results for file: {}", p.clone()); + let mut success = true; + + if original.len() != result.len() { + println!( + "Result has wrong length: {}, should be: {}", + result.len(), + original.len() + ); + success = false; + fail_counter_size += 1; + } + + if frame_dec.bytes_read_from_source() != file_size { + println!( + "Framedecoder counted wrong amount of bytes: {}, should be: {}", + frame_dec.bytes_read_from_source(), + file_size + ); + success = false; + fail_counter_bytes_read += 1; + } + + let mut counter = 0; + let min = if original.len() < result.len() { + original.len() + } else { + result.len() + }; + for idx in 0..min { + if original[idx] != result[idx] { + counter += 1; + //println!( + // "Original {} not equal to result {} at byte: {}", + // original[idx], result[idx], idx, + //); + } + } + + if counter > 0 { + println!("Result differs in at least {} bytes from original", counter); + success = false; + fail_counter_diff += 1; + } + + if success { + success_counter += 1; + } else { + failed.push(p.clone().to_string()); + } + total_counter += 1; + + let dur = end_time.as_micros() as usize; + let speed = result.len() / if dur == 0 { 1 } else { dur }; + let speed_read = file_size as usize / if dur == 0 { 1 } else { dur }; + println!("SPEED: {}", speed); + println!("SPEED_read: {}", speed_read); + speeds.push(speed); + speeds_read.push(speed_read); + } + + println!("###################"); + println!("Summary:"); + println!("###################"); + println!( + "Total: {}, Success: {}, WrongSize: {}, WrongBytecount: {}, Diffs: {}", 
+ total_counter, + success_counter, + fail_counter_size, + fail_counter_bytes_read, + fail_counter_diff + ); + println!("Failed files: "); + for f in &failed { + println!("{}", f); + } + + let speed_len = speeds.len(); + let sum_speed: usize = speeds.into_iter().sum(); + let avg_speed = sum_speed / speed_len; + let avg_speed_bps = avg_speed * 1_000_000; + if avg_speed_bps < 1000 { + println!("Average speed: {} B/s", avg_speed_bps); + } else if avg_speed_bps < 1_000_000 { + println!("Average speed: {} KB/s", avg_speed_bps / 1000); + } else { + println!("Average speed: {} MB/s", avg_speed_bps / 1_000_000); + } + + let speed_read_len = speeds_read.len(); + let sum_speed_read: usize = speeds_read.into_iter().sum(); + let avg_speed_read = sum_speed_read / speed_read_len; + let avg_speed_read_bps = avg_speed_read * 1_000_000; + if avg_speed_read_bps < 1000 { + println!("Average speed reading: {} B/s", avg_speed_read_bps); + } else if avg_speed_bps < 1_000_000 { + println!("Average speed reading: {} KB/s", avg_speed_read_bps / 1000); + } else { + println!( + "Average speed reading: {} MB/s", + avg_speed_read_bps / 1_000_000 + ); + } + + assert!(failed.is_empty()); +} diff --git a/vendor/ruzstd/src/tests/fuzz_regressions.rs b/vendor/ruzstd/src/tests/fuzz_regressions.rs new file mode 100644 index 000000000..2e293af4d --- /dev/null +++ b/vendor/ruzstd/src/tests/fuzz_regressions.rs @@ -0,0 +1,26 @@ +#[test] +fn test_all_artifacts() { + use crate::frame_decoder; + use std::fs; + use std::fs::File; + + let mut frame_dec = frame_decoder::FrameDecoder::new(); + + for file in fs::read_dir("./fuzz/artifacts/decode").unwrap() { + let file_name = file.unwrap().path(); + + let fnstr = file_name.to_str().unwrap().to_owned(); + if !fnstr.contains("/crash-") { + continue; + } + + let mut f = File::open(file_name.clone()).unwrap(); + match frame_dec.reset(&mut f) { + Ok(_) => { + let _ = frame_dec.decode_blocks(&mut f, frame_decoder::BlockDecodingStrategy::All); + /* ignore errors. 
It just should never panic on invalid input */ + } + Err(_) => {} /* ignore errors. It just should never panic on invalid input */ + } + } +} diff --git a/vendor/ruzstd/src/tests/mod.rs b/vendor/ruzstd/src/tests/mod.rs new file mode 100644 index 000000000..070565c36 --- /dev/null +++ b/vendor/ruzstd/src/tests/mod.rs @@ -0,0 +1,324 @@ +#[cfg(test)] +#[test] +fn skippable_frame() { + use crate::frame; + + let mut content = vec![]; + content.extend_from_slice(&0x184D2A50u32.to_le_bytes()); + content.extend_from_slice(&300u32.to_le_bytes()); + assert_eq!(8, content.len()); + let err = frame::read_frame_header(content.as_slice()); + assert!(matches!( + err, + Err(frame::ReadFrameHeaderError::SkipFrame(0x184D2A50u32, 300)) + )); + + content.clear(); + content.extend_from_slice(&0x184D2A5Fu32.to_le_bytes()); + content.extend_from_slice(&0xFFFFFFFFu32.to_le_bytes()); + assert_eq!(8, content.len()); + let err = frame::read_frame_header(content.as_slice()); + assert!(matches!( + err, + Err(frame::ReadFrameHeaderError::SkipFrame( + 0x184D2A5Fu32, + 0xFFFFFFFF + )) + )); +} + +#[cfg(test)] +#[test] +fn test_frame_header_reading() { + use crate::frame; + use std::fs; + + let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap(); + let (frame, _) = frame::read_frame_header(&mut content).unwrap(); + frame.check_valid().unwrap(); +} + +#[test] +fn test_block_header_reading() { + use crate::decoding; + use crate::frame; + use std::fs; + + let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap(); + let (frame, _) = frame::read_frame_header(&mut content).unwrap(); + frame.check_valid().unwrap(); + + let mut block_dec = decoding::block_decoder::new(); + let block_header = block_dec.read_block_header(&mut content).unwrap(); + let _ = block_header; //TODO validate blockheader in a smart way +} + +#[test] +fn test_frame_decoder() { + use crate::frame_decoder; + use std::fs; + + let mut content = 
fs::File::open("./decodecorpus_files/z000088.zst").unwrap(); + + struct NullWriter(()); + impl std::io::Write for NullWriter { + fn write(&mut self, buf: &[u8]) -> Result { + Ok(buf.len()) + } + fn flush(&mut self) -> Result<(), std::io::Error> { + Ok(()) + } + } + let mut _null_target = NullWriter(()); + + let mut frame_dec = frame_decoder::FrameDecoder::new(); + frame_dec.reset(&mut content).unwrap(); + frame_dec + .decode_blocks(&mut content, frame_decoder::BlockDecodingStrategy::All) + .unwrap(); +} + +#[test] +fn test_decode_from_to() { + use crate::frame_decoder; + use std::fs::File; + use std::io::Read; + let f = File::open("./decodecorpus_files/z000088.zst").unwrap(); + let mut frame_dec = frame_decoder::FrameDecoder::new(); + + let content: Vec = f.bytes().map(|x| x.unwrap()).collect(); + + let mut target = vec![0u8; 1024 * 1024]; + + // first part + let source1 = &content[..50 * 1024]; + let (read1, written1) = frame_dec + .decode_from_to(source1, target.as_mut_slice()) + .unwrap(); + + //second part explicitely without checksum + let source2 = &content[read1..content.len() - 4]; + let (read2, written2) = frame_dec + .decode_from_to(source2, &mut target[written1..]) + .unwrap(); + + //must have decoded until checksum + assert!(read1 + read2 == content.len() - 4); + + //insert checksum separatly to test that this is handled correctly + let chksum_source = &content[read1 + read2..]; + let (read3, written3) = frame_dec + .decode_from_to(chksum_source, &mut target[written1 + written2..]) + .unwrap(); + + //this must result in these values because just the checksum was processed + assert!(read3 == 4); + assert!(written3 == 0); + + let read = read1 + read2 + read3; + let written = written1 + written2; + + let result = &target.as_slice()[..written]; + + if read != content.len() { + panic!( + "Byte counter: {} was wrong. 
Should be: {}", + read, + content.len() + ); + } + + match frame_dec.get_checksum_from_data() { + Some(chksum) => { + if frame_dec.get_calculated_checksum().unwrap() != chksum { + println!( + "Checksum did not match! From data: {}, calculated while decoding: {}\n", + chksum, + frame_dec.get_calculated_checksum().unwrap() + ); + } else { + println!("Checksums are ok!\n"); + } + } + None => println!("No checksums to test\n"), + } + + let original_f = File::open("./decodecorpus_files/z000088").unwrap(); + let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); + + if original.len() != result.len() { + panic!( + "Result has wrong length: {}, should be: {}", + result.len(), + original.len() + ); + } + + let mut counter = 0; + let min = if original.len() < result.len() { + original.len() + } else { + result.len() + }; + for idx in 0..min { + if original[idx] != result[idx] { + counter += 1; + //println!( + // "Original {:3} not equal to result {:3} at byte: {}", + // original[idx], result[idx], idx, + //); + } + } + if counter > 0 { + panic!("Result differs in at least {} bytes from original", counter); + } +} + +#[test] +fn test_specific_file() { + use crate::frame_decoder; + use std::fs; + use std::io::Read; + + let path = "./decodecorpus_files/z000068.zst"; + let mut content = fs::File::open(path).unwrap(); + + struct NullWriter(()); + impl std::io::Write for NullWriter { + fn write(&mut self, buf: &[u8]) -> Result { + Ok(buf.len()) + } + fn flush(&mut self) -> Result<(), std::io::Error> { + Ok(()) + } + } + let mut _null_target = NullWriter(()); + + let mut frame_dec = frame_decoder::FrameDecoder::new(); + frame_dec.reset(&mut content).unwrap(); + frame_dec + .decode_blocks(&mut content, frame_decoder::BlockDecodingStrategy::All) + .unwrap(); + let result = frame_dec.collect().unwrap(); + + let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap(); + let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); + + 
println!("Results for file: {}", path); + + if original.len() != result.len() { + println!( + "Result has wrong length: {}, should be: {}", + result.len(), + original.len() + ); + } + + let mut counter = 0; + let min = if original.len() < result.len() { + original.len() + } else { + result.len() + }; + for idx in 0..min { + if original[idx] != result[idx] { + counter += 1; + //println!( + // "Original {:3} not equal to result {:3} at byte: {}", + // original[idx], result[idx], idx, + //); + } + } + if counter > 0 { + println!("Result differs in at least {} bytes from original", counter); + } +} + +#[test] +fn test_streaming() { + use std::fs; + use std::io::Read; + + let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap(); + let mut stream = crate::streaming_decoder::StreamingDecoder::new(&mut content).unwrap(); + + let mut result = Vec::new(); + Read::read_to_end(&mut stream, &mut result).unwrap(); + + let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap(); + let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); + + if original.len() != result.len() { + panic!( + "Result has wrong length: {}, should be: {}", + result.len(), + original.len() + ); + } + + let mut counter = 0; + let min = if original.len() < result.len() { + original.len() + } else { + result.len() + }; + for idx in 0..min { + if original[idx] != result[idx] { + counter += 1; + //println!( + // "Original {:3} not equal to result {:3} at byte: {}", + // original[idx], result[idx], idx, + //); + } + } + if counter > 0 { + panic!("Result differs in at least {} bytes from original", counter); + } + + // Test resetting to a new file while keeping the old decoder + + let mut content = fs::File::open("./decodecorpus_files/z000068.zst").unwrap(); + let mut stream = + crate::streaming_decoder::StreamingDecoder::new_with_decoder(&mut content, stream.inner()) + .unwrap(); + + let mut result = Vec::new(); + Read::read_to_end(&mut stream, &mut 
result).unwrap(); + + let original_f = fs::File::open("./decodecorpus_files/z000068").unwrap(); + let original: Vec = original_f.bytes().map(|x| x.unwrap()).collect(); + + println!("Results for file:"); + + if original.len() != result.len() { + panic!( + "Result has wrong length: {}, should be: {}", + result.len(), + original.len() + ); + } + + let mut counter = 0; + let min = if original.len() < result.len() { + original.len() + } else { + result.len() + }; + for idx in 0..min { + if original[idx] != result[idx] { + counter += 1; + //println!( + // "Original {:3} not equal to result {:3} at byte: {}", + // original[idx], result[idx], idx, + //); + } + } + if counter > 0 { + panic!("Result differs in at least {} bytes from original", counter); + } +} + +pub mod bit_reader; +pub mod decode_corpus; +pub mod dict_test; +pub mod fuzz_regressions; -- cgit v1.2.3